Source code for genewiseMAFC

# -*- coding: utf-8 -*-

"""

.. Created on Wed Aug 29 12:37:31 2018

   @author: cow082

"""

import numpy as np
import pandas as pd
import math
import argparse
import sys


[docs]def meanAbsoluteFoldChange(frame): """find the average log2 fold change of SNPs (alleles) for a single gene """ #print('meanAbsoluteFoldChange: run...', frame) frame['fc'] = frame.apply( lambda row: row.a_altCount / row.a_refCount, axis=1) frame['log2fc'] = frame.apply( lambda row: math.log(row.fc, 2), axis=1) frame['absLog2fc'] = frame.apply( lambda row: abs(row.log2fc), axis=1) frame['aveLog2fc'] = np.mean(frame['absLog2fc']) return pd.Series({ 'snp_count': int(len(frame)), 'ave_read_depth': round((np.sum(frame.a_altCount) + np.sum(frame.a_refCount)) / len(frame), 1), 'aveLog2fc': frame['aveLog2fc'].iloc[0], })
[docs]def enrichASE(args): """identify genes with differential expression between two alleles requires: annotated SNP data (*.merged.csv) output from bedtools intersect How it works: 1. group SNPs together by gene 2. For each SNP in that gene, calculate the absolute(log2(fold change)) 3. take the average of absolute(log2(fold change)) values across all SNPS for that gene 4. filter on: - average read depth at variable positions for the gene - number of SNPs in the gene - average(absolute(log2(fold change))) for the gene Two optional args with no defaults change the program behaviour: 1. --output <file> sends the results to file rather than stdout 2. --gene <gene> yields data for every SNP in a specific gene. Without this arg, the program produces instead a summary table of all significant genes in order of priority (sorted by: average(absolute(log2(fold change)))) """ # the input has no header, so supply one headers = ['a_chrom', 'a_chromStart', 'a_chromEnd', 'a_refBase', 'a_altBase', 'a_refCount', 'a_altCount', 'b_chrom', 'b_chromStart', 'b_chromEnd', 'b_featureType', 'b_gene', 'b_orientation'] # read the input data df1 = pd.read_csv(args.input, sep='\t', header=None, names=headers) # calculate meanAbsoluteFoldChange per gene df2 = df1.groupby(by='b_gene').apply(meanAbsoluteFoldChange) df2 = df2.sort_values(['aveLog2fc'], ascending=False) # filter hits df2 = df2[df2['ave_read_depth'] >= args.minCoverage] df2 = df2[df2['snp_count'] >= args.minSnps] df2 = df2[df2['aveLog2fc'] >= math.log(args.minFoldChange, 2)] df2.index.name = 'gene' # filter the main data table to identify expression at the per-SNP level # for the significant genes identified above hits = pd.Series({'gene': sorted(df2.index)}) df3 = df1[df1.b_gene.isin(hits.gene)] df3.set_index('b_gene', inplace=True) df3.index.name = 'gene' df3 = df3.drop(['b_chrom', 'b_chromStart', 'b_chromEnd', 'b_featureType', 'b_orientation'], axis=1) df3.columns = ['chrom', 'start', 'end', 'refBase', 'altBase', 'refCount', 'altCount'] # choose where to send the output if args.output: save_location = args.output else: save_location = sys.stdout if args.gene: # print specific details for a single target gene df3.loc[args.gene,:].to_csv(save_location, sep='\t') else: # display a summary of all hits df2.head(len(df2)).to_csv(save_location, sep='\t')
def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', '--input', required=True, help="annotated SNP data from bedtools intersect") parser.add_argument('-r', '--minCoverage', required=False, type=int, default=10, help="minimun average read depth at variable sites") parser.add_argument('-s', '--minSnps', required=False, type=int, default=5, help="minimum number of SNPs in a given gene") parser.add_argument('-f', '--minFoldChange', required=False, type=float, default=2, help="minimum average absolute fold change") parser.add_argument('-g', '--gene', required=False, help="gene of interest") parser.add_argument('-o', '--output', required=False, help="output file") enrichASE(parser.parse_args()) if __name__ == "__main__": main()