# -*- coding: utf-8 -*-
"""
.. Created on Mon Aug 20 15:17:59 2018
@author: cow082
"""
import sys
[docs]def gtf2bed(gtf, bed=None):
"""Convert a *.gtf file into a *.bed file
Note: discard everything except exons
"""
outlines = []
count = 0
with open(gtf, 'r') as v:
for i, line in enumerate(v.readlines()):
if line[0] == '#': # headers
pass
#print(line.strip())
else:
count += 1
elements = line.strip().split('\t')
if not len(elements) == 9:
#print(repr(elements))
pass
elif elements[2] not in ['CDS', 'gene', 'exon', 'cDNA_match',
'lnc_RNA', 'mRNA', 'tRNA',
'origin_of_replication', 'pseudogene',
'region', 'transcript', 'V_gene_segment',
'rRNA', 'D_loop', 'snoRNA',
'snRNA', 'C_gene_segment', 'guide_RNA']:
#print(elements[2])
pass
if elements[2] == 'exon':
meta = dict([item.split('=') for item in elements[8].split(';')])
data = [
elements[0], # chromosome
elements[3], # start
elements[4], # end
elements[2], # name ('exon')
meta.get('gene', '.'), # "score" (HGNC gene id)
elements[6], # strand (+/-)
]
outlines.append('\t'.join(data))
if not bed:
bed = gtf.split('.gtf')[0]+'.bed'
with open(bed, 'w') as fOut:
fOut.write('\n'.join(outlines))
[docs]def vcf2bed(vcf, bed=None, MIN_TOTAL_COUNT=10, MIN_ALLELE_COUNT=1):
"""Convert a *.vcf file output from bcftools::call into a *.bed file
MIN_TOTAL_COUNT specifies the minimum number of high quality reads
for a candidate to be retained
MIN_ALLELE_COUNT sets a lower limit on the read depth of the
minor allele
"""
outlines = []
count = 0
with open(vcf, 'r') as v:
for i, line in enumerate(v.readlines()):
if line[0] == '#': # headers
#print(line.strip())
pass
else:
count += 1
elements = line.strip().split('\t')
if not len(elements) == 10:
#print(repr(elements))
pass
else:
metadata = dict([item.split('=') for item in elements[7].strip().split(';')])
#print(metadata)
"""
# example:
{'DP': '3', 'VDB': '0.66', 'SGB': '-0.453602', 'MQSB': '1',
'MQ0F': '0', 'AF1': '1', 'AC1': '2', 'DP4': '0,0,1,1',
'MQ': '60', 'FQ': '-32.988'}
DP4: "Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases"
"""
counts = metadata.get('DP4', '').split(',')
if len(counts) == 4:
counts = [int(s) for s in counts]
else:
counts = [0,0,0,0]
if sum(counts) < MIN_TOTAL_COUNT:
continue
refCount = sum(counts[:2])
altCount = sum(counts[2:])
if min(refCount, altCount) < MIN_ALLELE_COUNT:
continue
data = [
elements[0], # chrom
elements[1], # chromStart
str(int(elements[1])+1), # chromEnd
elements[3], # refAllele
elements[4], # altAllele
str(refCount), # refCount
str(altCount), # altCount
]
outlines.append('\t'.join(data))
if not bed:
bed = vcf.split('.vcf')[0]+'.bed'
with open(bed, 'w') as fOut:
fOut.write('\n'.join(outlines))
# print('\n'.join(outlines))
[docs]def csv2bed(csv, bed=None):
"""Convert a *.csv file output from allelecounter.py into a *.bed file
"""
outlines = []
headers = ['contig', 'position', 'variantID', 'refAllele', 'altAllele',
'refCount', 'altCount', 'totalCount', 'lowMAPQDepth',
'lowBaseQDepth', 'rawDepth', 'otherBases' ]
with open(csv, 'r') as v:
for i, line in enumerate(v.readlines()):
if i == 0: # header
cols = line.strip().split()
if cols != headers:
print('\n'.join([
"Incorrect file type:",
"This function is solely designed to work on",
"output files produced by allelecounter.py",
]))
sys.exit()
else:
elements = line.strip().split('\t')
if not len(elements) == 12:
#print(repr(elements))
pass
else:
data = [
elements[0], # chrom
elements[1], # chromStart
str(int(elements[1])+1), # chromEnd
elements[3], # refAllele
elements[4], # altAllele
elements[5], # refCount
elements[6], # altCount
]
outlines.append('\t'.join(data))
if not bed:
bed = csv.split('.csv')[0]+'.bed'
with open(bed, 'w') as fOut:
fOut.write('\n'.join(outlines))
# with open(bed, 'r') as fIn:
# print(fIn.read())
def main():
# gtf2bed("C:/Users/cow082/water_buffalo_genome.gtf")
# vcf2bed("C:/Users/cow082/ERR2353209.vcf")
# csv2bed("C:/Users/cow082/ERR2353209.m2.csv")
if sys.argv[1][-3:] == 'gtf':
gtf2bed(sys.argv[1])
elif sys.argv[1][-3:] == 'vcf':
vcf2bed(sys.argv[1])
elif sys.argv[1][-3:] == 'csv':
csv2bed(sys.argv[1])
if __name__ == '__main__':
main()