#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
.. Created on Thu Sep 20 13:53:33 2018
@author: chris
"""
import os
import argparse
import logging
import random
import numpy as np
import scipy.stats as stats
import pandas as pd
from randomdict import RandomDict
from Bio import Seq
"""
.. Note: the first 3 functions to generate a random sequence library,
are not presently used, but could be useful later
"""
def createRandomSequence(mu=3392, sigma=2600, m=300, M=10000):
'''create a random ATCG string with random length.
Note: use the normal distribution to choose the length (mean: 3392bp, s.d 2600bp),
but discard lengths outside the range [m,M]
'''
length = 0
while not m <= length <= M:
length = int(np.random.normal(mu, sigma))
return ''.join([random.choice('ATCG') for i in range(length)])
def createRandomSequenceIndex(n):
    '''constructs a dictionary {seq_id: sequence} of random ATCG sequences
    (for default values see the createRandomSequence() function)
    '''
return dict([('gene_%s' % (i+1), createRandomSequence()) for i in range(n)])
def createRandomSequenceLibrary(n_seqs=400):
'''generate a random sequence library and save to file
'''
seqs = createRandomSequenceIndex(n_seqs)
transcriptome = fasta(seqs)
with open('simulated_transcriptome.fasta', 'w') as F:
F.write(transcriptome)
def fastWrap(text, width=None):
'''much quicker than textwrap.wrap
'''
if not width:
return text
return '\n'.join(text[i:i+width] for i in range(0, len(text), width))
def fasta(seq_index, wrap=None):
'''format a dictionary of {label:sequence} pairs into a fasta string
'''
return '\n'.join(['>%s\n%s' % (k, fastWrap(v, wrap)) for k,v in seq_index.items()])
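# A tiny usage sketch for fastWrap()/fasta() (illustrative only, not called
# by the pipeline):
# >>> print(fasta({'gene_1': 'ATCGATCG'}, wrap=4))
# >gene_1
# ATCG
# ATCG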
def fastq(seq_index, args, wrap=None):
    '''format a dictionary of {label:sequence} pairs into a fastq string
    '''
    return '\n'.join(['@%s\n%s\n+\n%s' % (k, fastWrap(v, wrap),
                      fastWrap(qualScore(args), wrap)) for k,v in seq_index.items()])
def qualScore(args):
'''select a quality string at random from a real RNA-Seq dataset
NOTE: It is assumed that the available quality strings are >= the
length of the sequence strings. There is an assert statement in
createReadLibrary() which enforces this at the time of args.qualdata
construction.
'''
return args.qual_scores.random_value()[:args.read_length]
def randomDecision(probability=0.5):
'''returns a random True/False based on:
<probability> chance of returning True
1 - <probability> chance of returning False
'''
assert 0 <= probability <= 1
return random.random() < probability
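# parseHeaders() is called by indexFasta() below but is not defined in this
# module. A minimal sketch, assuming the desired key is the first
# whitespace-delimited token of the FASTA title line; adjust to match the
# real header format if the original helper differs.
def parseHeaders(title):
    '''extract a sequence identifier from a FASTA title line
    (hypothetical implementation: the original helper is missing here)
    '''
    return title.split()[0]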
def indexFasta(fasta, limit=None):
    '''re-written to account for some entries having '>' in the title string!
    '''
    index = {}

    def store(head, tail):
        # keep only the longest transcript for each gene
        seq = ''.join(tail)
        if head not in index or len(seq) > len(index[head]):
            index[head] = seq
            return True
        return False

    with open(fasta) as F:
        head = ''
        tail = []
        records = 0
        for line in F:
            if line.startswith('>'):
                if tail:
                    if limit is not None and records >= limit:
                        return index
                    if store(head, tail):
                        records += 1
                head = parseHeaders(line.lstrip('>').strip())
                tail = []
            else:
                tail.append(line.strip())
        # store the final record in the file
        if tail and (limit is None or records < limit):
            store(head, tail)
    return index
def createAltAllele(seq, SNP_frequency):
    '''simulate the mutation process to generate a heterozygous allele
    '''
    ref_seq = list(seq)
    # duplicate the refseq
    alt_seq = ref_seq.copy()
    # choose randomised positions to mutate
    mutation_indices = mutateString(ref_seq, SNP_frequency)
    # get the ref nucleotides at these positions
    ref_nucleotides = [ref_seq[i] for i in mutation_indices]
    # choose a random nucleotide for each position, ensuring it differs from the ref nucleotide
    alt_nucleotides = [random.choice(list(set('ATCG') - {r})) for r in ref_nucleotides]
    # perform the substitutions in the alt_seq
    for index, alt in zip(mutation_indices, alt_nucleotides):
        alt_seq[index] = alt
    # return the mutated sequence plus the details of any alt/ref bases it contains
    return ''.join(alt_seq), list(zip(mutation_indices, ref_nucleotides, alt_nucleotides))
def mutateString(ref_seq, SNP_frequency):
'''based on a specified frequency, return a list of positions to mutate
Note: there are many possible ways this could be done
'''
i = len(ref_seq)
t = 0
mutation_indices = []
while t < i:
p = np.random.geometric(SNP_frequency)
t += p
if t < i:
mutation_indices.append(t)
return mutation_indices
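# A minimal sanity check for mutateString() (hypothetical helper, not part
# of the pipeline): np.random.geometric(p) draws inter-SNP gaps with mean
# 1/p, so over a long sequence the observed SNP density should approach
# SNP_frequency.
def _checkMutationDensity(length=1000000, SNP_frequency=0.000625):
    '''return the observed SNP density on a dummy sequence (expect ~SNP_frequency)'''
    hits = mutateString('A'*length, SNP_frequency)
    return len(hits) / length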
def prepareData(args):
'''index the reference transcriptome, assign read counts, then decide
which genes are going to be ASE
'''
assert args.output_tag # blank tags not allowed
assert os.path.exists(args.rpkm_file)
assert os.path.exists(args.transcriptome)
assert os.path.exists(args.fastq)
# index the reference expression data
RPKM_index = pd.read_csv(args.rpkm_file)
n_seqs = RPKM_index.shape[0] # number of rows in the df
# extract sequence data from an existing reference library, e.g. a transcriptome file
seqs = indexFasta(args.transcriptome, limit=args.gene_limit)
heads, tails = list(zip(*seqs.items()))
n_seqs = min([n_seqs, len(seqs)])
# randomly obtain a list of RPKM values with a realistic distribution profile
rpkm = random.sample(list(RPKM_index[args.tissue]), n_seqs)
'''alternative models for testing'''
# rpkm = [10]*n_seqs
# rpkm = sorted(list(RPKM_index[args.tissue]), reverse=True)[:n_seqs]
# randomly select ASE genes at the desired frequency
decisions = [randomDecision(args.ase_frequency) for i in range(n_seqs)]
df = pd.DataFrame({
'gene':heads,
'seq' :tails,
'ase' :decisions,
'rpkm':rpkm,
})
# determine the total read count per locus from the RPKM values, desired library size and gene lengths
df['reads'] = df.apply(lambda row: \
round(row.rpkm * (len(row.seq) / 1000) * (args.lib_size / 1000000)),
axis=1)
#print(np.sum(df.reads))
return df
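# Worked example of the RPKM-to-reads conversion above (illustrative values):
# rpkm=10, transcript length=2000 bp and lib_size=2,315,672 read pairs gives
# round(10 * (2000/1000) * (2315672/1000000)) = round(46.31) = 46 reads.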
def defineCountsAndSequences(df, args):
'''loop through each gene (transcript) constructing a second
allele (based on a defined mutation rate) and read counts for each allele
(divide up the total reads between ref & alt alleles: proportions depend on whether the
gene has been assigned as ASE or not)
'''
statistics = {}
snp_counter = 1
    ratio = 1/args.inbalance, 1 # e.g. (0.5, 1)
    proportions = [i/sum(ratio) for i in ratio] # e.g. (0.33, 0.67)
# make a copy of the dataframe
df1 = df.copy()
# add some empty columns to the new dataframe
df1['alt_seq'] = ''
df1['ref_read_count'] = 0
df1['alt_read_count'] = 0
# iterate over each gene/transcript
for row in df.itertuples():
# create the alt allele
alt_seq, stats = createAltAllele(row.seq, args.snp_frequency)
# determine allele-specific read counts
if row.ase:
ref_read_count, alt_read_count = [round(row.reads*p) for p in proportions]
else:
ref_read_count, alt_read_count = [round(row.reads*p) for p in [0.5, 0.5]]
# add the allele data to the new dataframe
df1.loc[row[0], 'alt_seq'] = str(alt_seq)
df1.loc[row[0], 'ref_read_count'] = int(ref_read_count)
df1.loc[row[0], 'alt_read_count'] = int(alt_read_count)
genewise_snp_count = 1 # only used to assign <transcript_snp_id>
# format stats for logging
for pos, ref, alt in stats:
statistics['SNP_%s' % snp_counter] = pd.Series({
'gene_id' : row.gene,
'transcript_snp_id': '%s_%d' % (row.gene, genewise_snp_count),
'ase': row.ase,
'rpkm': row.rpkm,
'total_read_count': row.reads,
'ref_read_count': ref_read_count,
'alt_read_count': alt_read_count,
'transcript_length': len(row.seq),
'relative_snp_position': pos,
'ref_nucleotide': ref,
'alt_nucleotide': alt,
})
snp_counter += 1
genewise_snp_count += 1
# process and format the stats output
statistics = pd.DataFrame(statistics).T
statistics['average_coverage'] = statistics.total_read_count * args.read_length / statistics.transcript_length
snp_count = dict([(gene, list(statistics.gene_id).count(gene)) for gene in list(set(statistics.gene_id))])
statistics['snps_per_transcript'] = statistics.apply(lambda row: snp_count[row.gene_id], axis=1)
return df1, statistics
def chooseInsert(seq, args, READ_COUNTER, max_attempts=5):
'''choose the insert start and end positions based on a
predetermined distribution of insert sizes
'''
start = random.choice(range(len(seq)))
if args.uniform_insert_sizes: # try to make all reads the same length
insert_size = min([
args.insert_mean, # this is our target size
len(seq) - start, # but reads sometimes need to be smaller if they are close to the end
])
else: # we are going to use a (potentially skewed) normal distribution to determine read sizes
'''make sure reads do not exceed the specified max length,
but also that they do not exceed the end of the transcript
'''
insert_max = min([args.insert_max, len(seq)-start])
'''if the maximum insert size is smaller than the smallest value in the
insert_distribution, it is not possible to use random sampling to get a
suitable value, so use one we already have'''
if insert_max <= args.insert_distribution_min:
insert_size = insert_max
else:
count = 0
while True:
insert_size = args.insert_distribution.random_value()
if args.insert_min <= insert_size <= insert_max:
break
'''Avoid spending too much time sampling. If the insert_max
is close to the margins of the insert_distribution, it may take
millions of attempts to locate a suitable value!'''
if count == max_attempts: # give up after n failed attempts!
insert_size = insert_max
break
count += 1
end = int(start+insert_size)
return start, end
def invert(i, s):
'''return a coordinate corresponding to a position on the reverse strand
that matches a coordinate on the forward strand, i.e.
::
0 5
| --> |
AAAAAAAAAAAAAAAA
TTTTTTTTTTTTTTTT
| |
9 <-- 0
'''
return len(s) - i - 1
#def pad(s, args):
# '''if a read is shorter than intended, pad with N's
# '''
# return s + 'N'*max(0, args.read_length-len(s))
def createReadPair(seq, reads_file_1, reads_file_2, READ_COUNTER, args):
    '''simulate a single pair of reads in an NGS experiment
    '''
    # select an insert
    start, end = chooseInsert(seq, args, READ_COUNTER) # forward strand position
    insert = seq[start:end+1]
    # reads run into the adaptor (then padding Ns) when the insert is shorter than the read length
    template_1 = insert + args.adaptor + 'N'*args.read_length
    read_1 = template_1[:args.read_length]
    template_2 = str(Seq.Seq(insert).reverse_complement()) + args.adaptor + 'N'*args.read_length
    read_2 = template_2[:args.read_length]
# ensure the first read starts on line 0
if READ_COUNTER == 0:
linebreak = ''
else:
linebreak = '\n'
reads_file_1.write(linebreak+fastq({
'simulation '+str(READ_COUNTER)+'/1': read_1,
}, args))
reads_file_2.write(linebreak+fastq({
'simulation '+str(READ_COUNTER)+'/2': read_2,
}, args))
if READ_COUNTER % 1000 == 0: # write buffer to disk every 1000 reads
reads_file_1.flush()
reads_file_2.flush()
READ_COUNTER += 1
return READ_COUNTER
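# Worked example of the read construction above (illustrative values): with
# read_length=124, a 100 bp insert and the 13 bp adaptor, read_1 is the full
# insert, then the adaptor, then 11 padding Ns; inserts longer than
# read_length are simply truncated to the first 124 bases.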
def readQualscores(args):
'''basic function for extracting quality strings from a fastq file
'''
quals = []
with open(args.fastq, 'r') as fastq:
for i, line in enumerate(fastq.readlines()):
            if i % 4 == 3: # the quality string: the 4th line of each 4-line fastq record
line = line.strip()
if len(line) >= args.read_length:
quals.append(line)
random.shuffle(quals)
return quals
def createReadLibrary(df, reads_file_1, reads_file_2, args):
'''loop through a dataframe containing sequences (ref and alt alleles)
and expression levels (ref and alt counts) and create two dictionaries of
simulated NGS reads (fwd and reverse read pairs)
'''
READ_COUNTER = 0
args.read_target = np.sum(df.ref_read_count) + np.sum(df.alt_read_count)
insert_distribution = [int(i) for i in list(stats.skewnorm.rvs(
a=args.skew,
size=args.read_target+10000, # add a buffer in case of rounding errors etc.
loc=args.insert_mean,
scale=args.insert_sd,
))]
random.shuffle(insert_distribution)
args.insert_distribution = RandomDict([(k,v) for k,v in enumerate(insert_distribution)])
args.insert_distribution_min = min(insert_distribution)
args.qual_scores = RandomDict([(k,v) for k,v in enumerate(readQualscores(args))])
# iterate over each gene/transcript
for row in df.itertuples():
# create ref_seq reads
for i in range(row.ref_read_count):
READ_COUNTER = createReadPair(
row.seq, reads_file_1, reads_file_2, READ_COUNTER, args)
# create alt_seq reads
for i in range(row.alt_read_count):
READ_COUNTER = createReadPair(
row.alt_seq, reads_file_1, reads_file_2, READ_COUNTER, args)
def simulateSequencingExperiment(args):
    '''create a simulated NGS dataset containing allelic imbalance in the
    expression of a defined proportion of genes.
    This function can be modified to output multiple sequencing simulations
    from a single simulated RNA sample, but with varying read counts, to
    assess the effect of increasing sequencing depth
    '''
logging.basicConfig(filename=args.logfile, level=logging.DEBUG)
for arg, value in vars(args).items():
logging.info('%s: %r' % (arg, value))
# simulate generation of the RNA library
df = prepareData(args)
df, statistics = defineCountsAndSequences(df, args)
outfile_1 = 'simulation_%s.stats.txt' % args.output_tag
if not args.clobber:
assert not os.path.exists(outfile_1)
statistics.to_csv(outfile_1)
# simulate the sequencing run/s
multipliers = [int(i) for i in args.multipliers.split(',')]
for m in multipliers:
df_new = df.copy()
df_new.ref_read_count *= m
df_new.alt_read_count *= m
outfile_2 = 'simulation_%s_x%d.1.fastq' % (args.output_tag, m)
outfile_3 = 'simulation_%s_x%d.2.fastq' % (args.output_tag, m)
for f in [outfile_2, outfile_3]:
if not args.clobber:
assert not os.path.exists(f)
        with open(outfile_2, 'w') as reads_file_1, open(outfile_3, 'w') as reads_file_2:
            createReadLibrary(df_new, reads_file_1, reads_file_2, args)
def createTestInsert(s_fwd, args):
    '''create an insert and return its length (for testing purposes only)
    '''
    start, end = chooseInsert(s_fwd, args, 0) # forward strand positions
    return end - start
def printStats(a, args):
print('\t'.join([str(obj) for obj in ['', 'target', 'actual']]))
print('\t'.join([str(obj) for obj in ['mean', args.insert_mean, np.mean(a)]]))
print('\t'.join([str(obj) for obj in ['median', '-', np.median(a)]]))
print('\t'.join([str(obj) for obj in ['sd', args.insert_sd, np.std(a)]]))
print('\t'.join([str(obj) for obj in ['min', args.insert_min, np.min(a)]]))
print('\t'.join([str(obj) for obj in ['max', args.insert_max, np.max(a)]]))
def test_1(args):
'''plot insert sizes from an invariant pseudo-transcriptome
'''
import matplotlib.pyplot as plt
a = np.array([createTestInsert('A'*10000, args) for i in range(5000)])
plt.hist(a, bins=50)
plt.show()
printStats(a, args)
def test_2(args):
'''plot insert sizes from an actual transcriptome
'''
import matplotlib.pyplot as plt
seqs = indexFasta(args.transcriptome, limit=args.gene_limit)
heads, tails = list(zip(*seqs.items()))
a = np.array([createTestInsert(random.choice(tails), args) for i in range(5000)])
plt.hist(a, bins=50)
plt.show()
printStats(a, args)
def test_3(args):
'''
model the insert size distribution of an actual transcriptome
>>> from scipy import stats
# choose some parameters
>>> a, loc, scale = 1.3, -0.1, 2.2
# draw a sample
>>> sample = stats.skewnorm(a, loc, scale).rvs(1000)
# estimate parameters from sample
>>> ae, loce, scalee = stats.skewnorm.fit(sample)
>>> ae
1.2495366661560348
>>> loce
-0.039775813819310835
>>> scalee
2.1126121580965536
NOTE: the file 'ERR2353209.insert_size_metrics.txt' has insert data based on
mapping RNA-Seq reads to a genome. Therefore, some reported inserts will
span splice junctions and thus be recorded as longer than they actually are.
It was generated by running picard::CollectInsertSizeMetrics
I do not plan to modify it, however one way would be to map reads to the
transcriptome instead, then re-measure the insert sizes.
'''
import matplotlib.pyplot as plt
df = pd.read_csv('ERR2353209.insert_size_metrics.txt',
delim_whitespace=True, header=0, skiprows=10)
sample = []
for row in df.itertuples():
for i in range(int(row[2])):
sample.append(row[1])
ae, loce, scalee = stats.skewnorm.fit(sample)
# print(ae, loce, scalee)
# 8.4 97.3 149.2
# set defaults to: 8, 100, 150
insert_distribution = stats.skewnorm.rvs(a=args.skew,
loc=args.insert_mean,
scale=args.insert_sd,
size=args.lib_size,
)
plt.hist(insert_distribution, bins=50)
plt.show()
printStats(insert_distribution, args)
insert_distribution = stats.skewnorm.rvs(a=ae,
loc=loce,
scale=scalee,
size=args.lib_size,
)
plt.hist(insert_distribution, bins=50)
plt.show()
printStats(insert_distribution, args)
def test_4(args):
    '''run the full simulation pipeline end-to-end
    '''
simulateSequencingExperiment(args)
def argParser():
parser = argparse.ArgumentParser()
# required argument
parser.add_argument('-a', '--output_tag', required=True,
help='a tag that gets added to all output file names')
# optional args...
parser.add_argument('-b', '--snp_frequency', required=False, type=float, default=0.000625,
help='proportion of coding bases that are heterozygous')
    parser.add_argument('-c', '--ase_frequency', required=False, type=float, default=0.1,
                        help='proportion of genes with allelic expression imbalance')
parser.add_argument('-d', '--lib_size', required=False, type=int, default=2315672,
help='target library size (in read pairs)')
parser.add_argument('-e', '--read_length', required=False, type=int, default=124,
help='length of single reads')
    parser.add_argument('-f', '--inbalance', required=False, type=float, default=2,
                        help='extent of skew toward the most abundant allele')
parser.add_argument('-g', '--gene_limit', required=False, type=int, default=1000000,
help='only process the first <gene_limit> transcripts')
parser.add_argument('-i', '--adaptor', required=False, default='AGATCGGAAGAGC',
help='e.g. illumina 3p adaptor')
parser.add_argument('-j', '--rpkm_file', required=False, default='rpkm_fixed.csv',
help='a table of RPKM expression values')
parser.add_argument('-k', '--tissue', required=False, default='testis',
help='which tissue to model expression values on?')
parser.add_argument('-l', '--transcriptome', required=False, default="water_buffalo_transcriptome.fasta",
help='a transcriptome file containing the longest transcript at each protein coding locus')
parser.add_argument('-m', '--multipliers', required=False, default='1,2,4,8,16,32',
help='simulate multiple runs with these different read depths, relative to the target library size')
parser.add_argument('-n', '--clobber', action='store_true',
help='whether to overwrite existing files')
parser.add_argument('-o', '--insert_mean', required=False, type=float, default=100,
help='average insert length')
parser.add_argument('-p', '--insert_sd', required=False, type=float, default=150,
help='standard deviation of insert length')
parser.add_argument('-q', '--skew', required=False, type=float, default=8,
help='skews the insert size distribution')
parser.add_argument('-r', '--insert_min', required=False, type=int, default=0,
help='minimum allowed insert size')
parser.add_argument('-s', '--insert_max', required=False, type=int, default=1000,
help='maximum allowed insert size')
parser.add_argument('-t', '--uniform_insert_sizes', action='store_true',
help='make all inserts equal to <insert_mean> size')
    parser.add_argument('-u', '--logfile', required=False, default='logfile.txt',
                        help='file to write log messages to')
parser.add_argument('-v', '--fastq', required=False, default='ERR2353209.1.fastq',
help='an example fastq file that we will steal quality strings from')
return parser
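# Example invocation (the script name is a placeholder; the file names are
# the defaults from argParser() above):
#   python simulate_reads.py -a demo_run \
#       --transcriptome water_buffalo_transcriptome.fasta \
#       --rpkm_file rpkm_fixed.csv --fastq ERR2353209.1.fastq --clobber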
def main():
myParser = argParser()
simulateSequencingExperiment(myParser.parse_args())
# random.seed(18)
# np.random.seed(20)
#
# test_1(myParser.parse_args([
# '-a', 'test_1',
# '--insert_min', '125',
# '--insert_max', '3000',
# '--insert_mean', '180',
# '--insert_sd', '50',
## '--uniform_insert_sizes',
# ]))
# test_2(myParser.parse_args([
# '--output_tag', 'test_2',
# '--transcriptome', 'C:/Users/cow082/2genes.fa',
# '--insert_min', '0',
# '--insert_max', '1000',
# '--insert_mean', '180',
# '--insert_sd', '50',
## '--uniform_insert_sizes',
# ]))
#
# test_3(myParser.parse_args([
# '--output_tag', 'test_3',
# '--transcriptome', 'C:/Users/cow082/2genes.fa',
# '--lib_size', '1000',
# '--clobber',
# '--insert_min', '0',
# '--insert_max', '1000',
# '--insert_mean', '180',
# '--insert_sd', '75',
# '--skew', '4',
# ]))
#
# test_4(myParser.parse_args([
# '--output_tag', 'test_4',
# '--transcriptome', 'C:/Users/cow082/2genes.fa',
# '--lib_size', '10000',
# '--clobber',
# ]))
if __name__ == '__main__':
main()