Source code for simulate_reads

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""

.. Created on Thu Sep 20 13:53:33 2018

   @author: chris

"""

import os
import argparse
import logging
import random
import numpy as np
import scipy.stats as stats
import pandas as pd
from randomdict import RandomDict
from Bio import Seq


"""

.. Note: the first 3 functions, which generate a random sequence library,
   are not presently used but could be useful later

"""

def createRandomSequence(mu=3392, sigma=2600, m=300, M=10000):
    '''create a random ATCG string with random length.

    Note: use the normal distribution to choose the length
    (mean: 3392bp, s.d. 2600bp), but discard lengths outside the range [m, M]
    '''
    length = 0
    while not m <= length <= M:
        length = int(np.random.normal(mu, sigma))
    return ''.join([random.choice('ATCG') for i in range(length)])

def createRandomSequenceIndex(n):
    '''construct a dictionary {seq_id: sequence} of random ATCG sequences
    (for default values see the createRandomSequence() function)
    '''
    return dict([('gene_%s' % (i+1), createRandomSequence()) for i in range(n)])

def createRandomSequenceLibrary(n_seqs=400):
    '''generate a random sequence library and save it to file
    '''
    seqs = createRandomSequenceIndex(n_seqs)
    transcriptome = fasta(seqs)
    with open('simulated_transcriptome.fasta', 'w') as F:
        F.write(transcriptome)

def fastWrap(text, width=None):
    '''much quicker than textwrap.wrap
    '''
    if not width:
        return text
    return '\n'.join(text[i:i+width] for i in range(0, len(text), width))

def fasta(seq_index, wrap=None):
    '''format a dictionary of {label: sequence} pairs into a fasta string
    '''
    return '\n'.join(['>%s\n%s' % (k, fastWrap(v, wrap))
                      for k, v in seq_index.items()])

def fastq(seq_index, args, wrap=None):
    '''format a dictionary of {label: sequence} pairs into a fastq string
    '''
    return '\n'.join(['@%s\n%s\n+\n%s' % (k, fastWrap(v, wrap),
                                          fastWrap(qualScore(args), wrap))
                      for k, v in seq_index.items()])

def qualScore(args):
    '''select a quality string at random from a real RNA-Seq dataset

    NOTE: It is assumed that the available quality strings are >= the length
    of the sequence strings. readQualscores() enforces this by filtering out
    strings shorter than read_length at the time of args.qual_scores
    construction.
    '''
    return args.qual_scores.random_value()[:args.read_length]

def randomDecision(probability=0.5):
    '''return a random True/False based on:
       <probability> chance of returning True
       1 - <probability> chance of returning False
    '''
    assert 0 <= probability <= 1
    return random.random() < probability

def parseHeaders(s):
    '''custom fasta header parser specific to this dataset;
    might not work with other datasets
    '''
    parts = s.split('gene=')
    if len(parts) > 1:
        return parts[1].split(']')[0]
    else:
        return s.split('[')[0].strip()

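
# Example (illustrative, not part of the original script; the header shown is
# a hypothetical NCBI-style defline of the kind this parser appears to target):
#
# >>> parseHeaders('lcl|NC_037545.1_mRNA_1 [gene=LOC102395376] [product=kinase]')
# 'LOC102395376'
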
def indexFasta(fasta, limit=None):
    '''index a fasta file as {gene: sequence}; re-written to account for
    some entries having '>' in the title string!
    '''
    index = {}
    with open(fasta) as F:
        head = ''
        tail = []
        records = 0
        for line in F:
            if line[0] == '>':
                if tail:
                    if limit is not None and records >= limit:
                        tail = []   # discard the partial record and stop
                        break
                    seq = ''.join(tail)
                    if head in index:
                        if len(seq) > len(index[head]):
                            # keep only the longest transcript for each gene
                            index[head] = seq
                            records += 1
                    else:
                        index[head] = seq
                        records += 1
                head = parseHeaders(line.lstrip('>').strip())
                tail = []
            else:
                tail.append(line.strip())
        # don't drop the final record in the file
        if tail and (limit is None or records < limit):
            seq = ''.join(tail)
            if head not in index or len(seq) > len(index[head]):
                index[head] = seq
    return index

def createAltAllele(seq, SNP_frequency):
    '''simulate the mutation process to generate a heterozygous allele
    '''
    ref_seq = list(seq)          # duplicate the refseq
    alt_seq = ref_seq.copy()
    # choose randomised positions to mutate
    mutation_indices = mutateString(ref_seq, SNP_frequency)
    # get the ref nucleotides at these positions
    ref_nucleotides = [ref_seq[i] for i in mutation_indices]
    # choose a random nucleotide for each position, making sure it is
    # different from the ref nucleotide
    alt_nucleotides = [random.choice(list(set('ATCG').difference({r})))
                       for r in ref_nucleotides]
    # perform the substitutions in the alt_seq
    for index, ref, alt in zip(mutation_indices, ref_nucleotides, alt_nucleotides):
        alt_seq[index] = alt
    # return the mutated sequence plus the details of any alt/ref bases it contains
    return ''.join(alt_seq), zip(mutation_indices, ref_nucleotides, alt_nucleotides)

def mutateString(ref_seq, SNP_frequency):
    '''based on a specified frequency, return a list of positions to mutate

    Note: there are many possible ways this could be done
    '''
    i = len(ref_seq)
    t = 0
    mutation_indices = []
    while t < i:
        # the gap to the next SNP is a geometric draw, giving an expected
        # spacing of 1/SNP_frequency bases
        p = np.random.geometric(SNP_frequency)
        t += p
        if t < i:
            mutation_indices.append(t)
    return mutation_indices

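
# Illustrative sanity check (not part of the original script): because the
# inter-SNP gaps are geometric draws, the expected spacing is 1/SNP_frequency
# bases. With the default snp_frequency of 0.000625 that is 1600 bp, i.e.
# ~10 SNPs on a 16 kb transcript:
#
# >>> counts = [len(mutateString('A' * 16000, 0.000625)) for _ in range(1000)]
# >>> np.mean(counts)   # expect a value close to 10
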
def prepareData(args):
    '''index the reference transcriptome, assign read counts, then decide
    which genes are going to be ASE
    '''
    assert args.output_tag                      # blank tags not allowed
    assert os.path.exists(args.rpkm_file)
    assert os.path.exists(args.transcriptome)
    assert os.path.exists(args.fastq)

    # index the reference expression data
    RPKM_index = pd.read_csv(args.rpkm_file)
    n_seqs = RPKM_index.shape[0]                # number of rows in the df

    # extract sequence data from an existing reference library,
    # e.g. a transcriptome file
    seqs = indexFasta(args.transcriptome, limit=args.gene_limit)
    heads, tails = list(zip(*seqs.items()))
    n_seqs = min([n_seqs, len(seqs)])
    heads, tails = heads[:n_seqs], tails[:n_seqs]   # keep all columns the same length

    # randomly obtain a list of RPKM values with a realistic distribution profile
    rpkm = random.sample(list(RPKM_index[args.tissue]), n_seqs)

    # alternative models for testing:
    # rpkm = [10]*n_seqs
    # rpkm = sorted(list(RPKM_index[args.tissue]), reverse=True)[:n_seqs]

    # randomly select ASE genes at the desired frequency
    decisions = [randomDecision(args.ase_frequency) for i in range(n_seqs)]

    df = pd.DataFrame({
        'gene': heads,
        'seq':  tails,
        'ase':  decisions,
        'rpkm': rpkm,
    })

    # determine the total read count per locus from the RPKM values,
    # desired library size and gene lengths
    df['reads'] = df.apply(lambda row:
        round(row.rpkm * (len(row.seq) / 1000) * (args.lib_size / 1000000)),
        axis=1)
    # print(np.sum(df.reads))
    return df

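
# Worked example (illustrative, not part of the original script): the 'reads'
# column inverts the RPKM definition, RPKM = reads / (kb_length * millions_of_reads).
# For a hypothetical 2 kb transcript at 50 RPKM in the default 2,315,672-pair library:
#
# >>> round(50 * (2000 / 1000) * (2315672 / 1000000))
# 232
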
def defineCountsAndSequences(df, args):
    '''loop through each gene (transcript) constructing a second allele
    (based on a defined mutation rate) and read counts for each allele
    (divide up the total reads between ref & alt alleles: proportions
    depend on whether the gene has been assigned as ASE or not)
    '''
    statistics = {}
    snp_counter = 1
    ratio = 1/args.inbalance, 1                     # e.g. (0.5, 1)
    proportions = [i/sum(ratio) for i in ratio]     # e.g. (0.33, 0.66)

    # make a copy of the dataframe
    df1 = df.copy()
    # add some empty columns to the new dataframe
    df1['alt_seq'] = ''
    df1['ref_read_count'] = 0
    df1['alt_read_count'] = 0

    # iterate over each gene/transcript
    for row in df.itertuples():
        # create the alt allele
        # ('snp_details' rather than 'stats', to avoid shadowing scipy.stats)
        alt_seq, snp_details = createAltAllele(row.seq, args.snp_frequency)

        # determine allele-specific read counts
        if row.ase:
            ref_read_count, alt_read_count = [round(row.reads*p) for p in proportions]
        else:
            ref_read_count, alt_read_count = [round(row.reads*p) for p in [0.5, 0.5]]

        # add the allele data to the new dataframe
        df1.loc[row[0], 'alt_seq'] = str(alt_seq)
        df1.loc[row[0], 'ref_read_count'] = int(ref_read_count)
        df1.loc[row[0], 'alt_read_count'] = int(alt_read_count)

        genewise_snp_count = 1   # only used to assign <transcript_snp_id>

        # format stats for logging
        for pos, ref, alt in snp_details:
            statistics['SNP_%s' % snp_counter] = pd.Series({
                'gene_id': row.gene,
                'transcript_snp_id': '%s_%d' % (row.gene, genewise_snp_count),
                'ase': row.ase,
                'rpkm': row.rpkm,
                'total_read_count': row.reads,
                'ref_read_count': ref_read_count,
                'alt_read_count': alt_read_count,
                'transcript_length': len(row.seq),
                'relative_snp_position': pos,
                'ref_nucleotide': ref,
                'alt_nucleotide': alt,
            })
            snp_counter += 1
            genewise_snp_count += 1

    # process and format the stats output
    statistics = pd.DataFrame(statistics).T
    statistics['average_coverage'] = (statistics.total_read_count
                                      * args.read_length
                                      / statistics.transcript_length)
    snp_count = dict([(gene, list(statistics.gene_id).count(gene))
                      for gene in list(set(statistics.gene_id))])
    statistics['snps_per_transcript'] = statistics.apply(
        lambda row: snp_count[row.gene_id], axis=1)
    return df1, statistics

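
# Illustrative arithmetic (not part of the original script): with the default
# inbalance of 2, ratio = (0.5, 1) and proportions = (1/3, 2/3), so an ASE
# gene with 300 total reads is split 100 ref / 200 alt:
#
# >>> ratio = 1/2, 1
# >>> [round(300 * (i / sum(ratio))) for i in ratio]
# [100, 200]
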
def chooseInsert(seq, args, READ_COUNTER, max_attempts=5):
    '''choose the insert start and end positions based on a predetermined
    distribution of insert sizes
    '''
    start = random.choice(range(len(seq)))
    if args.uniform_insert_sizes:
        # try to make all reads the same length
        insert_size = min([
            args.insert_mean,    # this is our target size
            len(seq) - start,    # but reads sometimes need to be smaller
                                 # if they are close to the end
        ])
    else:
        # we are going to use a (potentially skewed) normal distribution
        # to determine read sizes

        # make sure reads do not exceed the specified max length, but also
        # that they do not exceed the end of the transcript
        insert_max = min([args.insert_max, len(seq) - start])

        # if the maximum insert size is smaller than the smallest value in
        # the insert_distribution, it is not possible to use random sampling
        # to get a suitable value, so use one we already have
        if insert_max <= args.insert_distribution_min:
            insert_size = insert_max
        else:
            count = 0
            while True:
                insert_size = args.insert_distribution.random_value()
                if args.insert_min <= insert_size <= insert_max:
                    break
                # Avoid spending too much time sampling. If the insert_max
                # is close to the margins of the insert_distribution, it may
                # take millions of attempts to locate a suitable value!
                if count == max_attempts:
                    # give up after n failed attempts!
                    insert_size = insert_max
                    break
                count += 1
    end = int(start + insert_size)
    return start, end

def invert(i, s):
    '''return a coordinate corresponding to a position on the reverse strand
    that matches a coordinate on the forward strand, i.e. ::

        0   -->  9
        |        |
        AAAAAAAAAA
        TTTTTTTTTT
        |        |
        9   <--  0
    '''
    return len(s) - i - 1

# def pad(s, args):
#     '''if a read is shorter than intended, pad with N's
#     '''
#     return s + 'N'*max(0, args.read_length-len(s))
def createReadPair(seq, reads_file_1, reads_file_2, READ_COUNTER, args):
    '''simulate a single pair of reads in an NGS experiment
    '''
    # select an insert
    start, end = chooseInsert(seq, args, READ_COUNTER)   # forward strand position
    insert = seq[start:end+1]

    # each template is insert + adaptor + N-padding; the read is the first
    # <read_length> bases of the template
    template_1 = insert + args.adaptor + 'N'*args.read_length
    read_1 = template_1[:args.read_length]
    template_2 = (str(Seq.Seq(insert).reverse_complement())
                  + args.adaptor + 'N'*args.read_length)
    read_2 = template_2[:args.read_length]

    # ensure the first read starts on line 0
    if READ_COUNTER == 0:
        linebreak = ''
    else:
        linebreak = '\n'

    reads_file_1.write(linebreak + fastq({
        'simulation '+str(READ_COUNTER)+'/1': read_1,
    }, args))
    reads_file_2.write(linebreak + fastq({
        'simulation '+str(READ_COUNTER)+'/2': read_2,
    }, args))

    if READ_COUNTER % 1000 == 0:
        # write buffer to disk every 1000 reads
        reads_file_1.flush()
        reads_file_2.flush()

    READ_COUNTER += 1
    return READ_COUNTER

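
# Worked example (illustrative, not part of the original script; the values
# are hypothetical): inserts shorter than the read length read through into
# the adaptor. With read_length=10, the default adaptor, and a 6 bp insert:
#
# >>> template_1 = 'ACGTAC' + 'AGATCGGAAGAGC' + 'N' * 10
# >>> template_1[:10]
# 'ACGTACAGAT'
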
def readQualscores(args):
    '''basic function for extracting quality strings from a fastq file
    '''
    quals = []
    with open(args.fastq, 'r') as F:   # 'F' avoids shadowing the fastq() function
        for i, line in enumerate(F):
            if i % 4 == 3:
                # the quality string is the 4th line of each 4-line fastq record
                line = line.strip()
                if len(line) >= args.read_length:
                    quals.append(line)
    random.shuffle(quals)
    return quals

def createReadLibrary(df, reads_file_1, reads_file_2, args):
    '''loop through a dataframe containing sequences (ref and alt alleles)
    and expression levels (ref and alt counts) and write two fastq files of
    simulated NGS reads (fwd and reverse read pairs)
    '''
    READ_COUNTER = 0
    args.read_target = np.sum(df.ref_read_count) + np.sum(df.alt_read_count)
    insert_distribution = [int(i) for i in list(stats.skewnorm.rvs(
        a=args.skew,
        size=args.read_target+10000,   # add a buffer in case of rounding errors etc.
        loc=args.insert_mean,
        scale=args.insert_sd,
    ))]
    random.shuffle(insert_distribution)
    args.insert_distribution = RandomDict(
        [(k, v) for k, v in enumerate(insert_distribution)])
    args.insert_distribution_min = min(insert_distribution)
    args.qual_scores = RandomDict(
        [(k, v) for k, v in enumerate(readQualscores(args))])

    # iterate over each gene/transcript
    for row in df.itertuples():
        # create ref_seq reads
        for i in range(row.ref_read_count):
            READ_COUNTER = createReadPair(
                row.seq, reads_file_1, reads_file_2, READ_COUNTER, args)
        # create alt_seq reads
        for i in range(row.alt_read_count):
            READ_COUNTER = createReadPair(
                row.alt_seq, reads_file_1, reads_file_2, READ_COUNTER, args)

def simulateSequencingExperiment(args):
    '''create a simulated NGS dataset containing allelic imbalance in the
    expression of a defined proportion of genes

    this function can be modified to output multiple sequencing simulations
    from a single simulated RNA sample, but with varying read counts, to
    assess the effect of increasing sequencing depth
    '''
    logging.basicConfig(filename=args.logfile, level=logging.DEBUG)
    for arg, value in vars(args).items():
        logging.info('%s: %r' % (arg, value))

    # simulate generation of the RNA library
    df = prepareData(args)
    df, statistics = defineCountsAndSequences(df, args)

    outfile_1 = 'simulation_%s.stats.txt' % args.output_tag
    if not args.clobber:
        assert not os.path.exists(outfile_1)
    statistics.to_csv(outfile_1)

    # simulate the sequencing run(s)
    multipliers = [int(i) for i in args.multipliers.split(',')]
    for m in multipliers:
        df_new = df.copy()
        df_new.ref_read_count *= m
        df_new.alt_read_count *= m
        outfile_2 = 'simulation_%s_x%d.1.fastq' % (args.output_tag, m)
        outfile_3 = 'simulation_%s_x%d.2.fastq' % (args.output_tag, m)
        for f in [outfile_2, outfile_3]:
            if not args.clobber:
                assert not os.path.exists(f)
        reads_file_1 = open(outfile_2, 'w')
        reads_file_2 = open(outfile_3, 'w')
        createReadLibrary(df_new, reads_file_1, reads_file_2, args)
        reads_file_1.close()
        reads_file_2.close()

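
# Example invocation (illustrative, not part of the original script; the
# script file name is assumed, and the input file names are simply the
# argparser defaults defined below):
#
#   python simulate_reads.py -a myrun --clobber \
#       --transcriptome water_buffalo_transcriptome.fasta \
#       --rpkm_file rpkm_fixed.csv \
#       --fastq ERR2353209.1.fastq
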
def createTestInsert(s_fwd, args):
    '''create an insert and return its size (for testing purposes only)
    '''
    # READ_COUNTER is not used by chooseInsert, so pass a dummy value
    start, end = chooseInsert(s_fwd, args, 0)   # forward strand positions
    return end - start

def printStats(a, args):
    print('\t'.join([str(obj) for obj in ['', 'target', 'actual']]))
    print('\t'.join([str(obj) for obj in ['mean', args.insert_mean, np.mean(a)]]))
    print('\t'.join([str(obj) for obj in ['median', '-', np.median(a)]]))
    print('\t'.join([str(obj) for obj in ['sd', args.insert_sd, np.std(a)]]))
    print('\t'.join([str(obj) for obj in ['min', args.insert_min, np.min(a)]]))
    print('\t'.join([str(obj) for obj in ['max', args.insert_max, np.max(a)]]))

def test_1(args):
    '''plot insert sizes from an invariant pseudo-transcriptome
    '''
    import matplotlib.pyplot as plt
    a = np.array([createTestInsert('A'*10000, args) for i in range(5000)])
    plt.hist(a, bins=50)
    plt.show()
    printStats(a, args)

def test_2(args):
    '''plot insert sizes from an actual transcriptome
    '''
    import matplotlib.pyplot as plt
    seqs = indexFasta(args.transcriptome, limit=args.gene_limit)
    heads, tails = list(zip(*seqs.items()))
    a = np.array([createTestInsert(random.choice(tails), args)
                  for i in range(5000)])
    plt.hist(a, bins=50)
    plt.show()
    printStats(a, args)

def test_3(args):
    '''model the insert size distribution of an actual transcriptome

    >>> from scipy import stats

    # choose some parameters
    >>> a, loc, scale = 1.3, -0.1, 2.2

    # draw a sample
    >>> sample = stats.skewnorm(a, loc, scale).rvs(1000)

    # estimate parameters from sample
    >>> ae, loce, scalee = stats.skewnorm.fit(sample)
    >>> ae
    1.2495366661560348
    >>> loce
    -0.039775813819310835
    >>> scalee
    2.1126121580965536

    NOTE: the file 'ERR2353209.insert_size_metrics.txt' has insert data based
    on mapping RNA-Seq reads to a genome. Therefore, some reported inserts
    will span splice junctions and thus be recorded as longer than they
    actually are. It was generated by running picard::CollectInsertSizeMetrics

    I do not plan to modify it; however, one fix would be to map reads to the
    transcriptome instead, then re-measure the insert sizes.
    '''
    import matplotlib.pyplot as plt
    df = pd.read_csv('ERR2353209.insert_size_metrics.txt',
                     delim_whitespace=True, header=0, skiprows=10)
    sample = []
    for row in df.itertuples():
        for i in range(int(row[2])):
            sample.append(row[1])
    ae, loce, scalee = stats.skewnorm.fit(sample)
    # print(ae, loce, scalee)
    # 8.4 97.3 149.2
    # set defaults to: 8, 100, 150

    insert_distribution = stats.skewnorm.rvs(
        a=args.skew,
        loc=args.insert_mean,
        scale=args.insert_sd,
        size=args.lib_size,
    )
    plt.hist(insert_distribution, bins=50)
    plt.show()
    printStats(insert_distribution, args)

    insert_distribution = stats.skewnorm.rvs(
        a=ae,
        loc=loce,
        scale=scalee,
        size=args.lib_size,
    )
    plt.hist(insert_distribution, bins=50)
    plt.show()
    printStats(insert_distribution, args)

def test_4(args):
    '''run a full end-to-end simulation (for testing purposes only)
    '''
    simulateSequencingExperiment(args)

def argParser():
    parser = argparse.ArgumentParser()
    # required argument
    parser.add_argument('-a', '--output_tag', required=True,
        help='a tag that gets added to all output file names')
    # optional args...
    parser.add_argument('-b', '--snp_frequency', required=False, type=float,
        default=0.000625,
        help='proportion of coding bases that are heterozygous')
    parser.add_argument('-c', '--ase_frequency', required=False, type=float,
        default=0.1,
        help='proportion of genes with allelic expression imbalance')
    parser.add_argument('-d', '--lib_size', required=False, type=int,
        default=2315672,
        help='target library size (in read pairs)')
    parser.add_argument('-e', '--read_length', required=False, type=int,
        default=124,
        help='length of single reads')
    parser.add_argument('-f', '--inbalance', required=False, type=float,
        default=2,
        help='extent of skew toward the most abundant allele')
    parser.add_argument('-g', '--gene_limit', required=False, type=int,
        default=1000000,
        help='only process the first <gene_limit> transcripts')
    parser.add_argument('-i', '--adaptor', required=False,
        default='AGATCGGAAGAGC',
        help='e.g. illumina 3p adaptor')
    parser.add_argument('-j', '--rpkm_file', required=False,
        default='rpkm_fixed.csv',
        help='a table of RPKM expression values')
    parser.add_argument('-k', '--tissue', required=False, default='testis',
        help='which tissue to model expression values on?')
    parser.add_argument('-l', '--transcriptome', required=False,
        default="water_buffalo_transcriptome.fasta",
        help='a transcriptome file containing the longest transcript at each protein coding locus')
    parser.add_argument('-m', '--multipliers', required=False,
        default='1,2,4,8,16,32',
        help='simulate multiple runs with these different read depths, relative to the target library size')
    parser.add_argument('-n', '--clobber', action='store_true',
        help='whether to overwrite existing files')
    parser.add_argument('-o', '--insert_mean', required=False, type=float,
        default=100,
        help='average insert length')
    parser.add_argument('-p', '--insert_sd', required=False, type=float,
        default=150,
        help='standard deviation of insert length')
    parser.add_argument('-q', '--skew', required=False, type=float, default=8,
        help='skews the insert size distribution')
    parser.add_argument('-r', '--insert_min', required=False, type=int,
        default=0,
        help='minimum allowed insert size')
    parser.add_argument('-s', '--insert_max', required=False, type=int,
        default=1000,
        help='maximum allowed insert size')
    parser.add_argument('-t', '--uniform_insert_sizes', action='store_true',
        help='make all inserts equal to <insert_mean> size')
    parser.add_argument('-u', '--logfile', required=False,
        default='logfile.txt',
        help='name of the log file')
    parser.add_argument('-v', '--fastq', required=False,
        default='ERR2353209.1.fastq',
        help='an example fastq file that we will steal quality strings from')
    return parser


def main():
    myParser = argParser()
    simulateSequencingExperiment(myParser.parse_args())

#    random.seed(18)
#    np.random.seed(20)
#
#    test_1(myParser.parse_args([
#        '-a', 'test_1',
#        '--insert_min', '125',
#        '--insert_max', '3000',
#        '--insert_mean', '180',
#        '--insert_sd', '50',
##        '--uniform_insert_sizes',
#    ]))

#    test_2(myParser.parse_args([
#        '--output_tag', 'test_2',
#        '--transcriptome', 'C:/Users/cow082/2genes.fa',
#        '--insert_min', '0',
#        '--insert_max', '1000',
#        '--insert_mean', '180',
#        '--insert_sd', '50',
##        '--uniform_insert_sizes',
#    ]))
#
#    test_3(myParser.parse_args([
#        '--output_tag', 'test_3',
#        '--transcriptome', 'C:/Users/cow082/2genes.fa',
#        '--lib_size', '1000',
#        '--clobber',
#        '--insert_min', '0',
#        '--insert_max', '1000',
#        '--insert_mean', '180',
#        '--insert_sd', '75',
#        '--skew', '4',
#    ]))
#
#    test_4(myParser.parse_args([
#        '--output_tag', 'test_4',
#        '--transcriptome', 'C:/Users/cow082/2genes.fa',
#        '--lib_size', '10000',
#        '--clobber',
#    ]))


if __name__ == '__main__':
    main()