# Source code for read_buffalo_xml_data

# -*- coding: utf-8 -*-

"""

.. Created on Thu Aug  9 16:45:06 2018

   @author: cow082

   python read_buffalo_xml_data.py

"""

def processXML(path="C:/Users/cow082/Desktop/roslin secondment documentation",
               infiles=("ERS2495902-ERS2495992.xml",
                        "ERS2495835-ERS2495900.xml",
                        "ERS2495773-ERS2495833.xml"),
               outfile="buffalo_sample_metadata.csv"):
    '''Build the buffalo sample metadata CSV from ENA sample XML downloads.

    Used to construct 'resources/buffalo_sample_metadata.csv'.

    By default it reads the following xml files:
    "ERS2495773-ERS2495833.xml", "ERS2495835-ERS2495900.xml",
    "ERS2495902-ERS2495992.xml"

    which are obtained from here (there is an xml download link on the page):
    https://www.ebi.ac.uk/ena/data/view/ERS2495773-ERS2495833
    https://www.ebi.ac.uk/ena/data/view/ERS2495835-ERS2495900
    https://www.ebi.ac.uk/ena/data/view/ERS2495902-ERS2495992

    Each input file is split on the literal text '<SAMPLE ' and every
    resulting chunk is turned into one CSV row (see _parse_sample). Rows are
    written in input order, joined by newlines, with no header line and no
    trailing newline.

    Parameters
    ----------
    path : str
        Directory that holds the input files and receives the output file.
    infiles : iterable of str
        Names of the XML files (relative to ``path``), processed in order.
    outfile : str
        Name of the CSV file written inside ``path``.

    Returns
    -------
    None

    Raises
    ------
    IOError / OSError
        If an input file cannot be read or the output cannot be written.
    '''
    outlines = []
    for f in infiles:
        # Context manager so the handle is closed even if reading fails.
        with open("%s/%s" % (path, f), 'r') as handle:
            contents = handle.read()
        # Element [0] is everything before the first sample (XML prologue).
        for chunk in contents.split('<SAMPLE ')[1:]:
            row = _parse_sample(chunk)
            if row is not None:
                outlines.append(row)

    with open('%s/%s' % (path, outfile), 'w') as F:
        F.write('\n'.join(outlines))


def _parse_sample(chunk):
    '''Return the CSV row for one '<SAMPLE ' chunk, or None if incomplete.

    The row is ``<primary id>,<title fields>``. The title text is split on
    commas; the last three comma-separated pieces are kept as they are and
    whatever precedes them is split on the text ' from an '. Every piece is
    stripped of surrounding whitespace before being re-joined with commas.

    When a chunk has several matching lines the last one wins. A chunk
    lacking either a <PRIMARY_ID> or a <TITLE> line yields None, so that a
    value left over from a previous sample can never end up in its row.
    '''
    name = None
    data = None
    for line in chunk.split('\n'):
        if '<PRIMARY_ID>' in line:
            name = line.split('<PRIMARY_ID>')[1].split('</PRIMARY_ID>')[0]
        elif '<TITLE>' in line:
            title = line.split('<TITLE>')[1].split('</TITLE>')[0]
            components = title.split(',')
            # Only the leading part of the title is split on ' from an ';
            # the trailing three comma-separated fields pass through as-is.
            pieces = ','.join(components[:-3]).split(' from an ') + components[-3:]
            data = ','.join([item.strip() for item in pieces])
    if name is None or data is None:
        return None
    return name + ',' + data


def main():
    '''Script entry point: build the sample metadata CSV via processXML().'''
    processXML()
    
    
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()