# buildBrowserGermline.py
import json
import sys

def get_required_files(germlines_data):
    '''
    Parse the germlines data and get all the files that are required by that
    file.

    germlines_data is the path of a JSON file.  The code reads its 'path'
    entry (used as a prefix for every file name) and its 'systems' entry;
    for each system it walks the 'recombinations' list and collects the
    file names found under the '5', '4' and '3' keys, when present.

    The function returns a list of the files, each one being
    path + '/' + name (uniqueness is guaranteed, order of first appearance
    is kept).

    Raises IOError if the file cannot be opened, ValueError if it is not
    valid JSON, and KeyError if 'path', 'systems' or 'recombinations' is
    missing.
    '''
    # Use a context manager so the handle is closed as soon as it is parsed.
    with open(germlines_data, 'r') as handle:
        g_json = json.load(handle)
    path = g_json['path']
    germlines_json = g_json['systems']

    files = []
    # `seen` gives a constant-time duplicate check; `files` keeps the order.
    seen = set()
    for germline in germlines_json:
        for recombination in germlines_json[germline]['recombinations']:
            for gene in ['5', '4', '3']:
                if gene not in recombination:
                    continue
                for f in recombination[gene]:
                    f = path + '/' + f
                    if f not in seen:
                        seen.add(f)
                        files.append(f)
    return files


# The script takes exactly two arguments: the germline description file
# (read) and the output file (written).
if len(sys.argv) != 3:
    # Report misuse on stderr and exit with a non-zero status, so that a
    # calling Makefile or shell script notices the failure instead of
    # carrying on as if the output had been produced.
    sys.stderr.write("Usage: %s <JSON/DATA germline file> <JSON output file>\n" % sys.argv[0])
    sys.exit(1)

data_file = sys.argv[1]
output_name = sys.argv[2]


def _short_identifier(header):
    '''
    Reduce a FASTA header (without the leading '>') to the key used in the
    table: the second '|'-separated field when the header contains a '|',
    then whatever precedes the first '_' when the result contains one.
    '''
    if '|' in header:
        header = header.split('|')[1]
    if '_' in header:
        header = header.split('_')[0]
    return header


def _read_fasta_records(lines):
    '''
    Build a {identifier: sequence} dictionary from the lines of a FASTA file.

    Blank lines are skipped, the lines of one record are concatenated, and
    records without any sequence are not stored.  Sequence lines found
    before the first header are stored under the empty identifier.  When an
    identifier occurs several times, the last non-empty record wins.
    '''
    records = {}
    # The state is local to one call, so nothing read from a previous file
    # can end up in the records of the next one.
    identifier = ""
    chunks = []
    for line in lines:
        line = line.rstrip('\n\r')
        if not line:
            continue
        if line[0] == '>':
            if chunks:
                records[identifier] = "".join(chunks)
            identifier = _short_identifier(line[1:])
            chunks = []
        else:
            chunks.append(line)
    # Store the last record of the file.
    if chunks:
        records[identifier] = "".join(chunks)
    return records


# table[system][identifier] = sequence, one system per germline file.
table = {}

germline_files = get_required_files(data_file)

for current_file in germline_files:
    try:
        fasta = open(current_file, "r")
    except IOError:
        # Print the hint, then re-raise the original exception untouched.
        # A bare ``raise'' is valid in both Python 2 and Python 3 (the
        # three-expression raise is Python 2 only) and keeps the traceback,
        # errno and filename of the original error.
        sys.stderr.write('Did you forget to run ``make\'\' in the germline directory?\n'
                         'Otherwise, please tell us about the problem at contact@vidjil.org\n')
        raise

    # The system name is the file name up to its first dot.
    system = current_file.split('/')[-1].split('.')[0]

    # The context manager closes the file even if parsing fails.
    with fasta:
        table[system] = _read_fasta_records(fasta)


# Write two assignments to the output file: ``germline = <table as JSON>''
# followed by ``germline_data = <verbatim content of the input data file>''.
# The handle is not called ``file'' to avoid shadowing the Python 2 builtin.
with open(output_name, "w") as output:
    output.write("germline = ")
    json.dump(table, output, indent=2, sort_keys=True)

    output.write("\n\n")
    output.write("germline_data = ")
    # Close the input handle as soon as it has been copied.
    with open(data_file, "r") as data:
        output.write(data.read())