......@@ -2,9 +2,11 @@
cd $(dirname $0)
wget -N
wget -N
wget -N
wget -N
python IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList 2> $errors
python IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP IMGTGENEDB-GeneList 2> $errors
......@@ -309,12 +309,18 @@ class IMGTGENEDBGeneList():
def split_IMGTGENEDBReferenceSequences(f, gene_list):
def split_IMGTGENEDBReferenceSequences(sources, gene_list):
downstream_data = OrderedDefaultListDict()
upstream_data = OrderedDefaultListDict()
for l in open(ReferenceSequences):
processed_keys = []
for source in sources:
print('<== %s' % source)
for l in open(source):
# New sequence: compute 'current_files' and store up/downstream_data[]
......@@ -328,6 +334,13 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
if species in SPECIES and feature in FEATURES:
seq = l.split('|')[1]
# Check whether this sequence was already retrieved from a previous source
key = '%s %s %s' % (species, seq, feature)
if key in processed_keys:
path = SPECIES[species]
if feature in FEATURES_VDJ:
......@@ -403,9 +416,10 @@ if __name__ == '__main__':
ReferenceSequences = sys.argv[1]
GeneList = sys.argv[2]
ReferenceSequencesInframe = sys.argv[1]
ReferenceSequencesAll = sys.argv[2]
GeneList = sys.argv[3]
gl = IMGTGENEDBGeneList(GeneList)
split_IMGTGENEDBReferenceSequences(ReferenceSequences, gl)
split_IMGTGENEDBReferenceSequences([ReferenceSequencesInframe, ReferenceSequencesAll], gl)
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/IGKV.fa)
$ Genes that are not inframe are present
1: IGKV1-22.01
1: IGKV2/OR2-4.01
$ Inframe genes are present only once
1: IGKV1.13.01
1: IGKV1D.13.01
\ No newline at end of file
