Commit 0e9493a7 authored by Mikaël Salson

Merge branch 'feature-ga/3515-imgt-all' into 'dev'

Feature ga/3515 imgt all

See merge request !372
parents 005d00b0 895c9480
Pipeline #54809 passed with stages
in 8 minutes and 35 seconds
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
cd $(dirname $0) cd $(dirname $0)
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP
errors=$(tempfile) errors=$(tempfile)
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList 2> $errors python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP IMGTGENEDB-GeneList 2> $errors
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
......
...@@ -309,12 +309,18 @@ class IMGTGENEDBGeneList(): ...@@ -309,12 +309,18 @@ class IMGTGENEDBGeneList():
def split_IMGTGENEDBReferenceSequences(f, gene_list): def split_IMGTGENEDBReferenceSequences(sources, gene_list):
downstream_data = OrderedDefaultListDict() downstream_data = OrderedDefaultListDict()
upstream_data = OrderedDefaultListDict() upstream_data = OrderedDefaultListDict()
for l in open(ReferenceSequences): processed_keys = []
for source in sources:
print()
print()
print('<== %s' % source)
for l in open(source):
# New sequence: compute 'current_files' and stores up/downstream_data[] # New sequence: compute 'current_files' and stores up/downstream_data[]
...@@ -328,6 +334,13 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list): ...@@ -328,6 +334,13 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
if species in SPECIES and feature in FEATURES: if species in SPECIES and feature in FEATURES:
seq = l.split('|')[1] seq = l.split('|')[1]
# Check whether this sequence was already retrieved from a previous source
key = '%s %s %s' % (species, seq, feature)
if key in processed_keys:
continue
processed_keys.append(key)
path = SPECIES[species] path = SPECIES[species]
if feature in FEATURES_VDJ: if feature in FEATURES_VDJ:
...@@ -403,9 +416,10 @@ if __name__ == '__main__': ...@@ -403,9 +416,10 @@ if __name__ == '__main__':
else: else:
print (IMGT_LICENSE) print (IMGT_LICENSE)
ReferenceSequences = sys.argv[1] ReferenceSequencesInframe = sys.argv[1]
GeneList = sys.argv[2] ReferenceSequencesAll = sys.argv[2]
GeneList = sys.argv[3]
gl = IMGTGENEDBGeneList(GeneList) gl = IMGTGENEDBGeneList(GeneList)
split_IMGTGENEDBReferenceSequences(ReferenceSequences, gl) split_IMGTGENEDBReferenceSequences([ReferenceSequencesInframe, ReferenceSequencesAll], gl)
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/IGKV.fa)
$ Genes that are not inframe are present
1: IGKV1-22.01
1: IGKV2/OR2-4.01
$ Inframe genes are present only once
1: IGKV1.13.01
1: IGKV1D.13.01
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment