Commit 0e9493a7 authored by Mikaël Salson

Merge branch 'feature-ga/3515-imgt-all' into 'dev'

Feature ga/3515 imgt all

See merge request !372
parents 005d00b0 895c9480
Pipeline #54809 passed with stages
in 8 minutes and 35 seconds
......@@ -2,9 +2,11 @@
cd $(dirname $0)
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP
errors=$(tempfile)
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList 2> $errors
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP IMGTGENEDB-GeneList 2> $errors
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
......
......@@ -309,12 +309,18 @@ class IMGTGENEDBGeneList():
def split_IMGTGENEDBReferenceSequences(f, gene_list):
def split_IMGTGENEDBReferenceSequences(sources, gene_list):
downstream_data = OrderedDefaultListDict()
upstream_data = OrderedDefaultListDict()
for l in open(ReferenceSequences):
processed_keys = []
for source in sources:
print()
print()
print('<== %s' % source)
for l in open(source):
# New sequence: compute 'current_files' and store up/downstream_data[]
......@@ -328,6 +334,13 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
if species in SPECIES and feature in FEATURES:
seq = l.split('|')[1]
# Check whether this sequence was already retrieved from a previous source
key = '%s %s %s' % (species, seq, feature)
if key in processed_keys:
continue
processed_keys.append(key)
path = SPECIES[species]
if feature in FEATURES_VDJ:
......@@ -403,9 +416,10 @@ if __name__ == '__main__':
else:
print (IMGT_LICENSE)
ReferenceSequences = sys.argv[1]
GeneList = sys.argv[2]
ReferenceSequencesInframe = sys.argv[1]
ReferenceSequencesAll = sys.argv[2]
GeneList = sys.argv[3]
gl = IMGTGENEDBGeneList(GeneList)
split_IMGTGENEDBReferenceSequences(ReferenceSequences, gl)
split_IMGTGENEDBReferenceSequences([ReferenceSequencesInframe, ReferenceSequencesAll], gl)
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/IGKV.fa)
$ Genes that are not inframe are present
1: IGKV1-22.01
1: IGKV2/OR2-4.01
$ Inframe genes are present only once
1: IGKV1.13.01
1: IGKV1D.13.01
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment