Commit a7acea6c authored by Ryan Herbert's avatar Ryan Herbert

Once again, do not package germlines.

parent fa718bc1
Pipeline #33114 failed with stages
in 24 seconds
......@@ -26,7 +26,6 @@ copy ./conf/align.cgi /usr/share/vidjil/browser/cgi/align.cgi
copy ./conf/similarity.cgi /usr/share/vidjil/browser/cgi/similarity.cgi
run cd /usr/share/vidjil/browser/css/icons && make
run cd /usr/share/vidjil/germline && make
arg build_env='PRODUCTION'
env BUILD_ENV $build_env
......
# Germline description used to build the browser's germline.js
DEFAULT_G=homo-sapiens.g
# Directories removed by `clean`
DIRS=homo-sapiens/ mus-musculus/ rattus-norvegicus/
GERMLINE_JS=../browser/js/germline.js

# Default goal: fetch the previously packaged germline archive
all: get-saved-data

germline: get-saved-data $(GERMLINE_JS)

js: $(GERMLINE_JS)

# $< is the germline description, $@ the generated JavaScript file
$(GERMLINE_JS): $(DEFAULT_G)
	python buildBrowserGermline.py $< $@

# Rebuild everything from the upstream sources, starting from a clean tree
get-all-data: clean
	sh get-germline
	python get-CD.py

# germline_id names the archive that get-saved-germline downloads
get-saved-data: germline_id
	sh get-saved-germline

clean:
	rm -rf $(DIRS) $(GERMLINE_JS)

# Compare the current directory with a fresh copy of the saved archive.
# `&&` (rather than `;`) keeps the script from running in the wrong
# directory if the `cd` fails.
diff-from-saved:
	rm -rf saved-germline
	mkdir saved-germline
	cd saved-germline && sh ../get-saved-germline
	echo
	diff -r -u -x "*[.][^f][^a]" -x "germline*" -x "get*" -x "Makefile" -x "saved-*" saved-germline/ .

# Package the germline files and germline.js into germline-<id>.tar.gz,
# created in the parent directory.
distrib: get-all-data js
	cd .. && tar cvzf germline-`cat germline/germline_id`.tar.gz germline/germline_id germline/*/*.fa germline/IMGT_RELEASE browser/js/germline.js

# None of these targets names a file that its recipe creates
.PHONY: all germline js get-all-data get-saved-data clean diff-from-saved distrib
import json
import sys
def get_required_files(germlines_data):
    '''
    Parse the germlines data and get all the files that are required by that
    file.

    germlines_data is the path of a JSON file with a 'path' entry and a
    'systems' mapping. Each system has a list of 'recombinations', whose
    optional '5', '4' and '3' entries list file names. Other keys of a
    recombination are ignored.

    The function returns a list of the files, each prefixed by '<path>/',
    in order of first appearance (uniqueness is guaranteed)
    '''
    # Close the description file as soon as it has been parsed
    with open(germlines_data, 'r') as description:
        g_json = json.load(description)

    path = g_json['path']
    germlines_json = g_json['systems']

    files = []
    seen = set()  # constant-time membership test; 'files' keeps the order
    for germline in germlines_json.keys():
        for recombination in germlines_json[germline]['recombinations']:
            for gene in ['5', '4', '3']:
                if gene in recombination:
                    for f in recombination[gene]:
                        f = path + '/' + f
                        if f not in seen:
                            seen.add(f)
                            files.append(f)
    return files
# Script body: read the germline description given as first argument, load
# every FASTA file it requires, and write a JavaScript file (second argument)
# defining 'germline' and 'germline_data'.
if len(sys.argv) != 3:
    print("Usage: %s <JSON/DATA germline file> <JSON output file>" % sys.argv[0])
    # Non-zero status, so that a caller such as make notices the misuse
    sys.exit(1)

data_file = sys.argv[1]
output_name = sys.argv[2]

# table[system][identifier] = sequence, where 'system' is the FASTA file
# name up to its first dot
table = {}
identifiant = ""
sequence = ""

germline_files = get_required_files(data_file)

for current_file in germline_files:
    try:
        fasta = open(current_file, "r")
    except IOError as e:
        # Re-raise the same exception type with a hint appended. This form
        # parses on both Python 2 and Python 3 (the three-expression raise
        # statement is a SyntaxError on Python 3).
        error = type(e)(str(e) + '\nDid you forget to run ``make\'\' in the germline directory?\n'
                        + 'Otherwise, please tell us about the problem at contact@vidjil.org')
        if hasattr(error, 'with_traceback'):
            raise error.with_traceback(sys.exc_info()[2])
        raise error

    system = current_file.split('/')[-1].split('.')[0]
    table[system] = {}

    # Start each file from a clean state, so that leading lines without a
    # header are not appended to the last sequence of the previous file
    identifiant = ""
    sequence = ""

    with fasta:
        for ligne in fasta:
            ligne = ligne.rstrip('\n\r')
            if ligne:
                if ligne[0] == '>':
                    # Header line: keep the second '|'-separated field if
                    # any, then drop everything from the first '_'
                    identifiant = ligne[1:]
                    if '|' in identifiant:
                        identifiant = identifiant.split('|')[1]
                    if '_' in identifiant:
                        identifiant = identifiant.split('_')[0]
                    sequence = ""
                else:
                    sequence += ligne
            if sequence:
                # If there is still some sequence left, this value will be overwritten in the next pass
                table[system][identifiant] = sequence

with open(output_name, "w") as output:
    output.write("germline = ")
    json.dump(table, output, indent=2, sort_keys=True)
    output.write("\n\n")
    output.write("germline_data = ")
    # The germline description is copied verbatim after the sequences
    with open(data_file, "r") as data:
        output.write(data.read())
import sys
# Complement of each supported nucleotide code (upper case only)
COMPLEMENT_NUCLEOTIDE = {
    'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
    'Y': 'R', 'R': 'Y', # pyrimidine (CT) / purine (AG)
    'W': 'S', 'S': 'W', # weak (AT) / strong (GC)
    'K': 'M', 'M': 'K', # keto (TG) / amino (AC)
    'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
    'N': 'N'
}

def revcomp(seq):
    '''Returns the reverse complement of a sequence

    Input letters are upper-cased before lookup, so the result is always
    upper case. A character with no entry in COMPLEMENT_NUCLEOTIDE is
    reported on stderr (one line per occurrence) and replaced by 'N'.

    >>> revcomp('ACGNTT')
    'AANCGT'
    '''
    rc = []
    for nucl in seq[::-1]:
        try:
            rc.append(COMPLEMENT_NUCLEOTIDE[nucl.upper()])
        except KeyError:
            # End the warning with a newline so that successive warnings
            # do not run together on a single line
            sys.stderr.write("! Unknown nucleotide : '%s' %s\n" % (nucl, seq))
            rc.append('N')
    return ''.join(rc)
def parse(fasta, endline=''):
    '''Generator over the records of a fasta source.

    'fasta' is any iterable of lines. Each line is stripped of surrounding
    whitespace; empty lines and lines starting with '#' are skipped.
    Yields (header, sequence) pairs, where the header has lost its leading
    '>' and the sequence is the concatenation of the record's lines, each
    followed by 'endline'.
    '''
    current_header = ''
    chunks = []

    for raw in fasta:
        line = raw.strip()
        if not line or line[0] == '#':
            continue

        if line[0] != '>':
            chunks.append(line + endline)
            continue

        # New header: flush the record collected so far, if there is one
        if current_header or chunks:
            yield (current_header, ''.join(chunks))
        current_header = line[1:]
        chunks = []

    # Flush the last record
    if current_header or chunks:
        yield (current_header, ''.join(chunks))
def extract_field_if_exists(s, separator, field_number):
    '''Returns the field at index 'field_number' (0-based) of 's' split on
    'separator', or 's' itself when it has too few fields.

    >>> extract_field_if_exists('a|b|c', '|', 1)
    'b'
    >>> extract_field_if_exists('abc', '|', 1)
    'abc'
    '''
    fields = s.split(separator)
    if len(fields) > field_number:
        return fields[field_number]
    # Fall back on the input string (not on the builtin type 'str')
    return s
def parse_as_Fasta(fasta):
    '''Same iteration as parse(), but yields one Fasta object per record.'''
    for record in parse(fasta):
        header, sequence = record
        yield Fasta(header, sequence)
class Fasta():
    '''One fasta record: a header (without the leading '>') and a sequence.'''

    def __init__(self, header, sequence):
        self.header, self.seq = header, sequence

    def revcomp(self):
        '''Replaces the stored sequence by the result of the module-level revcomp().'''
        reverse_complemented = revcomp(self.seq)
        self.seq = reverse_complemented

    @property
    def name(self):
        '''Header field at index 1 when split on '|' (see extract_field_if_exists).'''
        name_field = extract_field_if_exists(self.header, '|', 1)
        return name_field

    @property
    def species(self):
        '''Header field at index 2 when split on '|' (see extract_field_if_exists).'''
        species_field = extract_field_if_exists(self.header, '|', 2)
        return species_field

    def __len__(self):
        '''Length of the stored sequence.'''
        return len(self.seq)

    def __str__(self):
        '''The record as fasta text: '>' + header, then the sequence, each on its own line.'''
        parts = (self.header, self.seq)
        return '>%s\n%s\n' % parts
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Get from NCBI CD sequences from the HCDM database (hcdm.org), as exported by HGNC (genenames.org)'''
import urllib
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'

# Columns requested from the download service, in this order
HUGO_COLS = ''.join('&col=' + col for col in [
    'gd_hgnc_id', 'md_refseq_id', 'gd_other_ids_list',
    'gd_app_sym', 'gd_app_name', 'gd_status',
    'gd_prev_sym', 'gd_aliases', 'gd_pub_chrom_map',
    'gd_pub_acc_ids', 'gd_pub_refseq_ids',
])

# HUGO query on 'hcdm.org' entries
HUGO_QUERY_HCDM = (
    '&status=Approved&status=Entry+Withdrawn&status_opt=2'
    '&where=gd_other_ids+LIKE+%27%25hcdm.org%25%27'
    '&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit'
)

HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM

# URL template for fetching one nuccore entry as fasta text; '%s' is the entry id
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text&id=%s'

# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [
    'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b', 'CD11a', 'CD11b',
    'CD14', 'CD15', 'CD16a', 'CD16b', 'CD19', 'CD20', 'CD22', 'CD24',
    'CD25', 'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61',
    'CD91', 'CD114', 'CD117', 'CD182',
]

# Output files: every CD sequence, and the SORTING_CD subset
OUT = 'homo-sapiens/CD.fa'
SORTING_OUT = 'homo-sapiens/CD-sorting.fa'
# Announce, then open (truncating), the two output files
print("==> %s" % OUT)
out = open(OUT, 'w')

print("==> %s" % SORTING_OUT)
sorting_out = open(SORTING_OUT, 'w')
def ncbi_and_write(ncbi, hugo, cd_id, outs):
    '''Download the entry 'ncbi' through NCBI_API and write it to each file of 'outs'.

    Every '>' of the downloaded text is replaced by '>hugo|cd_id|' before
    writing. The three identifiers are printed first, as a progress trace.
    '''
    print("%s %s %s" % (cd_id, hugo, ncbi))

    url = NCBI_API % ncbi
    fasta = urllib.urlopen(url).read()

    header_prefix = '>%s|%s|' % (hugo, cd_id)
    fasta_with_id = fasta.replace('>', header_prefix)

    for handle in outs:
        handle.write(fasta_with_id)
# Process the tab-separated export line by line: the first three columns are
# read as (hugo id, ncbi id, list of other ids), and the CD id is taken as
# the third comma-separated item of that list.
for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
    ll = l.split('\t')
    try:
        hugo, ncbi, ids = ll[0], ll[1], ll[2]
        cd_id = ids.split(',')[2].strip()
    except IndexError:
        # Fewer than three columns, or fewer than three ids: report the
        # line and skip it. Only IndexError can come from the lookups above,
        # so nothing else is silently swallowed.
        print("! %s" % l)
        continue

    # Every entry goes to the main output; SORTING_CD entries also go to the sorting output
    ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))

# Flush and release both output files explicitly
out.close()
sorting_out.close()
#!/bin/sh
# Download the IMGT reference sequences (piped into split-from-imgt.py), the
# IMGT release file, and two extra IGK files from vidjil.org.

# Stop at the first failing command, as get-saved-germline does.
# (In the pipeline below, only the status of the last command is checked.)
set -e

# Work from the directory holding this script; quoted so that a path
# containing spaces is handled correctly.
cd "$(dirname "$0")"

wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP | python split-from-imgt.py
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-INTRON.fa
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-KDE.fa
#!/bin/sh
# Download the germline archive whose id is stored in the germline_id file
# next to this script, and unpack it.
set -e

# Directory holding this script (and germline_id); expansions are quoted so
# that paths containing spaces are handled correctly.
dir=$(dirname "$0")
germline_id=$(cat "$dir/germline_id")

cat <<EOF
By downloading the IMGT germline databases (IMGT/GENE-DB), you
accept the IMGT® license: academic research only, provided that it is referred
to IMGT® and cited as "IMGT®, the international ImMunoGeneTics information
system® http://www.imgt.org (founder and director: Marie-Paule Lefranc,
Montpellier, France). Lefranc, M.-P., IMGT®, the international ImMunoGeneTics
database, Nucl. Acids Res., 29, 207-209 (2001). PMID:11125093.
Otherwise please remove the files from your system.
EOF

wget -N "http://www.vidjil.org/germlines/germline-${germline_id}.tar.gz"
# Unpack the germline/ members into the current directory
tar xzmfv "germline-${germline_id}.tar.gz" --strip-components=1 germline/
# Unpack the browser/ members into ../browser relative to this script;
# '|| true' keeps going if that extraction fails
tar xzmfv "germline-${germline_id}.tar.gz" --strip-components=1 -C "$dir/../browser/" browser/ || true
{
"ref": "http://www.vidjil.org/germlines/germline-46.tar.gz",
"species": "Homo sapiens",
"species_taxon_id": 9606,
"path": "homo-sapiens",
"systems": {
"CD": {
"shortcut": "I",
"color" : "#ffffff",
"description": "CD",
"recombinations": [ { "1": ["CD-sorting.fa"] } ],
"parameters": {
"seed": "13s"
}
}
}
}
{
"ref": "http://www.vidjil.org/germlines/germline-46.tar.gz",
"species": "Homo sapiens",
"species_taxon_id": 9606,
"path": "homo-sapiens",
"systems": {
"IgVC": {
"shortcut": "C",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",
"IGHC=A2.fa",
"IGHC=GP.fa"
]
} ],
"parameters": {
"seed": "12s"
}
},
"IgJC": {
"shortcut": "c",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",
"IGHC=A2.fa",
"IGHC=GP.fa"
]
} ],
"parameters": {
"seed": "12s"
}
}
}
}
{
"ref": "http://www.vidjil.org/germlines/germline-52.tar.gz",
"species": "Homo sapiens",
"species_taxon_id": 9606,
"path": "homo-sapiens",
"systems": {
"TRA": {
"shortcut": "A",
"color" : "#268bd2",
"description": "Human T-cell receptor, alpha locus (14q11.2)",
"recombinations": [ {
"5": ["TRAV.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRB": {
"shortcut": "B",
"color" : "#cb4b16",
"description": "Human T-cell receptor, beta locus (7q34)",
"recombinations": [ {
"5": ["TRBV.fa"],
"4": ["TRBD.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRB+": {
"shortcut": "b",
"color" : "#eb6b36",
"description": "Human T-cell receptor, beta locus (7q34), incomplete Db-Jb recombinations",
"follows": "TRB",
"recombinations": [ {
"5": ["TRBD+up.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRG": {
"shortcut": "G",
"color" : "#dc322f",
"description": "Human T-cell receptor, gamma locus (7p14)",
"recombinations": [ {
"5": ["TRGV.fa"],
"3": ["TRGJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRD": {
"shortcut": "D",
"color" : "#b58900",
"description": "Human T-cell receptor, delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRA+D": {
"shortcut": "a",
"color" : "#46abf2",
"description": "Human T-cell receptor, alpha/delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRAJ.fa"]
}, {
"5": ["TRDD+up.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRD+": {
"shortcut": "d",
"color" : "#d5a920",
"description": "Human T-cell receptor, delta locus (14q11.2), incomplete Dd2-Dd3 recombinations",
"follows": "TRD",
"recombinations": [ {
"5": ["TRDV.fa"],
"3": ["TRDD3+down.fa"]
}, {
"5": ["TRDD2+up.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
}, {
"5": ["TRDD2+up.fa"],
"3": ["TRDD3+down.fa"]
} ],
"parameters": {
"seed": "9s"
}
},
"IGH": {
"shortcut": "H",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33)",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHD.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGH+": {
"shortcut": "h",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), incomplete Dh-Jh recombinations",
"follows": "IGH",
"recombinations": [ {
"5": ["IGHD+up.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGK": {
"shortcut": "K",
"color" : "#2aa198",
"description": "Human immunoglobulin, kappa locus (2p11.2)",
"recombinations": [ {
"5": ["IGKV.fa"],
"3": ["IGKJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGK+": {
"shortcut": "k",
"color" : "#4ac1a8",
"description": "Human immunoglobulin, kappa locus (2p11.2), Vk-KDE and Intron-KDE recombinations",
"follows": "IGK",
"recombinations": [ {
"5": ["IGKV.fa", "IGK-INTRON.fa"],
"3": ["IGK-KDE.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGL": {
"shortcut": "L",
"color" : "#d33682",
"description": "Human immunoglobulin, lambda locus (22q11.2)",
"recombinations": [ {
"5": ["IGLV.fa"],
"3": ["IGLJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
}
}
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
>J00220|IGHA1*01|Homo sapiens|F|CH1|g,142..446|306 nt|1|+1|-1| | |306+69=375| | |
............gcatccccgaccagccccaaggtcttcccgctgagcctctgcagcacc
cagcca............gatgggaacgtggtcatcgcctgcctggtccagggcttcttc
ccccag...gagccactcagtgtgacctggagcgaaagcggacagggcgtg.........
...accgccagaaacttcccacccagccaggatgcctccggg...............gac
ctgtacaccacgagcagccagctgaccctgccggccacacagtgc......ctagccggc
aagtccgtgacatgccacgtgaagcactac.........acgaatcccagccaggatgtg
actgtgccctgccca
>J00220|IGHA1*01|Homo sapiens|F|H-CH2|g,662..1020|360 nt|1|+1|-1| | |360+72=432| | |
gttccctcaactccacctaccccatctccctcaactccacctaccccatctccctca...
...............tgctgccacccccgactgtcactgcaccgaccggccctcgaggac
ctgctctta......ggttcagaagcgaacctcacgtgcacactgaccggcctgagagat
gcc...tcaggtgtcaccttcacctggacgccctcaagtgggaagagc............
...gctgttcaaggaccacctgagcgtgacctctgtggc..................tgc
tacagcgtgtccagtgtcctgccgggctgtgccgagccatgg......aaccatgggaag
accttcacttgcactgctgcctaccccgag......tccaagaccccgctaaccgccacc
ctctcaaaatcc
>J00220|IGHA1*01|Homo sapiens|F|CH3-CHS|g,1244..1635|393 nt|1|+1| | | |393+54=447| | |
............ggaaacacattccggcccgaggtccacctgctgccgccgccgtcggag
gagctggccctg......aacgagctggtgacgctgacgtgcctggcacgcggcttcagc
ccc......aaggacgtgctggttcgctggctgcaggggtcacaggagctgccccgcgag
...aagtacctgacttgggcatcccggcaggagcccagccagggc.........accacc
accttcgctgtgaccagcatactgcgcgtggcagccgaggactgg......aagaagggg
gacaccttctcctgcatggtgggccacgaggcc...ctgccgctggccttcacacagaag
accatcgaccgcttggcg.........ggtaaacccacccatgtcaatgtgtctgttgtc
atggcggaggtggacggcacctgctac
>M60193|IGHA1*01|Homo sapiens|F|M|g,226..437|213 nt|1|+1| | | |213+0=213| | |
ggctcttgctctgttgcagattggcagatgccgcctccctatgtggtgctggacttgccg
caggagaccctggaggaggagacccccggcgccaacctgtggcccaccaccatcaccttc
ctcaccctcttcctgctgagcctgttctatagcacagcactgaccgtgaccagcgtccgg
ggcccatctggcaacagggagggcccccagtac
>AL928768|IGHA1*02|Homo sapiens|F|H-CH2|g,13..371|360 nt|1|+1|-1| | |360+72=432| | |
gttccctcaactccacctaccccatctccctcaactccacctaccccatctccctca...
...............tgctgccacccccgactgtcactgcaccgaccggccctcgaggac
ctgctctta......ggttcagaagcgaacctcacgtgcacactgaccggcctgagagat
gcc...tcaggtgtcaccttcacctggacgccctcaagtgggaagagc............
...gctgttcaaggaccacctgagcgtgacctctgtggc..................tgc
tacagcgtgtccagtgtcctgccgggctgtgccgagccatgg......aaccatgggaag
accttcacttgcactgctgcctaccccgag......tccaagaccccgctaaccgccacc
ctctcaaaatcc
>AL928768|IGHA1*02|Homo sapiens|F|CH3-CHS|g,595..986|393 nt|1|+1| | | |393+54=447| | |
............ggaaacacattccggcccgaggtccacctgctgccgccgccgtcggag
gagctggccctg......aacgagctggtgacgctgacgtgcctggcacgcggcttcagc
ccc......aaggatgtgctggttcgctggctgcaggggtcacaggagctgccccgcgag
...aagtacctgacttgggcatcccggcaggagcccagccagggc.........accacc
accttcgctgtgaccagcatactgcgcgtggcagccgaggactgg......aagaagggg
gacaccttctcctgcatggtgggccacgaggcc...ctgccgctggccttcacacagaag
accatcgaccgcttggcg.........ggtaaacccacccatgtcaatgtgtctgttgtc
atggcggaggtggacggcacctgctac
>AL928768|IGHA1*02|Homo sapiens|F|M|g,3555..3766|213 nt|1|+1| | | |213+0=213| | |
ggctcttgctgtgttgcagattggcagatgccgcctccctatgtggtgctggacttgccg
caggagaccctggaggaggagacccccggcgccaacctgtggcccaccaccatcaccttc
ctcaccctcttcctgctgagcctgttctatagcacagcactgaccgtgaccagcgtccgg
ggcccatctggcaacagggagggcccccagtac
>J00221|IGHA2*01|Homo sapiens|F|CH1|g,164..468|306 nt|1|+1|-1| | |306+69=375| | |
............gcatccccgaccagccccaaggtcttcccgctgagcctcgacagcacc
ccccaa............gatgggaacgtggtcgtcgcatgcctggtccagggcttcttc
ccccag...gagccactcagtgtgacctggagcgaaagcggacagaacgtg.........
...accgccagaaacttcccacctagccaggatgcctccggg...............gac
ctgtacaccacgagcagccagctgaccctgccggccacacagtgc......ccagacggc
aagtccgtgacatgccacgtgaagcactac.........acgaatcccagccaggatgtg
actgtgccctgccca
>J00221|IGHA2*01|Homo sapiens|F|H-CH2|g,684..1003|321 nt|1|+1|-1| | |321+72=393| | |
gttcccccacctccccca..................tgctgccacccccgactgtcgctg
caccgaccggccctcgaggacctgctctta......ggttcagaagcgaacctcacgtgc
acactgaccggcctgagagatgcc...tctggtgccaccttcacctggacgccctcaagt
gggaagagc...............gctgttcaaggaccacctgagcgtgacctctgtggc
..................tgctacagcgtgtccagtgtcctgcctggctgtgcccagcca
tgg......aaccatggggagaccttcacctgcactgctgcccaccccgag......ttg
aagaccccactaaccgccaacatcacaaaatcc
>J00221|IGHA2*01|Homo sapiens|F|CH3-CHS|g,1227..1618|393 nt|1|+1| | | |393+54=447| | |
............ggaaacacattccggcccgaggtccacctgctgccgccgccgtcggag
gagctggccctg......aacgagctggtgacgctgacgtgcctggcacgtggcttcagc
ccc......aaggatgtgctggttcgctggctgcaggggtcacaggagctgccccgcgag
...aagtacctgacttgggcatcccggcaggagcccagccagggc.........accacc
accttcgctgtgaccagcatactgcgcgtggcagccgaggactgg......aagaagggg
gacaccttctcctgcatggtgggccacgaggcc...ctgccgctggccttcacacagaag
accatcgaccgcttggcg.........ggtaaacccacccatgtcaatgtgtctgttgtc
atggcggaggtggacggcacctgctac
>S71043|IGHA2*03|Homo sapiens|F|CH1|g,107..411|306 nt|1|+1|-1| | |306+69=375| | |
............gcatccccgaccagccccaaggtcttcccgctgagcctcgacagcacc
ccccaa............gatgggaacgtggtcgtcgcatgcctggtccagggcttcttc
ccccag...gagccactcagtgtgacctggagcgaaagcggacagaacgtg.........