Commit a7acea6c authored by Ryan Herbert's avatar Ryan Herbert
Browse files

Once again, so not package germlines.

parent fa718bc1
Pipeline #33114 failed with stages
in 24 seconds
......@@ -26,7 +26,6 @@ copy ./conf/align.cgi /usr/share/vidjil/browser/cgi/align.cgi
copy ./conf/similarity.cgi /usr/share/vidjil/browser/cgi/similarity.cgi
run cd /usr/share/vidjil/browser/css/icons && make
run cd /usr/share/vidjil/germline && make
arg build_env='PRODUCTION'
env BUILD_ENV $build_env
......
DEFAULT_G=homo-sapiens.g
DIRS=homo-sapiens/ mus-musculus/ rattus-norvegicus/
GERMLINE_JS=../browser/js/germline.js
all: get-saved-data
germline: get-saved-data $(GERMLINE_JS)
js: $(GERMLINE_JS)
$(GERMLINE_JS): $(DEFAULT_G)
python buildBrowserGermline.py $(DEFAULT_G) $@
get-all-data: clean
sh get-germline
python get-CD.py
get-saved-data: germline_id
sh get-saved-germline
clean:
rm -rf $(DIRS) $(GERMLINE_JS)
diff-from-saved:
rm -rf saved-germline
mkdir saved-germline
cd saved-germline ; sh ../get-saved-germline
echo
diff -r -u -x "*[.][^f][^a]" -x "germline*" -x "get*" -x "Makefile" -x "saved-*" saved-germline/ .
distrib: get-all-data js
cd .. ; tar cvzf germline-`cat germline/germline_id`.tar.gz germline/germline_id germline/*/*.fa germline/IMGT_RELEASE browser/js/germline.js
.PHONY: all germline js get-all-data clean diff-from-saved
import json
import sys
def get_required_files(germlines_data):
'''
Parse the germlines data and get all the files that are required by that
file.
The function returns a list of the files (uniqueness is guaranteed)
'''
g_json = json.load(open(germlines_data, 'r'))
path = g_json['path']
germlines_json = g_json['systems']
files = []
for germline in germlines_json.keys():
for recombination in germlines_json[germline]['recombinations']:
for gene in ['5', '4', '3']:
if gene in recombination:
for f in recombination[gene]:
f = path + '/' + f
if f not in files:
files.append(f)
return files
if len(sys.argv) != 3:
print("Usage: %s <JSON/DATA germline file> <JSON output file>" % sys.argv[0])
sys.exit()
data_file = sys.argv[1]
output_name = sys.argv[2]
table = {}
identifiant = ""
sequence = ""
germline_files = get_required_files(data_file)
for current_file in germline_files:
try:
fasta = open(current_file, "r")
except IOError as e:
raise type(e),\
type(e)(str(e) + '\nDid you forget to run ``make\'\' in the germline directory?\n'\
+'Otherwise, please tell us about the problem at contact@vidjil.org'),\
sys.exc_info()[2]
system = current_file.split('/')[-1].split('.')[0]
table[system] = {}
for ligne in fasta :
ligne = ligne.rstrip('\n\r')
if ligne:
if ligne[0]=='>' :
identifiant=ligne[1:]
if '|' in identifiant:
identifiant = identifiant.split('|')[1]
if '_' in identifiant:
identifiant = identifiant.split('_')[0]
sequence = ""
else :
sequence+=ligne
if sequence:
# If there is still some sequence left, this value will be overwritten in the next pass
table[system][identifiant]=sequence
fasta.close()
with open(output_name, "w") as file :
file.write("germline = ")
json.dump(table, file, indent=2, sort_keys=True)
data = open(data_file, "r")
file.write( "\n\n" )
file.write("germline_data = ")
file.write( data.read() )
import sys
COMPLEMENT_NUCLEOTIDE = {
'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
'Y': 'R', 'R': 'Y', # pyrimidine (CT) / purine (AG)
'W': 'S', 'S': 'W', # weak (AT) / strong (GC)
'K': 'M', 'M': 'K', # keto (TG) / amino (AC)
'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
'N': 'N'
}
def revcomp(seq):
'''Returns the reverse complement of a sequence
>>> revcomp('ACGNTT')
'AANCGT'
'''
rc = ''
for nucl in seq[::-1]:
try:
rc += COMPLEMENT_NUCLEOTIDE[nucl.upper()]
except KeyError:
sys.stderr.write("! Unknown nucleotide : '%s' " % nucl + seq)
rc += 'N'
return rc
def parse(fasta, endline=''):
'''Iterates over sequences in a fasta files, yielding (header, sequence) pairs'''
header = ''
sequence = ''
for l in fasta:
l = l.strip()
if not l:
continue
if l[0] == '#':
continue
if l[0] == '>':
if header or sequence:
yield (header, sequence)
header = l[1:]
sequence = ''
else:
sequence += l + endline
if header or sequence:
yield (header, sequence)
def extract_field_if_exists(s, separator, field_number):
fields = s.split(separator)
if len(fields) > field_number:
return fields[field_number]
return str
def parse_as_Fasta(fasta):
for (header, sequence) in parse(fasta):
yield Fasta(header, sequence)
class Fasta():
def __init__(self, header, sequence):
self.header = header
self.seq = sequence
def revcomp(self):
self.seq = revcomp(self.seq)
@property
def name(self):
return extract_field_if_exists(self.header, '|', 1)
@property
def species(self):
return extract_field_if_exists(self.header, '|', 2)
def __len__(self):
return len(self.seq)
def __str__(self):
return '>%s\n%s\n' % (self.header, self.seq)
'''Generates artificial VJ recombinations'''
from __future__ import print_function
import json
import fasta
import random
import argparse
random.seed(33328778554)
class vdj_repertoire:
'''
The class generates recombinations among a set of sequences
'''
labels = []
sequences = []
def __init__(self, labels = None, repertoire = None):
'''
repertoire should be a list of dictionary of a list of Fasta sequences
'''
if repertoire is not None:
self.sequences = repertoire
self.labels = labels
@classmethod
def files(self, labels, repertoire):
'''
Provide a list of list of sequences
'''
sequences = []
for filenames in repertoire:
current_sequences = []
for filename in filenames:
current_sequences += list(fasta.parse_as_Fasta(open(filename)))
sequences.append(current_sequences)
return self(labels, sequences)
def germlines(self):
return self.labels
def nb_sequences(self, label):
'''
>>> rep = vdj_repertoire(['a', 'b'], [['aat', 'taa'], ['gcc']])
>>> rep.nb_sequences('a')
2
>>> rep.nb_sequences('b')
1
>>> rep.nb_sequences('c')
Traceback (most recent call last):
...
ValueError: 'c' is not in list
'''
index = self.labels.index(label)
return len(self.sequences[index])
def recombinations(self, at_most = None):
'''
Returns a list of recombinations.
The recombinations are given under the form of a list.
>>> [v for v in vdj_repertoire(['a', 'b'], [['aat', 'taa'],\
['gcc']]).recombinations()]
[['aat', 'gcc'], ['taa', 'gcc']]
>>> len([v for v in vdj_repertoire(['a', 'b'], [['aat', 'taa'],\
['gcc']]).recombinations(1)])
1
'''
if at_most is not None:
return self._at_most_recombinations_(at_most)
else:
return self._all_recombinations_()
def _at_most_recombinations_(self, at_most):
nb = 0
while nb < at_most:
recombination = []
for current_rep in self.sequences:
recombination.append(random.choice(current_rep))
yield recombination
nb += 1
def _all_recombinations_(self):
return list_recombinations(self.sequences)
def list_recombinations(l):
'''
>>> [i for i in list_recombinations([[1], [10, 11], [100, 102]])]
[[1, 10, 100], [1, 11, 100], [1, 10, 102], [1, 11, 102]]
>>> [i for i in list_recombinations([[1, 2], [3]])]
[[1, 3], [2, 3]]
'''
if len(l) <= 0:
yield []
else:
for item in l[len(l)-1]:
for recomb in list_recombinations(l[:-1]):
recomb.append(item)
yield recomb
class vdj_recombination:
deletions = None
insertions = None
processing = []
def __init__(self, insertions = None, deletions = None, processing = None):
'''insertions and deletions are lists of length 1 or whose length are the
number of locations where insertions and deletions take place. They
contain a function with no parameter returning a natural integer
corresponding to the number of expected insertions/deletions.
processing is a list of the same size which contains a function which
is applied to a string and which returns a string. It is used to alter
the input sequence.
'''
if insertions is not None:
self.insertions = insertions
else:
self.insertions = [(lambda: 0)]
if deletions is not None:
self.deletions = deletions
else:
self.deletions = [(lambda: 0)]
if processing is not None:
self.processing = processing
else:
self.processing = [(lambda s: s)]
def recombine(self, sequences):
'''
Recombine the sequences with the provided recombinations
>>> str(vdj_recombination().recombine([fasta.Fasta('a', 'AATTAT'),\
fasta.Fasta('b', 'GGGACACAT'),\
fasta.Fasta('c', 'ATAGATATGA')]))
'>a 0//0 b 0//0 c\\nAATTAT\\nGGGACACAT\\nATAGATATGA\\n\\n'
>>> str(vdj_recombination(deletions=[(lambda: 2)]).recombine([fasta.Fasta('a', 'AATTAT'),\
fasta.Fasta('b', 'GGGACACAT'),\
fasta.Fasta('c', 'ATAGATATGA')]))
'>a 2//2 b 2//2 c\\nAATT\\nGACAC\\nAGATATGA\\n\\n'
'''
name = ''
seq = ''
insertions = self.insertions * (len(sequences)-1)
deletions = self.deletions * (len(sequences)*2-2)
process = self.processing * (len(sequences))
for i, sequence in enumerate(sequences):
nb_deletions_start = 0
nb_deletions_end = 0
N_insertions = ''
sequence.seq = sequence.seq.translate(None, '.')
if i > 0:
# Start deletion
nb_deletions_start = deletions[2*i-1]()
name += '/%d ' % nb_deletions_start
name += sequence.name
if i < len(sequences) - 1:
# End deletion
nb_deletions_end = deletions[2*i]()
N_insertions = random_sequence(['A', 'C', 'G', 'T'],\
insertions[i]())
name += ' %d/%s' % (nb_deletions_end, N_insertions)
nb_deletions_end = -nb_deletions_end if nb_deletions_end > 0 else None
seq += process[i](sequence.seq[nb_deletions_start:nb_deletions_end])+"\n"+N_insertions+"\n"
return fasta.Fasta(name, seq)
def random_sequence(characters, length):
return ''.join([random.choice(characters) for x in range(length)])
def mutate_sequence(sequence, probability):
'''
Mutate the original DNA sequence given in parameter.
The probability is a per nucleotide probability.
This solution is inspired from Blckknght's: http://stackoverflow.com/a/24063748/1192742
'''
mutated = []
nucleotides = ['A', 'C', 'G', 'T']
for nt in sequence:
if random.random() < probability:
if nt.upper() in nucleotides:
nt = nucleotides[nucleotides.index(nt.upper()) - random.randint(1, 3)]
else:
nt = random.choice(nucleotides)
mutated.append(nt)
return ''.join(mutated)
def random_pos_int(mean, stddev):
'''
Returns a random number whose distribution
has the mean provided has a parameter and the standard deviation
is stddev
'''
result = random.gauss(mean, stddev)
if result < 0:
return 0
return int(result)
def get_gene_name(allele):
'''
From fasta sequence to Ig/TR gene name
'''
return allele.name[:allele.name.find('*')]
def write_seq_to_file(seq, code, f):
seq.header = seq.header.replace(' ', '_')+"__"+code
f.write(str(seq))
def generate_to_file(repertoire, recombination, code, f, nb_recomb):
print(" ==>", f)
output = open(f, 'w')
nb = 0
for recomb in repertoire.recombinations():
for i in range(nb_recomb):
write_seq_to_file(recombination.recombine(recomb), code, output)
nb += 1
print(" ==> %d recombinations" % nb)
def list_random_tuple(s):
try:
list_r = s.split(':')
result_list = []
for item in list_r:
one, two = map(float, item.split(','))
result_list.append((lambda: random_pos_int(one, two)))
return result_list
except Exception, e:
raise argparse.ArgumentTypeError('A list separated by colons, of couples separated by commas must be provided (ex: 1,2:2,1) '+str(e))
def list_int(s):
try:
result_list = []
for item in s.split(':'):
result_list.append((lambda: int(item)))
return result_list
except Exception, e:
raise argparse.ArgumentTypeError('A list of integers separated by colons must be provided (ex: 1:2) '+str(e))
if __name__ == '__main__':
DESCRIPTION='Script generating fake V(D)J recombinations'
parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument('-g', '--germlines', type=file, default='germlines.data', help='path to the germlines.data file')
parser.add_argument('--deletions', '-d', type=list_int, default = [(lambda: 5)], help='List -- separated by colons -- of the number of deletions at junctions (or single value, if the number is the same everywhere).')
parser.add_argument('--insertions', '-i', type=list_int, default = [(lambda: 3)], help='List -- separated by colons -- of the number of insertions at junctions (or single value, if the number is the same everywhere')
parser.add_argument('--random-deletions', '-D', type=list_random_tuple, help='List of random deletions at junctions under the format mean,standard_deviation (or single value, if the number is the same everywhere')
parser.add_argument('--random-insertions', '-I', type=list_random_tuple, help='List of the number of insertions at junctions under the format mean,standard_deviation (or single value, if the number is the same everywhere')
parser.add_argument('-n', '--nb-recombinations', type=int, default=5, help='Number of times each recombination (with insertions/deletions) is generated')
parser.add_argument('-e', '--error', type=float, default = 0., help='Probability of error at the nucleotide level')
args = parser.parse_args()
germlines_json = args.germlines.read().replace('germline_data = ', '')
germlines = json.loads(germlines_json)
for code in germlines:
g = germlines[code]
print("--- %s - %-4s - %s" % (g['shortcut'], code, g['description']))
# Read germlines
nb_recomb = 0
for recomb in g['recombinations']:
labels = ['V']
files = [recomb['5']]
if '4' in recomb:
labels.append('D')
files.append(recomb['4'])
labels.append('J')
files.append(recomb['3'])
repertoire = vdj_repertoire.files(labels, files)
print(" 5: %3d sequences\n" % repertoire.nb_sequences('V'),
" 4: %3d sequences\n" % (repertoire.nb_sequences('D') if 'D' in labels else 0),
" 3: %3d sequences\n" % repertoire.nb_sequences('J'))
code_in_filename = code
if nb_recomb > 0:
code_in_filename = code_in_filename + '-%d' % (nb_recomb+1)
# Generate recombinations
recombination0 = vdj_recombination()
generate_to_file(repertoire, recombination0, code, '../data/gen/0-removes-%s.should-vdj.fa' % code_in_filename, 1)
deletions = args.deletions if args.random_deletions is None else args.random_deletions
insertions = args.insertions if args.random_insertions is None else args.random_insertions
recombination5 = vdj_recombination(deletions=deletions, insertions=insertions, processing = [(lambda s: mutate_sequence(s, args.error))])
generate_to_file(repertoire, recombination5, code, '../data/gen/5-removes-%s.should-vdj.fa' % code_in_filename, args.nb_recombinations)
print()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Get from NCBI CD sequences from the HCDM database (hcdm.org), as exported by HGNC (genenames.org)'''
import urllib
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'
HUGO_COLS = '&col=gd_hgnc_id&col=md_refseq_id&col=gd_other_ids_list&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids'
# HUGO query on 'hcdm.org' entries
HUGO_QUERY_HCDM = '&status=Approved&status=Entry+Withdrawn&status_opt=2&where=gd_other_ids+LIKE+%27%25hcdm.org%25%27&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit'
HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s'
# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [ 'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b', 'CD11a', 'CD11b', 'CD14', 'CD15', 'CD16a', 'CD16b', 'CD19', 'CD20', 'CD22', 'CD24', 'CD25', 'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61', 'CD91', 'CD114', 'CD117', 'CD182' ]
OUT = 'homo-sapiens/CD.fa'
SORTING_OUT = 'homo-sapiens/CD-sorting.fa'
print "==>", OUT
out = open(OUT, 'w')
print "==>", SORTING_OUT
sorting_out = open(SORTING_OUT, 'w')
def ncbi_and_write(ncbi, hugo, cd_id, outs):
print cd_id, hugo, ncbi
fasta = urllib.urlopen(NCBI_API % ncbi).read()
fasta_with_id = fasta.replace('>', '>%s|%s|' % (hugo, cd_id))
for out in outs:
out.write(fasta_with_id)
for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
ll = l.split('\t')
try:
hugo, ncbi, ids = ll[0], ll[1], ll[2]
cd_id = ids.split(',')[2].strip()
except:
print "!", l
continue
ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))