Commit c4fc2195 authored by flothoni's avatar flothoni

Merge branch...

Merge branch 'feature-c/4104-n-afficher-dans-le-graph-que-les-clones-du-sample-courant' of gitlab.inria.fr:vidjil/vidjil into feature-c/4104-n-afficher-dans-le-graph-que-les-clones-du-sample-courant
parents e995fbbe 6b8cd835
Pipeline #122887 passed with stages
in 7 minutes and 44 seconds
...@@ -9,9 +9,9 @@ build_doc: ...@@ -9,9 +9,9 @@ build_doc:
- site/ - site/
expire_in: 1 month expire_in: 1 month
only: only:
changes: - /^[^\/]*doc[^\/]*\//
- doc/**/* # branch name should contain doc before the first /
- mkdocs.yml # eg.: doc/blabla or feature-adoc/blabla
tags: tags:
- doc - doc
...@@ -21,6 +21,4 @@ deploy_doc: ...@@ -21,6 +21,4 @@ deploy_doc:
- scp -r site/ $VIDJIL_WWW:doc/ - scp -r site/ $VIDJIL_WWW:doc/
when: manual when: manual
only: only:
changes: - /^[^\/]*doc[^\/]*\//
- doc/**/*
- mkdocs.yml
...@@ -139,8 +139,38 @@ def store_data_if_updownstream(fasta_header, path, data, genes): ...@@ -139,8 +139,38 @@ def store_data_if_updownstream(fasta_header, path, data, genes):
data[path+'/'+gene].append((gene_name, gene_coord)) data[path+'/'+gene].append((gene_name, gene_coord))
paths.append(path+'/'+gene) paths.append(path+'/'+gene)
return paths return paths
def retrieve_genes(f, genes, tag, additional_length, gene_list): def ignore_strand(start, end):
if start < end:
return (start, end)
return (end, start)
def compute_updownstream_length(genes, default_length):
'''
Returns the maximal `min_length` size (but at most `default_length`)
such that this size never overlaps the previous/next gene.
'''
positions = [ ignore_strand(info[1]['target_start'], info[1]['target_end']) for info in genes if 'target_start' in info[1]]
positions = list(set(positions))
positions.sort()
i = 0
min_length = default_length
sign = - 1 if min_length < 0 else 1
while i < len(positions) - 1:
last = positions[i][1]
first_next = positions[i+1][0]
diff = first_next - last - 1
if diff < abs(min_length):
min_length = diff * sign
i += 1
# Should we divide by 2 the length so that we don't have overlaps
# between up and downstream?
return min_length
def retrieve_genes(f_name, genes, tag, additional_length, gene_list):
f = verbose_open_w(f_name)
for info in genes: for info in genes:
(gene, coord) = info (gene, coord) = info
# try to extract from genome # try to extract from genome
...@@ -151,17 +181,27 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): ...@@ -151,17 +181,27 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if gene_id: if gene_id:
try: try:
(target, start, end) = ncbi.get_gene_positions(gene_id) (target, start, end) = ncbi.get_gene_positions(gene_id)
coord['target'] = target
coord['target_start'] = start
coord['target_end'] = end
coord['gene_id'] = gene_id
except KeyError: except KeyError:
print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord))) print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord)))
allele_additional_length = additional_length
gene_id = None min_updownstream = compute_updownstream_length(genes, additional_length)
print(' %s, ' % f_name + 'genes: %d, ' % len(genes) + 'up/downstream: %dbp' % min_updownstream)
# gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence # gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
# gene_id: is the NCBI ID of the VDJ gene # gene_id: is the NCBI ID of the VDJ gene
# target: is the NCBI ID of the chromosome # target: is the NCBI ID of the chromosome
for info in genes:
(gene, coord) = info
gene_id = coord['gene_id'] if 'gene_id' in coord else None
if GENES_SEQ_FROM_NCBI: if GENES_SEQ_FROM_NCBI:
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length) gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], min_updownstream)
else: else:
# IMGT # IMGT
gene_data = coord['seq'] gene_data = coord['seq']
...@@ -169,9 +209,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): ...@@ -169,9 +209,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if gene_id: if gene_id:
# Check consistency for *01 allele # Check consistency for *01 allele
if coord['imgt_name'].endswith('*01'): if coord['imgt_name'].endswith('*01'):
check_imgt_ncbi_consistency(coord, gene_data, target, start, end) check_imgt_ncbi_consistency(coord, gene_data, coord['target'], coord['target_start'],
coord['target_end'])
up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length) up_down = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'],
coord['target_end'], min_updownstream)
# We put the up and downstream data before and after the sequence we retrieved previously # We put the up and downstream data before and after the sequence we retrieved previously
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1]) gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
...@@ -237,8 +278,8 @@ def gap_j(seq): ...@@ -237,8 +278,8 @@ def gap_j(seq):
return (MAX_GAP_J - pos) * '.' + seq return (MAX_GAP_J - pos) * '.' + seq
LENGTH_UPSTREAM=40 LENGTH_UPSTREAM=200
LENGTH_DOWNSTREAM=40 LENGTH_DOWNSTREAM=200
# Create isolated files for some sequences # Create isolated files for some sequences
SPECIAL_SEQUENCES = [ SPECIAL_SEQUENCES = [
] ]
...@@ -399,12 +440,12 @@ def split_IMGTGENEDBReferenceSequences(sources, gene_list): ...@@ -399,12 +440,12 @@ def split_IMGTGENEDBReferenceSequences(sources, gene_list):
# Dump up/downstream data # Dump up/downstream data
for system in upstream_data: for system in upstream_data:
f = verbose_open_w(system + TAG_UPSTREAM + '.fa') f_name = system + TAG_UPSTREAM + '.fa'
retrieve_genes(f, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list) retrieve_genes(f_name, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list)
for system in downstream_data: for system in downstream_data:
f = verbose_open_w(system + TAG_DOWNSTREAM + '.fa') f_name = system + TAG_DOWNSTREAM + '.fa'
retrieve_genes(f, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list) retrieve_genes(f_name, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list)
......
!NO_LAUNCHER: !NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n') # The awk part prints the IGHD2-2*02 sequence
!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream $ Correct sequence, with upstream
i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRBJ+down.fa | tr -d '\n' | tr '>' '\n')
$ TRBJ1-1 has the correct 71bp downstream
1: GTAAGACATTTTTCAGGTTCTTTTGCAGATCCGTCACAGGGAAAAGTGGGTCCACAGTGTCCCTTTTAGAG
$ The 16 sequences have 71bp downstream, and no more
16: [AGCT]{71}
0: [AGCT]{72}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment