From 52aa017c75c30eee8bd826ccaaaca63fac495885 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 16:15:41 +0200 Subject: [PATCH 1/6] split-from-imgt.py: Be more flexible on up and downstream sequences Compute the mininmal length between genes from a given locus to know how far we can go for upstream and downstream sequences. Fix #3133 --- germline/split-from-imgt.py | 48 +++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 63ecf0405..80064322d 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -139,7 +139,31 @@ def store_data_if_updownstream(fasta_header, path, data, genes): data[path+'/'+gene].append((gene_name, gene_coord)) paths.append(path+'/'+gene) return paths - + +def ignore_strand(start, end): + if start < end: + return (start, end) + return (end, start) + +def compute_updownstream_length(genes, default_length): + positions = [ ignore_strand(info[1]['target_start'], info[1]['target_end']) for info in genes if 'target_start' in info[1]] + positions = list(set(positions)) + positions.sort() + i = 0 + min_length = default_length + sign = - 1 if min_length < 0 else 1 + while i < len(positions) - 1: + last = positions[i][1] + first_next = positions[i+1][0] + diff = first_next - last - 1 + if diff < abs(min_length): + min_length = diff * sign + i += 1 + # Should we divide by 2 the length so that we don't have overlaps + # between up and downstream? + return min_length + + def retrieve_genes(f, genes, tag, additional_length, gene_list): for info in genes: (gene, coord) = info @@ -151,17 +175,26 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): if gene_id: try: (target, start, end) = ncbi.get_gene_positions(gene_id) + coord['target'] = target + coord['target_start'] = start + coord['target_end'] = end + coord['gene_id'] = gene_id except KeyError: print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord))) - allele_additional_length = additional_length - gene_id = None + + min_updownstream = compute_updownstream_length(genes, additional_length) # gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence # gene_id: is the NCBI ID of the VDJ gene # target: is the NCBI ID of the chromosome + for info in genes: + (gene, coord) = info + + gene_id = coord['gene_id'] if 'gene_id' in coord else None + if GENES_SEQ_FROM_NCBI: - gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length) + gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], min_updownstream) else: # IMGT gene_data = coord['seq'] @@ -169,9 +202,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): if gene_id: # Check consistency for *01 allele if coord['imgt_name'].endswith('*01'): - check_imgt_ncbi_consistency(coord, gene_data, target, start, end) - - up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length) + check_imgt_ncbi_consistency(coord, gene_data, coord['target'], coord['target_start'], + coord['target_end']) + up_down = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'], + coord['target_end'], min_updownstream) # We put the up and downstream data before and after the sequence we retrieved previously gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1]) -- GitLab From 802105692aed11727f3190cdbe63b45159c651dc Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 16:15:54 +0200 Subject: [PATCH 2/6] split-from-imgt.py: Increase default upstream and downstream lengths It can now be much higher as we make sure not to overlap another gene. See #3133 --- germline/split-from-imgt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 80064322d..9f201fbe1 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -271,8 +271,8 @@ def gap_j(seq): return (MAX_GAP_J - pos) * '.' + seq -LENGTH_UPSTREAM=40 -LENGTH_DOWNSTREAM=40 +LENGTH_UPSTREAM=200 +LENGTH_DOWNSTREAM=200 # Create isolated files for some sequences SPECIAL_SEQUENCES = [ ] -- GitLab From 2250e6b2c55363cafdf3c7e6ec48a2375112992e Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 18:15:59 +0200 Subject: [PATCH 3/6] 10-germlines-IGH-upstream.should-get: Update test Use awk rather than awk to output the sequence. With grep we need to know on how many lines the gene is, while it is not necessary with awk (albeit somehow more complicated). --- .../should-get-tests/10-germlines-IGHD-upstream.should-get | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get b/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get index c1ede01d3..ca4971d77 100644 --- a/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get +++ b/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get @@ -1,5 +1,6 @@ !NO_LAUNCHER: -!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n') +# The awk part prints the IGHD2-2*02 sequence +!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n') $ Correct sequence, with upstream i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA -- GitLab From a089d5b6edee1026f52646752eed20e94273a149 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 20:09:45 +0200 Subject: [PATCH 4/6] chimera-fake-half.fa: Change cacac.... to tatat.... It appears that cacacacacac exists in a J+down sequence. Maybe we should remove low-complexity sequences? --- algo/tests/data/chimera-fake-half.fa | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/algo/tests/data/chimera-fake-half.fa b/algo/tests/data/chimera-fake-half.fa index 5ab8f7aa2..62606a3dd 100644 --- a/algo/tests/data/chimera-fake-half.fa +++ b/algo/tests/data/chimera-fake-half.fa @@ -4,15 +4,15 @@ >TRAV1-1*01--caca ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagata -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >TRGV1*01--caca tcttccaacttggaagggagaacgaagtcagtcaccaggctgactgggtcatctgctgaa -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >IGHV1-18*01--caca caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctcagtgaaggtc -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >atat--TRBV1*01 -- GitLab From b4905a78a34ac20aa88a7cb8dceebd64574a0bf7 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 20:10:05 +0200 Subject: [PATCH 5/6] stanford-germlines.should-get: Update numbers with longer up/down stream sequences --- algo/tests/should-get-tests/germlines-read.should | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/algo/tests/should-get-tests/germlines-read.should b/algo/tests/should-get-tests/germlines-read.should index a4f067246..066a34add 100644 --- a/algo/tests/should-get-tests/germlines-read.should +++ b/algo/tests/should-get-tests/germlines-read.should @@ -4,14 +4,14 @@ $ number of reads and kmers 1:13153 reads, 3020179 kmers $ k-mers, IGHV -1:13116 .* 1219953 .*IGHV +1:13116 .* 1219943 .*IGHV $ k-mers, IGHJ -1:37 .* 435550 .*IGHJ +1:37 .* 435553 .*IGHJ $ k-mers, ambiguous -1:53416 .*\\? +1:53426 .*\\? $ k-mers, unknown -1:1406774 .*_ +1:1248820 .*_ -- GitLab From 46ebab8acdf768def5cbd5008437f6098bce3636 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Mon, 23 Jul 2018 20:10:28 +0200 Subject: [PATCH 6/6] vidjil-h-examples.should-get: TRGJ1*01 is now identical to TRGJ1*02 --- algo/tests/should-get-tests/help-examples.should | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algo/tests/should-get-tests/help-examples.should b/algo/tests/should-get-tests/help-examples.should index b19ca4db8..52a690a64 100644 --- a/algo/tests/should-get-tests/help-examples.should +++ b/algo/tests/should-get-tests/help-examples.should @@ -23,4 +23,4 @@ cd $VIDJIL_DIR # LIL-L4, detects first clone 1:Clone .001 .* 30..\% -1:clone-001--TRG.* TRGV9.01 0/AC/1 TRGJ1.02 +1:clone-001--TRG.* TRGV9.01 0/AC/4 TRGJ[12] -- GitLab