diff --git a/algo/tests/data/chimera-fake-half.fa b/algo/tests/data/chimera-fake-half.fa index 5ab8f7aa23ffec7340d6aeb0437d319dd734e479..62606a3ddbc7a9838cc49d7d1364c88038171883 100644 --- a/algo/tests/data/chimera-fake-half.fa +++ b/algo/tests/data/chimera-fake-half.fa @@ -4,15 +4,15 @@ >TRAV1-1*01--caca ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagata -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >TRGV1*01--caca tcttccaacttggaagggagaacgaagtcagtcaccaggctgactgggtcatctgctgaa -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >IGHV1-18*01--caca caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctcagtgaaggtc -cacacacacacacacacacacacacacacacacacacaca +tatatatatatatatatatatatatatatatatatatata >atat--TRBV1*01 diff --git a/algo/tests/should-get-tests/germlines-read.should b/algo/tests/should-get-tests/germlines-read.should index a4f0672462964ded4c1c1c3ec5563662cb05cbd2..066a34addbf64ddc10a033510f4ff2ce7858414b 100644 --- a/algo/tests/should-get-tests/germlines-read.should +++ b/algo/tests/should-get-tests/germlines-read.should @@ -4,14 +4,14 @@ $ number of reads and kmers 1:13153 reads, 3020179 kmers $ k-mers, IGHV -1:13116 .* 1219953 .*IGHV +1:13116 .* 1219943 .*IGHV $ k-mers, IGHJ -1:37 .* 435550 .*IGHJ +1:37 .* 435553 .*IGHJ $ k-mers, ambiguous -1:53416 .*\\? +1:53426 .*\\? $ k-mers, unknown -1:1406774 .*_ +1:1248820 .*_ diff --git a/algo/tests/should-get-tests/help-examples.should b/algo/tests/should-get-tests/help-examples.should index b19ca4db867c465e69b5cbab7b3e904fbbf26226..52a690a64aa76c2607573f558db0f142c5a295d7 100644 --- a/algo/tests/should-get-tests/help-examples.should +++ b/algo/tests/should-get-tests/help-examples.should @@ -23,4 +23,4 @@ cd $VIDJIL_DIR # LIL-L4, detects first clone 1:Clone .001 .* 30..\% -1:clone-001--TRG.* TRGV9.01 0/AC/1 TRGJ1.02 +1:clone-001--TRG.* TRGV9.01 0/AC/4 TRGJ[12] diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 63ecf0405cd74a4a5bc6c79c87438b3eece0957a..9f201fbe1e1e0682a4b14a7abdb3bb0d0366bc30 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -139,7 +139,31 @@ def store_data_if_updownstream(fasta_header, path, data, genes): data[path+'/'+gene].append((gene_name, gene_coord)) paths.append(path+'/'+gene) return paths - + +def ignore_strand(start, end): + if start < end: + return (start, end) + return (end, start) + +def compute_updownstream_length(genes, default_length): + positions = [ ignore_strand(info[1]['target_start'], info[1]['target_end']) for info in genes if 'target_start' in info[1]] + positions = list(set(positions)) + positions.sort() + i = 0 + min_length = default_length + sign = - 1 if min_length < 0 else 1 + while i < len(positions) - 1: + last = positions[i][1] + first_next = positions[i+1][0] + diff = first_next - last - 1 + if diff < abs(min_length): + min_length = diff * sign + i += 1 + # Should we divide by 2 the length so that we don't have overlaps + # between up and downstream? + return min_length + + def retrieve_genes(f, genes, tag, additional_length, gene_list): for info in genes: (gene, coord) = info @@ -151,17 +175,26 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): if gene_id: try: (target, start, end) = ncbi.get_gene_positions(gene_id) + coord['target'] = target + coord['target_start'] = start + coord['target_end'] = end + coord['gene_id'] = gene_id except KeyError: print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord))) - allele_additional_length = additional_length - gene_id = None + + min_updownstream = compute_updownstream_length(genes, additional_length) # gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence # gene_id: is the NCBI ID of the VDJ gene # target: is the NCBI ID of the chromosome + for info in genes: + (gene, coord) = info + + gene_id = coord['gene_id'] if 'gene_id' in coord else None + if GENES_SEQ_FROM_NCBI: - gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length) + gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], min_updownstream) else: # IMGT gene_data = coord['seq'] @@ -169,9 +202,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list): if gene_id: # Check consistency for *01 allele if coord['imgt_name'].endswith('*01'): - check_imgt_ncbi_consistency(coord, gene_data, target, start, end) - - up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length) + check_imgt_ncbi_consistency(coord, gene_data, coord['target'], coord['target_start'], + coord['target_end']) + up_down = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'], + coord['target_end'], min_updownstream) # We put the up and downstream data before and after the sequence we retrieved previously gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1]) @@ -237,8 +271,8 @@ def gap_j(seq): return (MAX_GAP_J - pos) * '.' + seq -LENGTH_UPSTREAM=40 -LENGTH_DOWNSTREAM=40 +LENGTH_UPSTREAM=200 +LENGTH_DOWNSTREAM=200 # Create isolated files for some sequences SPECIAL_SEQUENCES = [ ] diff --git a/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get b/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get index c1ede01d35fee1d5a0202de8617eb9ebf35dd955..ca4971d770ff654082beec5083feeb07c5306b90 100644 --- a/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get +++ b/germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get @@ -1,5 +1,6 @@ !NO_LAUNCHER: -!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n') +# The awk part prints the IGHD2-2*02 sequence +!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n') $ Correct sequence, with upstream i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA