Commit 443b1747 authored by Mikaël Salson's avatar Mikaël Salson

doc, germline: Add some documentation (and debug info) on germlines

parent 77734929
Pipeline #121585 passed with stage
in 8 seconds
......@@ -6,6 +6,17 @@ at the [issues](, at the commit messages, and at the so
# Development notes -- Vidjil-algo
## Germlines
Germlines are retrieved from IMGT and their sequences can be completed with
additional upstream and downstream sequences to improve the detection of short
D/J genes.
We paste the same upstream and downstream sequences for all alleles of a same
gene. The sequences are retrieved from the chromosomal position of the
gene. So if the chromosomal position of a gene cannot be obtained, the alleles
of that gene won't have any upstream or downstream sequences.
## Code organisation
The algorithm follows roughly those steps:
......@@ -9,6 +9,7 @@ from collections import defaultdict, OrderedDict
import re
import ncbi
from inspect import getframeinfo, stack
......@@ -29,6 +30,13 @@ def remove_allele(name):
current_file = None
debug_mode = False
def debug(message):
if (debug_mode):
caller = getframeinfo(stack()[1][0])
print "%s:%d\t%s" % (caller.filename, caller.lineno, message)
def verbose_open_w(name):
print (" ==> %s" % name)
......@@ -115,6 +123,7 @@ def check_imgt_ncbi_consistency(imgt_info, imgt_data, ncbi_target, ncbi_start, n
print >>sys.stderr,"WARNING: Length for %s differ between IMGT (%d) and NCBI (%d)" % (imgt_info['imgt_name'], abs(imgt_info['from'] - imgt_info['to'])+1, abs(ncbi_start - ncbi_end)+1)
# Check that sequences are identical
debug("NCBI: target={} {}-{}".format(ncbi_target, ncbi_start, ncbi_end))
ncbi_seq = ncbi.get_gene_sequence(ncbi_target, '', ncbi_start, ncbi_end, 0).split('\n')[1:]
gene_lines = imgt_data.split('\n')[1:]
if gene_lines[0].startswith('#'):
......@@ -145,6 +154,7 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
(gene, coord) = info
# try to extract from genome
gene_id = gene_list.get_gene_id_from_imgt_name(coord['species'], coord['imgt_name'])
debug("gene_id={}\timgt_name={}".format(gene_id, coord['imgt_name']))
allele_additional_length = 0
......@@ -418,6 +428,10 @@ if __name__ == '__main__':
if sys.argv[1] == "--debug":
debug_mode = True
ReferenceSequencesInframe = sys.argv[1]
ReferenceSequencesAll = sys.argv[2]
GeneList = sys.argv[3]
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment