From a90864e5ad08e5c6589494da68cb700934702a64 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Thu, 12 Jul 2018 20:28:48 +0200 Subject: [PATCH] split-from-imgt.py: Code factorization Between get_updownstream_sequences and get_gene_sequence --- germline/ncbi.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/germline/ncbi.py b/germline/ncbi.py index 60d616c45..344a7325e 100644 --- a/germline/ncbi.py +++ b/germline/ncbi.py @@ -10,7 +10,7 @@ API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' API_EUTILS += 'api_key='+os.environ['NCBI_KEY']+'&' if 'NCBI_KEY' in os.environ else '' API_NUCCORE_ID = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' -API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s' +API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s&strand=%d' API_GENE_ID_XML = API_EUTILS + 'db=gene&retmode=xml&rettype=docsum' + '&id=%s' @@ -25,12 +25,19 @@ def get_gene_sequence(gene, other_gene_name, start, end, additional_length): ''' Return the gene sequences between positions start and end (included). ''' + reversed = False + if end < start: + tmp = end + end = start + start = tmp + reversed = True + if additional_length > 0: end += additional_length elif additional_length < 0: start = max(1, start + additional_length) - fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read() + fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end, 2 if reversed else 1)).read() return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string) def ncbi_and_write(ncbi, additional_header, outs): @@ -69,16 +76,9 @@ def get_updownstream_sequences(gene, start, end, additional_length): end = start - 1 * reversed start = max(1, start + additional_length * reversed) - if start > end: - tmp = start - start = end - end = tmp - - updown_fasta = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read() + updown_fasta = get_gene_sequence(gene, '', start, end, 0) updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip() - if reversed == -1: - updown_raw = fasta.revcomp(updown_raw.upper()) if additional_length > 0: return ('', updown_raw) -- GitLab