Commit dfa98c3f authored by BELCOUR Arnaud's avatar BELCOUR Arnaud

Add script extracting EC from KEGG for a genbank file.

parent 388a3659
# -*- coding: utf-8 -*-
"""
This file is part of padmet-utils.
padmet-utils is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
padmet-utils is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with padmet-utils. If not, see <http://www.gnu.org/licenses/>.
@author: Meziane AITE, meziane.aite@inria.fr
Description:
Extract EC from KEGG of a genbank file.
usage:
extract_ec_kegg.py --input=FILE --output=FILE [-v]
option:
-h --help Show help.
--input=FILE pathname of the genbank input file
--output=FILE pathanme of the genbank output file
-v verbose mode.
"""
import docopt
import pandas as pa
import os
import requests
from Bio import SeqIO
from io import StringIO
def main():
global verbose
args = docopt.docopt(__doc__)
gbk_input = args["--input"]
gbk_output = args["--output"]
if args['-v']:
verbose = args["-v"]
else:
verbose = None
add_ec_kegg(gbk_input, gbk_output)
def add_ec_kegg(gbk_input, gbk_output):
response = requests.get('http://rest.kegg.jp/list/organism/')
df_orga = pa.read_csv(StringIO(response.text), sep='\t', header=None)
orga_code = check_organism_kegg(gbk_input, df_orga)
if orga_code:
gene_ec_dicts = get_ec_kegg(orga_code)
ec_to_gbk(gbk_input, gbk_output, gene_ec_dicts)
def check_organism_kegg(genbank_pathname, df_orga):
with open(genbank_pathname, "r") as gbk:
# Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
first_seq_record = next(SeqIO.parse(gbk, "genbank"))
try:
species_name = first_seq_record.annotations['organism']
except KeyError:
raise KeyError('No organism in the Genbank. In the SOURCE you must have: ORGANISM Species name')
for index, row in df_orga.iterrows():
if species_name in row[2]:
orga_code = row[1]
if verbose:
print('{0} has been found in KEEG with id {1}.'.format(species_name, orga_code))
return orga_code
return None
def get_ec_kegg(organism_id):
response = requests.get('http://rest.kegg.jp/link/enzyme/' + organism_id)
gene_ec_dicts = {}
gene_ecs = response.text.split('\n')
for gene_ec in gene_ecs:
if gene_ec != '':
gene = gene_ec.split('\t')[0].split(':')[1]
ec = gene_ec.split('\t')[1].split(':')[1]
if gene not in gene_ec_dicts:
gene_ec_dicts[gene] = [ec]
else:
gene_ec_dicts[gene].append(ec)
return gene_ec_dicts
def ec_to_gbk(genbank_pathname_input, genbank_pathname_output, gene_ec_dicts):
ec_count = 0
records = []
for record in SeqIO.parse(genbank_pathname_input, 'genbank'):
for feature in record.features:
if feature.type == 'CDS':
if feature.qualifiers['locus_tag'][0] in gene_ec_dicts:
feature.qualifiers['EC_number'] = gene_ec_dicts[feature.qualifiers['locus_tag'][0]]
ec_count += len(gene_ec_dicts[feature.qualifiers['locus_tag'][0]])
records.append(record)
SeqIO.write(records, genbank_pathname_output, 'genbank')
if verbose:
print(str(ec_count) + ' ec have been added to new genbank at ' + genbank_pathname_output)
if __name__ == "__main__":
main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment