get-CD.py 1.88 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''Download from NCBI the nucleotide sequences of the CD (cluster of differentiation) entries referenced by the HCDM database (hcdm.org), using the gene list exported by HGNC (genenames.org).'''

import urllib

# HGNC (genenames.org) download endpoint; query parameters are appended to it.
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'

# Columns requested from HGNC, as repeated '&col=<name>' parameters.
# The order matters: the parsing loop at the bottom of the script reads
# the first three columns by position.
HUGO_COLS = '&col=' + '&col='.join((
    'gd_hgnc_id',
    'md_refseq_id',
    'gd_other_ids_list',
    'gd_app_sym',
    'gd_app_name',
    'gd_status',
    'gd_prev_sym',
    'gd_aliases',
    'gd_pub_chrom_map',
    'gd_pub_acc_ids',
    'gd_pub_refseq_ids',
))

# HUGO query on 'hcdm.org' entries
Mathieu Giraud's avatar
Mathieu Giraud committed
12 13 14
# Filter and output options of the HGNC request, one parameter per line.
# The URL-encoded 'where' clause decodes to: gd_other_ids LIKE '%hcdm.org%'
HUGO_QUERY_HCDM = (
    '&status=Approved'
    '&status=Entry+Withdrawn'
    '&status_opt=2'
    '&where=gd_other_ids+LIKE+%27%25hcdm.org%25%27'
    '&order_by=gd_app_sym_sort'
    '&format=text'
    '&limit='
    '&hgnc_dbtag=on'
    '&submit=submit'
)

# Complete download URL: endpoint, then requested columns, then the filter.
HUGO_URL_HCDM = ''.join((HUGO_REQUEST, HUGO_COLS, HUGO_QUERY_HCDM))
15 16 17 18


# NCBI E-utilities EFetch URL template returning a 'nuccore' record as FASTA
# text; the trailing '%s' is filled with the record identifier.
# HTTPS is used because NCBI requires it for E-utilities requests.
NCBI_API = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text' + '&id=%s'

19 20
# CD markers commonly used to sort cells,
# see https://en.wikipedia.org/wiki/Cluster_of_differentiation
# Entries whose CD identifier appears here are also written to SORTING_OUT.
SORTING_CD = (
    'CD3g CD3d CD3e CD4 CD8a CD8b '
    'CD11a CD11b CD14 CD15 CD16a CD16b '
    'CD19 CD20 CD22 CD24 CD25 '
    'CD30 CD31 CD34 CD38 CD45 CD56 CD61 CD91 '
    'CD114 CD117 CD182'
).split()
21

22
# Path of the FASTA file that receives every sequence fetched by this script.
OUT = 'homo-sapiens/CD.fa'
23

24
# Path of the FASTA file that receives only the sequences whose CD identifier
# is listed in SORTING_CD.
SORTING_OUT = 'homo-sapiens/CD-sorting.fa'
25

26 27 28
# Announce the main output path on stdout, then open it for writing.
# A single pre-formatted argument prints the same text whether 'print'
# is a statement or a function.
print("==> %s" % OUT)
out = open(OUT, 'w')

29 30 31
# Announce the sorting-subset output path on stdout, then open it for writing.
# A single pre-formatted argument prints the same text whether 'print'
# is a statement or a function.
print("==> %s" % SORTING_OUT)
sorting_out = open(SORTING_OUT, 'w')

Mathieu Giraud's avatar
Mathieu Giraud committed
32 33 34 35 36 37 38 39 40 41 42 43 44

def ncbi_and_write(ncbi, hugo, cd_id, outs):
    '''Fetch one record from NCBI and write it, tagged, to every file in outs.

    ncbi  -- identifier substituted into the NCBI_API URL template
    hugo  -- first field of the tag added to each header line
    cd_id -- second field of the tag added to each header line
    outs  -- iterable of objects with a write() method

    Every FASTA header line '>xxx' of the downloaded text is rewritten as
    '>hugo|cd_id|xxx'; all other lines are written unchanged.
    '''
    # Single pre-formatted argument: prints the same text whether 'print'
    # is a statement or a function.
    print("%s %s %s" % (cd_id, hugo, ncbi))

    # Close the connection as soon as the payload is read.
    response = urllib.urlopen(NCBI_API % ncbi)
    try:
        fasta = response.read()
    finally:
        response.close()

    # Tag only a '>' that starts a line (a FASTA header); a '>' occurring
    # inside a description or in an error body is left alone.
    tag = '>%s|%s|' % (hugo, cd_id)
    fasta_with_id = ''.join(
        (tag + line[1:]) if line.startswith('>') else line
        for line in fasta.splitlines(True)
    )

    for handle in outs:
        handle.write(fasta_with_id)




# Download the HGNC table, then fetch and store one NCBI record per usable row.
# Both output files are closed once the loop ends, even if it fails midway.
try:
    hcdm_table = urllib.urlopen(HUGO_URL_HCDM)
    try:
        lines = hcdm_table.readlines()
    finally:
        hcdm_table.close()

    for l in lines:
        ll = l.split('\t')

        try:
            # Columns follow the order requested in HUGO_COLS.
            hugo, ncbi, ids = ll[0], ll[1], ll[2]
            # The CD identifier is the third comma-separated item of the
            # third column.
            cd_id = ids.split(',')[2].strip()
        except IndexError:
            # Row with too few columns or too few ids (e.g. the header line):
            # report it and move on.
            print("! %s" % l)
            continue

        # Every entry goes to the main file; entries listed in SORTING_CD
        # also go to the sorting file.
        ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))
finally:
    out.close()
    sorting_out.close()

56 57