#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''Fetch from NCBI the sequences of the CD (cluster of differentiation) entries of the HCDM database (hcdm.org), as exported by HGNC (genenames.org)'''

import urllib

from ncbi import *

# HGNC (genenames.org) download endpoint; the query string is appended to it.
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'

# Columns requested in the export, in output order. The parsing loop of this
# script reads the first three of them by position (fields 0, 1 and 2).
HUGO_COLS = ''.join('&col=' + col for col in (
    'gd_hgnc_id',
    'md_refseq_id',
    'gd_other_ids_list',
    'gd_app_sym',
    'gd_app_name',
    'gd_status',
    'gd_prev_sym',
    'gd_aliases',
    'gd_pub_chrom_map',
    'gd_pub_acc_ids',
    'gd_pub_refseq_ids',
))

# HUGO query on 'hcdm.org' entries
HUGO_QUERY_HCDM = (
    '&status=Approved&status=Entry+Withdrawn&status_opt=2'
    # URL-encoded filter: gd_other_ids LIKE '%hcdm.org%'
    '&where=gd_other_ids+LIKE+%27%25hcdm.org%25%27'
    '&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit'
)

# Full download URL: endpoint + requested columns + HCDM filter.
HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM



# CD markers commonly used to sort cells,
# see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [
    'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b',
    'CD11a', 'CD11b', 'CD14', 'CD15', 'CD16a', 'CD16b',
    'CD19', 'CD20', 'CD22', 'CD24', 'CD25',
    'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61', 'CD91',
    'CD114', 'CD117', 'CD182',
]

# FASTA output receiving every retrieved CD entry.
OUT = 'homo-sapiens/CD.fa'

# FASTA output receiving only the entries whose CD id is listed in SORTING_CD.
SORTING_OUT = 'homo-sapiens/CD-sorting.fa'

# Announce and open both outputs. The `with` blocks guarantee that the files
# are flushed and closed even if the download or a later call fails midway.
print("==> %s" % OUT)
with open(OUT, 'w') as out:
    print("==> %s" % SORTING_OUT)
    with open(SORTING_OUT, 'w') as sorting_out:

        # Read the whole HGNC export first, then release the HTTP connection
        # before the (potentially long) per-entry processing starts.
        response = urllib.urlopen(HUGO_URL_HCDM)
        try:
            lines = response.readlines()
        finally:
            response.close()

        for line in lines:
            fields = line.split('\t')

            # Fields follow the order of HUGO_COLS: HGNC id, RefSeq id, other ids.
            # The CD id is taken as the third comma-separated item of 'other ids'.
            try:
                hugo, ncbi, ids = fields[0], fields[1], fields[2]
                cd_id = ids.split(',')[2].strip()
            except IndexError:
                # Line with too few columns or too few ids: report it and skip.
                # Only IndexError is caught so that Ctrl-C and real bugs still surface.
                print("! %s" % line)
                continue

            # Every entry goes to `out`; entries listed in SORTING_CD also go to
            # `sorting_out`.
            # NOTE(review): ncbi_and_write comes from the project-local `ncbi`
            # module; presumably it fetches the sequence for the RefSeq id and
            # writes it with the given header prefix to each file -- confirm there.
            targets = [out]
            if cd_id in SORTING_CD:
                targets.append(sorting_out)

            ncbi_and_write(ncbi,
                           '%s|%s|' % (hugo, cd_id),
                           targets)