Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
vidjil
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
1,688
Issues
1,688
List
Boards
Labels
Service Desk
Milestones
Merge Requests
84
Merge Requests
84
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
vidjil
vidjil
Commits
8537dc89
Commit
8537dc89
authored
Jul 10, 2018
by
Mathieu Giraud
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature-g/3009-gene-locations' into 'dev'
Half-time merge, closes
#3009
. See
!182
.
parents
65a03851
b18c2caf
Pipeline
#32566
passed with stages
in 49 seconds
Changes
8
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
272 additions
and
82 deletions
+272
-82
.gitlab-ci.yml
.gitlab-ci.yml
+1
-1
germline/Makefile
germline/Makefile
+6
-1
germline/get-CD.py
germline/get-CD.py
+5
-12
germline/get-germline
germline/get-germline
+3
-1
germline/ncbi.py
germline/ncbi.py
+110
-0
germline/split-from-imgt.py
germline/split-from-imgt.py
+140
-65
germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get
...ts/should-get-tests/10-germlines-IGHD-upstream.should-get
+5
-0
germline/tests/should-get-tests/10-germlines-upstream.should-get
...e/tests/should-get-tests/10-germlines-upstream.should-get
+2
-2
No files found.
.gitlab-ci.yml
View file @
8537dc89
...
...
@@ -45,7 +45,7 @@ test_germlines:
stage
:
test_germlines
script
:
-
make -C germline get-all-data
-
make -C germline
/
tests
-
make -C germline
tests
only
:
-
/^feature-.*g.*\/.*$/
...
...
germline/Makefile
View file @
8537dc89
...
...
@@ -30,7 +30,12 @@ diff-from-saved:
echo
diff
-r
-u
-x
"*[.][^f][^a]"
-x
"germline*"
-x
"get*"
-x
"Makefile"
-x
"saved-*"
saved-germline/ .
tests
:
python split-from-imgt.py
--test
make
-C
tests
distrib
:
get-all-data js
cd
..
;
tar
cvzf germline-
`
cat
germline/germline_id
`
.tar.gz germline/germline_id germline/
*
/
*
.fa germline/IMGT_RELEASE browser/js/germline.js
.PHONY
:
all germline js get-all-data clean diff-from-saved
.PHONY
:
all germline js get-all-data clean diff-from-saved
tests
germline/get-CD.py
View file @
8537dc89
...
...
@@ -5,6 +5,8 @@
import
urllib
from
ncbi
import
*
HUGO_REQUEST
=
'http://www.genenames.org/cgi-bin/download?'
HUGO_COLS
=
'&col=gd_hgnc_id&col=md_refseq_id&col=gd_other_ids_list&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids'
...
...
@@ -14,7 +16,6 @@ HUGO_QUERY_HCDM = '&status=Approved&status=Entry+Withdrawn&status_opt=2&where=gd
HUGO_URL_HCDM
=
HUGO_REQUEST
+
HUGO_COLS
+
HUGO_QUERY_HCDM
NCBI_API
=
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'
+
'&id=%s'
# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD
=
[
'CD3g'
,
'CD3d'
,
'CD3e'
,
'CD4'
,
'CD8a'
,
'CD8b'
,
'CD11a'
,
'CD11b'
,
'CD14'
,
'CD15'
,
'CD16a'
,
'CD16b'
,
'CD19'
,
'CD20'
,
'CD22'
,
'CD24'
,
'CD25'
,
'CD30'
,
'CD31'
,
'CD34'
,
'CD38'
,
'CD45'
,
'CD56'
,
'CD61'
,
'CD91'
,
'CD114'
,
'CD117'
,
'CD182'
]
...
...
@@ -30,16 +31,6 @@ print "==>", SORTING_OUT
sorting_out
=
open
(
SORTING_OUT
,
'w'
)
def
ncbi_and_write
(
ncbi
,
hugo
,
cd_id
,
outs
):
print
cd_id
,
hugo
,
ncbi
fasta
=
urllib
.
urlopen
(
NCBI_API
%
ncbi
).
read
()
fasta_with_id
=
fasta
.
replace
(
'>'
,
'>%s|%s|'
%
(
hugo
,
cd_id
))
for
out
in
outs
:
out
.
write
(
fasta_with_id
)
for
l
in
urllib
.
urlopen
(
HUGO_URL_HCDM
).
readlines
():
ll
=
l
.
split
(
'
\t
'
)
...
...
@@ -51,7 +42,9 @@ for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
print
"!"
,
l
continue
ncbi_and_write
(
ncbi
,
hugo
,
cd_id
,
[
out
]
+
([
sorting_out
]
if
cd_id
in
SORTING_CD
else
[]))
ncbi_and_write
(
ncbi
,
'%s|%s|'
%
(
hugo
,
cd_id
),
[
out
]
+
([
sorting_out
]
if
cd_id
in
SORTING_CD
else
[]))
germline/get-germline
View file @
8537dc89
#!/bin/sh
cd
$(
dirname
$0
)
wget
-O
- http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP | python split-from-imgt.py
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList
wget
-O
IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
...
...
germline/ncbi.py
0 → 100644
View file @
8537dc89
import
urllib
import
sys
import
re
import
os
import
fasta
API_EUTILS
=
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
API_EUTILS
+=
'api_key='
+
os
.
environ
[
'NCBI_KEY'
]
+
'&'
if
'NCBI_KEY'
in
os
.
environ
else
''
API_NUCCORE_ID
=
API_EUTILS
+
'db=nuccore&rettype=fasta&retmode=text'
+
'&id=%s'
API_NUCCORE_ID_FROM_TO
=
API_EUTILS
+
'db=nuccore&rettype=fasta&retmode=text'
+
'&id=%s'
+
'&from=%s&to=%s'
API_GENE_ID_XML
=
API_EUTILS
+
'db=gene&retmode=xml&rettype=docsum'
+
'&id=%s'
from
xml.dom
import
minidom
,
Node
# The two following functions should be refactored as one (used in split-from-imgt and get-CD)
def
get_gene_sequence
(
gene
,
other_gene_name
,
start
,
end
,
additional_length
):
'''
Return the gene sequences between positions start and end (included).
'''
if
additional_length
>
0
:
end
+=
additional_length
elif
additional_length
<
0
:
start
=
max
(
1
,
start
+
additional_length
)
fasta_string
=
urllib
.
urlopen
(
API_NUCCORE_ID_FROM_TO
%
(
gene
,
start
,
end
)).
read
()
return
re
.
sub
(
'(>\S*) '
,
r
'\1|'
+
other_gene_name
+
'|'
,
fasta_string
)
def
ncbi_and_write
(
ncbi
,
additional_header
,
outs
):
print
ncbi
,
additional_header
fasta
=
urllib
.
urlopen
(
API_NUCCORE_ID
%
ncbi
).
read
()
fasta_with_id
=
fasta
.
replace
(
'>'
,
'>'
+
additional_header
)
for
out
in
outs
:
out
.
write
(
fasta_with_id
)
def
get_updownstream_sequences
(
gene
,
other_gene_name
,
start
,
end
,
additional_length
):
#Only returns upstream or downstream raw sequences
if
additional_length
==
0
:
return
(
''
,
''
)
reversed
=
-
1
if
(
end
<
start
)
else
1
if
additional_length
>
0
:
start
=
end
+
1
*
reversed
end
=
end
+
additional_length
*
reversed
elif
additional_length
<
0
:
end
=
start
-
1
*
reversed
start
=
max
(
1
,
start
+
additional_length
*
reversed
)
if
start
>
end
:
tmp
=
start
start
=
end
end
=
tmp
updown_fasta
=
urllib
.
urlopen
(
API_NUCCORE_ID_FROM_TO
%
(
gene
,
start
,
end
)).
read
()
updown_raw
=
'
\n
'
.
join
(
updown_fasta
.
split
(
'
\n
'
)[
1
:]).
strip
()
if
reversed
==
-
1
:
updown_raw
=
fasta
.
revcomp
(
updown_raw
.
upper
())
if
additional_length
>
0
:
return
(
''
,
updown_raw
)
else
:
return
(
updown_raw
,
''
)
# Parse output from API_GENE_ID_XML to get genomic positions of a gene
def
xml_bang_one
(
parent
):
return
{
node
.
nodeName
:
node
.
firstChild
.
nodeValue
for
node
in
parent
.
childNodes
if
node
.
nodeType
==
Node
.
ELEMENT_NODE
}
def
get_last_LocationHistType
(
gene
):
'''
>>> get_last_LocationHistType(6969)
{u'AssemblyAccVer': u'GCF_000001405.38', u'ChrAccVer': u'NC_000007.14', u'AnnotationRelease': u'109', u'ChrStop': u'38253379', u'ChrStart': u'38253428'}
'''
sys
.
stderr
.
write
(
'%% eutils -> gene %s'
%
gene
+
'
\n
'
)
xml
=
minidom
.
parseString
(
urllib
.
urlopen
(
API_GENE_ID_XML
%
gene
).
read
())
locations
=
xml
.
getElementsByTagName
(
'LocationHistType'
)
if
locations
:
return
xml_bang_one
(
locations
[
0
])
else
:
raise
KeyError
,
gene
def
get_gene_positions
(
gene
):
'''
>>> get_gene_positions(6969)
(u'NC_000007.14', 38253428, 38253379)
>>> get_gene_positions('zoycooxz')
Traceback (most recent call last):
...
KeyError: 'zoycooxz'
'''
loc
=
get_last_LocationHistType
(
gene
)
chr
=
loc
[
'ChrAccVer'
]
start
,
stop
=
int
(
loc
[
'ChrStart'
])
+
1
,
int
(
loc
[
'ChrStop'
])
+
1
return
chr
,
start
,
stop
germline/split-from-imgt.py
View file @
8537dc89
...
...
@@ -8,6 +8,8 @@ import urllib
from
collections
import
defaultdict
,
OrderedDict
import
re
import
ncbi
IMGT_LICENSE
=
'''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
...
...
@@ -17,9 +19,8 @@ IMGT_LICENSE = '''
# Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''
print
(
IMGT_LICENSE
)
NCBI_API
=
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'
+
'&id=%s&from=%s&to=%s'
def
remove_allele
(
name
):
return
name
.
split
(
'*'
)[
0
]
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
...
...
@@ -74,7 +75,7 @@ def get_gene_coord(imgt_line):
>>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
>>> get_gene_coord(line)[0] == 'X15272'
True
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01'}
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01'
, 'species': 'Homo sapiens'
}
True
'''
elements
=
imgt_line
.
split
(
'|'
)
...
...
@@ -88,32 +89,59 @@ def get_gene_coord(imgt_line):
end
=
end
.
split
(
','
)[
0
]
return
elements
[
0
][
1
:],
{
'from'
:
int
(
start
),
'to'
:
int
(
end
),
'species'
:
elements
[
2
],
'imgt_name'
:
elements
[
1
],
'imgt_data'
:
'|'
.
join
(
elements
[
1
:
5
])}
def
get_gene_sequence
(
gene
,
other_gene_name
,
start
,
end
):
def
paste_updown_on_fasta
(
fasta
,
up
,
down
):
'''
Return the gene sequences between positions start and end (included).
Put upstream and/or downstream data on an existing FASTA sequences
>>> paste_updown_on_fasta('>seq
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT', 'CCCC', 'GGGG')
'>seq
\\
nCCCC
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT
\\
nGGGG
\\
n'
>>> paste_updown_on_fasta('>seq
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT
\\
n', '', 'GGGG')
'>seq
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT
\\
nGGGG
\\
n'
>>> paste_updown_on_fasta('>seq
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT', 'CCCC', '')
'>seq
\\
nCCCC
\\
nAAAAAAAAAAAAAAAAAAA
\\
nTTTTTTTTTTT
\\
n'
'''
fasta_string
=
urllib
.
urlopen
(
NCBI_API
%
(
gene
,
start
,
end
)).
read
()
return
re
.
sub
(
'(>\S*) '
,
r
'\1|'
+
other_gene_name
+
'|'
,
fasta_string
)
lines
=
fasta
.
split
(
'
\n
'
)
return
lines
[
0
]
+
'
\n
'
+
(
up
+
'
\n
'
if
up
else
''
)
+
'
\n
'
.
join
(
filter
(
None
,
lines
[
1
:]))
+
'
\n
'
\
+
(
down
+
'
\n
'
if
down
else
''
)
def
store_data_if_updownstream
(
fasta_header
,
path
,
data
,
genes
):
for
gene
in
gene_matches
(
fasta_header
,
genes
):
gene_name
,
gene_coord
=
get_gene_coord
(
fasta_header
)
if
gene_name
:
data
[
path
+
'/'
+
gene
][
gene_name
].
append
(
gene_coord
)
def
retrieve_genes
(
f
,
genes
,
tag
,
additional_length
):
def
retrieve_genes
(
f
,
genes
,
tag
,
additional_length
,
gene_list
):
for
gene
in
genes
:
for
coord
in
genes
[
gene
]:
start
=
coord
[
'from'
]
end
=
coord
[
'to'
]
if
additional_length
>
0
:
end
+=
additional_length
elif
additional_length
<
0
:
start
=
max
(
1
,
start
+
additional_length
)
gene_data
=
get_gene_sequence
(
gene
,
coord
[
'imgt_data'
]
+
tag
,
start
,
end
)
# try to extract from genome
gene_id
=
gene_list
.
get_gene_id_from_imgt_name
(
coord
[
'species'
],
coord
[
'imgt_name'
])
allele_additional_length
=
0
if
gene_id
:
try
:
(
target
,
start
,
end
)
=
ncbi
.
get_gene_positions
(
gene_id
)
print
(
coord
,
gene_id
,
target
,
start
,
end
)
except
KeyError
:
print
(
'! No positions for %s (%s: %s)'
%
(
gene_id
,
gene
,
str
(
genes
[
gene
])))
allele_additional_length
=
additional_length
gene_id
=
None
# extract from gene
gene_data
=
ncbi
.
get_gene_sequence
(
gene
,
coord
[
'imgt_data'
]
+
tag
,
coord
[
'from'
],
coord
[
'to'
],
allele_additional_length
)
if
gene_id
:
up_down
=
ncbi
.
get_updownstream_sequences
(
target
,
coord
[
'imgt_data'
]
+
tag
,
start
,
end
,
additional_length
)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data
=
paste_updown_on_fasta
(
gene_data
,
up_down
[
0
],
up_down
[
1
])
# post-process gene_data
if
coord
[
'imgt_data'
].
split
(
'|'
)[
-
1
]
==
FEATURE_J_REGION
:
gene_lines
=
gene_data
.
split
(
'
\n
'
)
gene_lines
[
1
]
=
gap_j
(
gene_lines
[
1
].
lower
())
...
...
@@ -161,7 +189,7 @@ def gap_j(seq):
pos
=
CUSTOM_118
[
custom_seq
]
if
pos
is
None
:
m
=
j118
.
search
(
seq
)
m
=
j118
.
search
(
seq
,
re
.
IGNORECASE
)
if
not
m
:
if
len
(
seq
)
>
PHE_TRP_WARN_SIZE
:
...
...
@@ -220,75 +248,122 @@ class OrderedDefaultListDict(OrderedDict):
self
[
key
]
=
value
=
[]
return
value
downstream_data
=
defaultdict
(
lambda
:
OrderedDefaultListDict
())
upstream_data
=
defaultdict
(
lambda
:
OrderedDefaultListDict
())
for
l
in
sys
.
stdin
:
# New sequence: compute 'current_files' and stores up/downstream_data[]
class
IMGTGENEDBGeneList
():
'''
Parse lines such as
'Homo sapiens;TRGJ2;F;Homo sapiens T cell receptor gamma joining 2;1;7;7p14;M12961;6969;'
>>> gl = IMGTGENEDBGeneList('IMGTGENEDB-GeneList')
>>> gl.get_gene_id_from_imgt_name('Homo sapiens', 'TRGJ2*01')
'6969'
'''
def
__init__
(
self
,
f
):
self
.
data
=
defaultdict
(
str
)
for
l
in
open
(
f
):
ll
=
l
.
split
(
';'
)
species
,
name
,
gene_id
=
ll
[
0
],
ll
[
1
],
ll
[
-
2
]
self
.
data
[
species
,
name
]
=
gene_id
if
">"
in
l
:
current_files
=
[]
current_special
=
None
def
get_gene_id_from_imgt_name
(
self
,
species
,
name
):
return
self
.
data
[
species
,
remove_allele
(
name
)]
species
=
l
.
split
(
'|'
)[
2
].
strip
()
feature
=
l
.
split
(
'|'
)[
4
].
strip
()
if
species
in
SPECIES
and
feature
in
FEATURES
:
seq
=
l
.
split
(
'|'
)[
1
]
path
=
SPECIES
[
species
]
if
feature
in
FEATURES_VDJ
:
system
=
seq
[:
4
]
else
:
system
=
seq
[:
seq
.
find
(
"*"
)]
if
not
system
in
CLASSES
:
print
"! Unknown class: "
,
system
system
=
system
.
replace
(
"IGH"
,
"IGHC="
)
def
split_IMGTGENEDBReferenceSequences
(
f
,
gene_list
):
keys
=
[
path
+
system
]
downstream_data
=
defaultdict
(
lambda
:
OrderedDefaultListDict
())
upstream_data
=
defaultdict
(
lambda
:
OrderedDefaultListDict
())
check_directory_exists
(
path
)
for
l
in
open
(
ReferenceSequences
):
if
system
.
startswith
(
'IG'
)
or
system
.
startswith
(
'TR'
):
# New sequence: compute 'current_files' and stores up/downstream_data[]
if
">"
in
l
:
current_files
=
[]
current_special
=
None
species
=
l
.
split
(
'|'
)[
2
].
strip
()
feature
=
l
.
split
(
'|'
)[
4
].
strip
()
if
species
in
SPECIES
and
feature
in
FEATURES
:
seq
=
l
.
split
(
'|'
)[
1
]
path
=
SPECIES
[
species
]
if
feature
in
FEATURES_VDJ
:
store_data_if_updownstream
(
l
,
path
,
downstream_data
,
DOWNSTREAM_REGIONS
)
store_data_if_updownstream
(
l
,
path
,
upstream_data
,
UPSTREAM_REGIONS
)
system
=
seq
[:
4
]
else
:
system
=
seq
[:
seq
.
find
(
"*"
)]
if
not
system
in
CLASSES
:
print
"! Unknown class: "
,
system
system
=
system
.
replace
(
"IGH"
,
"IGHC="
)
keys
=
[
path
+
system
]
check_directory_exists
(
path
)
if
system
.
startswith
(
'IG'
)
or
system
.
startswith
(
'TR'
):
if
feature
in
FEATURES_VDJ
:
store_data_if_updownstream
(
l
,
path
,
downstream_data
,
DOWNSTREAM_REGIONS
)
store_data_if_updownstream
(
l
,
path
,
upstream_data
,
UPSTREAM_REGIONS
)
systems
=
get_split_files
(
seq
,
SPLIT_SEQUENCES
)
if
systems
:
keys
=
[
path
+
s
for
s
in
systems
]
for
key
in
keys
:
current_files
.
append
(
open_files
[
key
])
systems
=
get_split_files
(
seq
,
SPLIT_SEQUENCES
)
if
systems
:
keys
=
[
path
+
s
for
s
in
systems
]
for
key
in
keys
:
current_files
.
append
(
open_files
[
key
])
if
seq
in
SPECIAL_SEQUENCES
:
name
=
'%s.fa'
%
seq
.
replace
(
'*'
,
'-'
)
current_special
=
verbose_open_w
(
name
)
if
seq
in
SPECIAL_SEQUENCES
:
name
=
'%s.fa'
%
seq
.
replace
(
'*'
,
'-'
)
current_special
=
verbose_open_w
(
name
)
# Possibly gap J_REGION
# Possibly gap J_REGION
if
'>'
not
in
l
and
current_files
and
feature
==
FEATURE_J_REGION
:
l
=
gap_j
(
l
)
if
'>'
not
in
l
and
current_files
and
feature
==
FEATURE_J_REGION
:
l
=
gap_j
(
l
)
# Dump 'l' to the concerned files
# Dump 'l' to the concerned files
for
current_file
in
current_files
:
current_file
.
write
(
l
)
for
current_file
in
current_files
:
current_file
.
write
(
l
)
if
current_special
:
current_special
.
write
(
l
)
if
current_special
:
current_special
.
write
(
l
)
# End, loop to next 'l'
# End, loop to next 'l'
# Dump up/downstream data
# Dump up/downstream data
for
system
in
upstream_data
:
f
=
verbose_open_w
(
system
+
TAG_UPSTREAM
+
'.fa'
)
retrieve_genes
(
f
,
upstream_data
[
system
],
TAG_UPSTREAM
,
-
LENGTH_UPSTREAM
,
gene_list
)
for
system
in
up
stream_data
:
f
=
verbose_open_w
(
system
+
TAG_UP
STREAM
+
'.fa'
)
retrieve_genes
(
f
,
upstream_data
[
system
],
TAG_UPSTREAM
,
-
LENGTH_UPSTREAM
)
for
system
in
down
stream_data
:
f
=
verbose_open_w
(
system
+
TAG_DOWN
STREAM
+
'.fa'
)
retrieve_genes
(
f
,
downstream_data
[
system
],
TAG_DOWNSTREAM
,
LENGTH_DOWNSTREAM
,
gene_list
)
for
system
in
downstream_data
:
f
=
verbose_open_w
(
system
+
TAG_DOWNSTREAM
+
'.fa'
)
retrieve_genes
(
f
,
downstream_data
[
system
],
TAG_DOWNSTREAM
,
LENGTH_DOWNSTREAM
)
if
__name__
==
'__main__'
:
if
sys
.
argv
[
1
]
==
'--test'
:
import
doctest
doctest
.
testmod
()
else
:
print
(
IMGT_LICENSE
)
ReferenceSequences
=
sys
.
argv
[
1
]
GeneList
=
sys
.
argv
[
2
]
gl
=
IMGTGENEDBGeneList
(
GeneList
)
split_IMGTGENEDBReferenceSequences
(
ReferenceSequences
,
gl
)
germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get
0 → 100644
View file @
8537dc89
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
germline/tests/should-get-tests/10-germlines-upstream.should-get
View file @
8537dc89
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#' )
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#'
| tr -d '\n'
)
$ Correct full header, with TRDD2*01 identifier between pipes
1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
f
1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
$ Correct sequence, with upstream
1: AAGAGGGTTTTTATACTGATGTGTTTCATTGTGCCTTCCTAC
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment