Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
vidjil
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
1,711
Issues
1,711
List
Boards
Labels
Service Desk
Milestones
Merge Requests
86
Merge Requests
86
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
vidjil
vidjil
Commits
6b8cd835
Commit
6b8cd835
authored
Feb 21, 2020
by
Thonier Florian
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'feature-c/4104-n-afficher-dans-le-graph-que-les-clones-du-sample-courant'
# Conflicts: # doc/.gitlab-ci.yml
parents
ca4c9f04
3b90d5ad
Pipeline
#122453
failed with stages
in 5 minutes and 19 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
70 additions
and
21 deletions
+70
-21
doc/.gitlab-ci.yml
doc/.gitlab-ci.yml
+4
-6
germline/split-from-imgt.py
germline/split-from-imgt.py
+55
-14
germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get
...ts/should-get-tests/10-germlines-IGHD-upstream.should-get
+2
-1
germline/tests/should-get-tests/10-germlines-downstream.should-get
...tests/should-get-tests/10-germlines-downstream.should-get
+9
-0
No files found.
doc/.gitlab-ci.yml
View file @
6b8cd835
...
...
@@ -9,9 +9,9 @@ build_doc:
-
site/
expire_in
:
1 month
only
:
changes
:
-
doc/**/*
-
mkdocs.yml
-
/^[^\/]*doc[^\/]*\//
# branch name should contain doc before the first /
# eg.: doc/blabla or feature-adoc/blabla
tags
:
-
doc
...
...
@@ -21,6 +21,4 @@ deploy_doc:
-
scp -r site/ $VIDJIL_WWW:doc/
when
:
manual
only
:
changes
:
-
doc/**/*
-
mkdocs.yml
-
/^[^\/]*doc[^\/]*\//
germline/split-from-imgt.py
View file @
6b8cd835
...
...
@@ -139,8 +139,38 @@ def store_data_if_updownstream(fasta_header, path, data, genes):
data
[
path
+
'/'
+
gene
].
append
((
gene_name
,
gene_coord
))
paths
.
append
(
path
+
'/'
+
gene
)
return
paths
def
retrieve_genes
(
f
,
genes
,
tag
,
additional_length
,
gene_list
):
def
ignore_strand
(
start
,
end
):
if
start
<
end
:
return
(
start
,
end
)
return
(
end
,
start
)
def
compute_updownstream_length
(
genes
,
default_length
):
'''
Returns the maximal `min_length` size (but at most `default_length`)
such that this size never overlaps the previous/next gene.
'''
positions
=
[
ignore_strand
(
info
[
1
][
'target_start'
],
info
[
1
][
'target_end'
])
for
info
in
genes
if
'target_start'
in
info
[
1
]]
positions
=
list
(
set
(
positions
))
positions
.
sort
()
i
=
0
min_length
=
default_length
sign
=
-
1
if
min_length
<
0
else
1
while
i
<
len
(
positions
)
-
1
:
last
=
positions
[
i
][
1
]
first_next
=
positions
[
i
+
1
][
0
]
diff
=
first_next
-
last
-
1
if
diff
<
abs
(
min_length
):
min_length
=
diff
*
sign
i
+=
1
# Should we divide by 2 the length so that we don't have overlaps
# between up and downstream?
return
min_length
def
retrieve_genes
(
f_name
,
genes
,
tag
,
additional_length
,
gene_list
):
f
=
verbose_open_w
(
f_name
)
for
info
in
genes
:
(
gene
,
coord
)
=
info
# try to extract from genome
...
...
@@ -151,17 +181,27 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if
gene_id
:
try
:
(
target
,
start
,
end
)
=
ncbi
.
get_gene_positions
(
gene_id
)
coord
[
'target'
]
=
target
coord
[
'target_start'
]
=
start
coord
[
'target_end'
]
=
end
coord
[
'gene_id'
]
=
gene_id
except
KeyError
:
print
(
'! No positions for %s (%s: %s)'
%
(
gene_id
,
gene
,
str
(
coord
)))
allele_additional_length
=
additional_length
gene_id
=
None
min_updownstream
=
compute_updownstream_length
(
genes
,
additional_length
)
print
(
' %s, '
%
f_name
+
'genes: %d, '
%
len
(
genes
)
+
'up/downstream: %dbp'
%
min_updownstream
)
# gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
# gene_id: is the NCBI ID of the VDJ gene
# target: is the NCBI ID of the chromosome
for
info
in
genes
:
(
gene
,
coord
)
=
info
gene_id
=
coord
[
'gene_id'
]
if
'gene_id'
in
coord
else
None
if
GENES_SEQ_FROM_NCBI
:
gene_data
=
ncbi
.
get_gene_sequence
(
gene
,
coord
[
'imgt_data'
]
+
tag
,
coord
[
'from'
],
coord
[
'to'
],
allele_additional_length
)
gene_data
=
ncbi
.
get_gene_sequence
(
gene
,
coord
[
'imgt_data'
]
+
tag
,
coord
[
'from'
],
coord
[
'to'
],
min_updownstream
)
else
:
# IMGT
gene_data
=
coord
[
'seq'
]
...
...
@@ -169,9 +209,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if
gene_id
:
# Check consistency for *01 allele
if
coord
[
'imgt_name'
].
endswith
(
'*01'
):
check_imgt_ncbi_consistency
(
coord
,
gene_data
,
target
,
start
,
end
)
up_down
=
ncbi
.
get_updownstream_sequences
(
target
,
start
,
end
,
additional_length
)
check_imgt_ncbi_consistency
(
coord
,
gene_data
,
coord
[
'target'
],
coord
[
'target_start'
],
coord
[
'target_end'
])
up_down
=
ncbi
.
get_updownstream_sequences
(
coord
[
'target'
],
coord
[
'target_start'
],
coord
[
'target_end'
],
min_updownstream
)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data
=
paste_updown_on_fasta
(
gene_data
,
up_down
[
0
],
up_down
[
1
])
...
...
@@ -237,8 +278,8 @@ def gap_j(seq):
return
(
MAX_GAP_J
-
pos
)
*
'.'
+
seq
LENGTH_UPSTREAM
=
4
0
LENGTH_DOWNSTREAM
=
4
0
LENGTH_UPSTREAM
=
20
0
LENGTH_DOWNSTREAM
=
20
0
# Create isolated files for some sequences
SPECIAL_SEQUENCES
=
[
]
...
...
@@ -399,12 +440,12 @@ def split_IMGTGENEDBReferenceSequences(sources, gene_list):
# Dump up/downstream data
for
system
in
upstream_data
:
f
=
verbose_open_w
(
system
+
TAG_UPSTREAM
+
'.fa'
)
retrieve_genes
(
f
,
upstream_data
[
system
],
TAG_UPSTREAM
,
-
LENGTH_UPSTREAM
,
gene_list
)
f
_name
=
system
+
TAG_UPSTREAM
+
'.fa'
retrieve_genes
(
f
_name
,
upstream_data
[
system
],
TAG_UPSTREAM
,
-
LENGTH_UPSTREAM
,
gene_list
)
for
system
in
downstream_data
:
f
=
verbose_open_w
(
system
+
TAG_DOWNSTREAM
+
'.fa'
)
retrieve_genes
(
f
,
downstream_data
[
system
],
TAG_DOWNSTREAM
,
LENGTH_DOWNSTREAM
,
gene_list
)
f
_name
=
system
+
TAG_DOWNSTREAM
+
'.fa'
retrieve_genes
(
f
_name
,
downstream_data
[
system
],
TAG_DOWNSTREAM
,
LENGTH_DOWNSTREAM
,
gene_list
)
...
...
germline/tests/should-get-tests/10-germlines-IGHD-upstream.should-get
View file @
6b8cd835
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
# The awk part prints the IGHD2-2*02 sequence
!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
germline/tests/should-get-tests/10-germlines-downstream.should-get
0 → 100644
View file @
6b8cd835
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRBJ+down.fa | tr -d '\n' | tr '>' '\n')
$ TRBJ1-1 has the correct 71bp downstream
1: GTAAGACATTTTTCAGGTTCTTTTGCAGATCCGTCACAGGGAAAAGTGGGTCCACAGTGTCCCTTTTAGAG
$ The 16 sequences have 71bp downstream, and no more
16: [AGCT]{71}
0: [AGCT]{72}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment