Commit 6587af23 authored by flothoni

Merge branch 'dev' of gitlab.inria.fr:vidjil/vidjil into...

Merge branch 'dev' of gitlab.inria.fr:vidjil/vidjil into feature-c/4181-des-clones-segedited-avec-de-nouvelles-germline-bloque-l-interface
parents cd4153fa 3b90d5ad
Pipeline #122915 passed with stages
in 10 minutes and 4 seconds
>IGHV2-26*01 IGHJ4*02 [IGH]
TTCTCCACAGGGGTCTTGTCCCAGGTCACCTTGAAGGAGTCTGGTCCTGTACTGGTTAAACCCACAGAGACCCTCACGCTGACGTGCACCGTCTCTGGGTTCTCACTCAACAGTGCTAGAATGGGTGTGACCTGGATCCGTCAGTCCCCAGGGAAGGCCCTGGAATGGCTTGCACACATTTCCTCGAATGACGAAAAATTGTATAGTACATCTCTGAAGACCAGGCTCACCATCTCCAAGGACACCTCCAGAAGCCAGGTGGTCCTCACCGTGACCAACATGGACCCTGTGGACACAGCCACATATTACTGTGCACGGACAC
GGGGAGTATATAGTTATGATTCTC
TTGAGTACTGGGGCCAGGGAGCCCTGATCACCGTCTCCGCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTC
......@@ -9,11 +9,9 @@ build_doc:
- site/
expire_in: 1 month
only:
refs:
- merge_requests
changes:
- doc/**/*
- mkdocs.yml
- /^[^\/]*doc[^\/]*\//
# branch name should contain doc before the first /
# eg.: doc/blabla or feature-adoc/blabla
tags:
- doc
......@@ -23,6 +21,4 @@ deploy_doc:
- scp -r site/ $VIDJIL_WWW:doc/
when: manual
only:
changes:
- doc/**/*
- mkdocs.yml
- /^[^\/]*doc[^\/]*\//
......@@ -139,8 +139,38 @@ def store_data_if_updownstream(fasta_header, path, data, genes):
data[path+'/'+gene].append((gene_name, gene_coord))
paths.append(path+'/'+gene)
return paths
def retrieve_genes(f, genes, tag, additional_length, gene_list):
def ignore_strand(start, end):
    '''
    Return the two coordinates as an ordered `(low, high)` pair,
    whatever the order in which they were given.
    '''
    return (start, end) if start < end else (end, start)


def compute_updownstream_length(genes, default_length):
    '''
    Return the largest up/downstream length (in absolute value, at most
    `abs(default_length)`) that never runs into the previous/next gene.

    genes: iterable of `(name, info)` pairs. Only the entries whose `info`
           has a 'target_start' key are taken into account; their
           'target_start' and 'target_end' values are used as coordinates.
    default_length: the requested length. Its sign is kept in the result
           (a negative value is passed for upstream data).

    Only consecutive genes, once sorted by position, are compared.
    When two consecutive genes are adjacent or overlap, the result is 0.
    '''
    positions = sorted({
        ignore_strand(info[1]['target_start'], info[1]['target_end'])
        for info in genes
        if 'target_start' in info[1]
    })

    sign = -1 if default_length < 0 else 1
    length = abs(default_length)
    for (_, last), (first_next, _) in zip(positions, positions[1:]):
        # Number of positions strictly between two consecutive genes.
        # Overlapping genes give a negative value: clamp it to 0, otherwise
        # multiplying by `sign` would invert the direction of the result.
        gap = max(0, first_next - last - 1)
        length = min(length, gap)

    # Should we divide by 2 the length so that we don't have overlaps
    # between up and downstream?
    return sign * length
def retrieve_genes(f_name, genes, tag, additional_length, gene_list):
f = verbose_open_w(f_name)
for info in genes:
(gene, coord) = info
# try to extract from genome
......@@ -151,17 +181,27 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if gene_id:
try:
(target, start, end) = ncbi.get_gene_positions(gene_id)
coord['target'] = target
coord['target_start'] = start
coord['target_end'] = end
coord['gene_id'] = gene_id
except KeyError:
print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord)))
allele_additional_length = additional_length
gene_id = None
min_updownstream = compute_updownstream_length(genes, additional_length)
print(' %s, ' % f_name + 'genes: %d, ' % len(genes) + 'up/downstream: %dbp' % min_updownstream)
# gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
# gene_id: is the NCBI ID of the VDJ gene
# target: is the NCBI ID of the chromosome
for info in genes:
(gene, coord) = info
gene_id = coord['gene_id'] if 'gene_id' in coord else None
if GENES_SEQ_FROM_NCBI:
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length)
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], min_updownstream)
else:
# IMGT
gene_data = coord['seq']
......@@ -169,9 +209,10 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if gene_id:
# Check consistency for *01 allele
if coord['imgt_name'].endswith('*01'):
check_imgt_ncbi_consistency(coord, gene_data, target, start, end)
up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length)
check_imgt_ncbi_consistency(coord, gene_data, coord['target'], coord['target_start'],
coord['target_end'])
up_down = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'],
coord['target_end'], min_updownstream)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
......@@ -237,8 +278,8 @@ def gap_j(seq):
return (MAX_GAP_J - pos) * '.' + seq
LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
LENGTH_UPSTREAM=200
LENGTH_DOWNSTREAM=200
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]
......@@ -399,12 +440,12 @@ def split_IMGTGENEDBReferenceSequences(sources, gene_list):
# Dump up/downstream data
for system in upstream_data:
f = verbose_open_w(system + TAG_UPSTREAM + '.fa')
retrieve_genes(f, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list)
f_name = system + TAG_UPSTREAM + '.fa'
retrieve_genes(f_name, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list)
for system in downstream_data:
f = verbose_open_w(system + TAG_DOWNSTREAM + '.fa')
retrieve_genes(f, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list)
f_name = system + TAG_DOWNSTREAM + '.fa'
retrieve_genes(f_name, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list)
......
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
# The awk part prints the IGHD2-2*02 sequence
!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRBJ+down.fa | tr -d '\n' | tr '>' '\n')
$ TRBJ1-1 has the correct 71bp downstream
1: GTAAGACATTTTTCAGGTTCTTTTGCAGATCCGTCACAGGGAAAAGTGGGTCCACAGTGTCCCTTTTAGAG
$ The 16 sequences have 71bp downstream, and no more
16: [AGCT]{71}
0: [AGCT]{72}
......@@ -4,12 +4,16 @@ COMPLETE=0
INCREMENTAL=0
DIR=
DATABASE=
BACKUP_DAY=
YESTERDAY=
usage() {
echo "$0: [-c] [-i] dbname [path]
echo "$0: [-c|-i|-d|-y] dbname [path]
-c: Backup everything
-i: Incremental backup, since the first of the month
-d: Backup of the current day
-y: Backup since yesterday
path: Where to save the file" >&2
exit 1
}
......@@ -20,6 +24,7 @@ fi
if [ $# -ge 1 -a "$1" = "-i" ]; then
INCREMENTAL=1
BACKUP_DAY=$(date --date="$(date +%Y-%m-01)" +"%Y-%m-%d")
shift
fi
......@@ -28,6 +33,18 @@ if [ $# -ge 1 -a "$1" = "-c" ]; then
shift
fi
if [ $# -ge 1 -a "$1" = "-d" ]; then
BACKUP_DAY=$(date +%Y-%m-%d)
INCREMENTAL=1
shift
fi
if [ $# -ge 1 -a "$1" = "-y" ]; then
BACKUP_DAY=$(date --date=yesterday +%Y-%m-%d)
INCREMENTAL=1
shift
fi
if [ $# -eq 0 ]; then
usage
fi
......@@ -40,7 +57,6 @@ if [ $# -ge 1 ]; then
fi
now=$(date +"%Y-%m-%d_%H:%M:%S")
FIRST_OF_THE_MONTH=$(date --date="$(date +%Y-%m-01)" +"%Y-%m-%d")
vidjil_path=web2py/applications/vidjil
db_backup_file=/tmp/db-backup-$now.csv
......@@ -65,10 +81,10 @@ if [ $COMPLETE -eq 1 ]; then
zip -r $filename_raw web2py/applications/vidjil/databases/ "$DIR_SEQUENCES" "$DIR_RESULTS" $db_backup_file $sql_backup_file
else
if [ $INCREMENTAL -eq 1 ]; then
filename_raw="${DIR}backup_incremental_${FIRST_OF_THE_MONTH}__${now}.tar"
filename_raw="${DIR}backup_incremental_${BACKUP_DAY}__${now}.tar"
filename=$filename_raw.gz
tar cvf $filename_raw --force-local web2py/applications/vidjil/databases/ $db_backup_file $sql_backup_file
tar rvf $filename_raw --force-local --after-date "$FIRST_OF_THE_MONTH" "$DIR_RESULTS" 2>&1 | grep -v "file is unchanged"
tar rvf $filename_raw --force-local --after-date "$BACKUP_DAY" "$DIR_RESULTS" 2>&1 | grep -v "file is unchanged"
gzip $filename_raw
else
filename_raw="${DIR}backup_essentials_"$now
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment