Commit e6b51cf0 authored by Marc Duez's avatar Marc Duez
parents c29cb867 9a578a93
# 'Undefined name' for web2py globals 'db', 'auth', 'request'
pyflakes:
disable:
- F821
using namespace std;
#include <map>
#include <string>
#include <fstream>
......
This diff is collapsed.
......@@ -21,10 +21,8 @@
segment to a given strand */
#define DETECT_THRESHOLD 5 /* If the number of both V and J affectations
is above this threshold, then the sequence
will be labeled as 'detected', and, if it
not segmented, the remaining germlines will
not be tested */
is above this threshold, then the sequence,
if it not segmented, will be marked as AMBIGUOUS */
#define JSON_REMEMBER_BEST 4 /* The number of V/D/J predictions to keep */
......@@ -57,7 +55,6 @@ const char* const segmented_mesg[] = { "?",
class Segmenter {
protected:
string label;
string sequence;
int Vend, Jstart;
int Dstart, Dend;
......@@ -71,6 +68,7 @@ protected:
public:
Germline *segmented_germline;
string label;
string code;
string code_short;
string code_light;
......@@ -168,13 +166,11 @@ ostream &operator<<(ostream &out, const Segmenter &s);
class KmerSegmenter : public Segmenter
{
private:
int detected;
KmerAffectAnalyser *kaa;
protected:
string affects;
public:
bool isDetected() const;
int score;
int pvalue_left;
int pvalue_right;
......@@ -185,7 +181,7 @@ class KmerSegmenter : public Segmenter
* @param seq: An object read from a FASTA/FASTQ file
* @param germline: the germline
*/
KmerSegmenter(Sequence seq, Germline *germline, int multiplier=1);
KmerSegmenter(Sequence seq, Germline *germline, double threshold = THRESHOLD_NB_EXPECTED, int multiplier=1);
KmerSegmenter(const KmerSegmenter &seg);
......@@ -196,10 +192,13 @@ class KmerSegmenter : public Segmenter
*/
KmerAffectAnalyser *getKmerAffectAnalyser() const;
string getInfoLineWithAffects() const;
void toJsonList(JsonList *seg);
private:
void computeSegmentation(int strand, KmerAffect left, KmerAffect right, int multiplier);
void computeSegmentation(int strand, KmerAffect left, KmerAffect right,
int delta_min, int delta_max,
double threshold, int multiplier);
};
......
......@@ -83,7 +83,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
// Last line of detailed affects output
if (out_affects) {
*out_affects << "#" << seg->getInfoLine() << endl;
*out_affects << "#>" << seg->label << " " << seg->getInfoLine() << endl << endl;
}
// Progress bar
......
!LAUNCH: ../../vidjil -r 5 -K -g ../../germline ../../data/multi-short.fa ; head -n 15 out/multi-short.affects
!LAUNCH: ../../vidjil -r 5 -K -g ../../germline ../../data/multi-short.fa ; head -n 17 out/multi-short.affects
# Testing .affects output (-K)
$ First sequence (TRA), display sequence
1:TCTTCATTCCTTAGTCGCTCTGATAGTTATGGTTACCTCCTTCTACAGGAGCTCCAGATGAAAGACTCTGCCTCTTACTTCTGCGCTGTGAGAGAGTATGAAAGTATTACCTCCCAGTTGCAATTTGGCAAAGGAACCAGAGTTTCCACTTCTCC
$ First sequence (TRA), count of k-mers
1:.TRA .* 129
1:129 .* TRA
$ First sequence (TRA), segmentation
1:>TRA--V1-1.01--J1.01 .* seed TRA SEG_+
......
!LAUNCH: $LAUNCHER ../../vidjil -k 9 -G ../../germline/IGH -K -c clones ../../data/revcomp.fa ; grep '#IGH' out/revcomp.affects | sed 's/[^X]//g' | awk '{ print length }'
!LAUNCH: $LAUNCHER ../../vidjil -k 9 -G ../../germline/IGH -K -c clones ../../data/revcomp.fa ; grep 'X.X.X' out/revcomp.affects | sed 's/[^X]//g' | awk '{ print length }'
$ Segments both reads, normal and reverse
1:junction detected in 2 reads
......
>__TRG_UNSEG
>___UNSEG
GGTAGTAGCAAATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGCTTCCATGTGAACAGCAGTTGAACATGGGTCAGTCGGTCCTGAGAGA
>seq-317-1__TRG_UNSEG
>seq-317-1__UNSEG
ATGAAGTTTTTTTGATGGCTTGAGATGGCTCACAAATTTTGATTTTTTTTTCTTCCTTGTGCTCCCTTTTTTTCTCCTTGCTTTTCCAGTTAACATCTATATTCACATGTAATCTTGTTTTCTCTTCACATTCACTGAGTTGTTCAGGCT
>seq-317-2__TRD_UNSEG
>seq-317-2__UNSEG
GAAGGAAGAGATTTGAAATAAAGCTTTGGTTTCTGAGGAATGTTGCCGTTTGAGAGATTCAAAAAGAAAGAAGATCCAAATTTCGGGACTGGTGTTTAAATTGTAGTGACAGATTTTGGGGGCCATTAAAGGTATGGGAGTGAAA
>seq-317-3__TRD+_UNSEG
>seq-317-3__UNSEG
ACTAACAGTAACTGCCATTTTTTGTCTGTGATAACAGAGTGATTTGTAAAACAGTGGTTGTTTTTTCATTGTGTTTTCTTCGTGGATTGTTTTTTCTGCGGGTCATATTCATACCTTCTGATGAAGTTGTACAACACCAGCAACATT
>seq-317-4__IGK+_UNSEG
>seq-317-4__UNSEG
AGGTATGGTTGCGCTCAGTATAACAAGTGCTGAACAGAAAGCCAGGAAGGGAGGCGAACACAGCCACTGACCATGTAGCCAAACTGGTGATGACCCCATAAGTCAAGGTCCTTGCCCTCAAGGAAAACACCGCGTGCACAAT
>seq-317-5__IGL_UNSEG
>seq-317-5__UNSEG
GTGCAAGGATCAGAAGATAACGAAACGTCCACTTCCTGAAGGGTGGGAAAAAAAGAAAAAGAAAAAGGAGCCACCCACGCTGGAGATGTCCGTTTTAGATCTTCTTATTTTCTTCCCTTTCGCTTCGGTTTTTCTTCTCGAGGCTCA
>seq-317-6__TRD_UNSEG
>seq-317-6__UNSEG
GGTGTGACCGGGGCTTTGGTGGACCCTATTGTGTTCCTGTTGTTCCTCTGCCCTCGATTCTTAAAGACGATTTCAATGGGAATTTACATCCTGACCTTTGGCCTGAAGTGTATGGTGCAGAGA
>seq-317-7__TRD+_UNSEG
>seq-317-7__UNSEG
GTGGTGGTTCTGGTCCTGTTCAAATACAAGCGGCTCAGGTCCATGACTGATGTGTACCTGCTCAACCTTGCCATCTCGGATCTGCTCTTCGTGTTTTCCCTCCCTTTTTGGGGCTACTATGCAGCAGACCAGTGGGTTTTTGGGCTAGGTC
>seq-317-8__TRD+_UNSEG
>seq-317-8__UNSEG
GTTGCTTAAGATCAGTTGCTTTTATACTCAGAATGGAAATACCTGATCTTGGCTAGCTTTGTTTGTTA
>seq-317-9__IGK+_UNSEG
>seq-317-9__UNSEG
GGGCAGCAGCCCAGTCTTCCTACTGTCTGATTTAATTACAGCGGTTCCTGTGGGAGTGGGGGCTGTTATTCTCTTAAACATTTGCAGCTTGAAACAGTTGAGGAAGCAGCTTTAAAAAAAAAAAATCTCCCTACCCCCAACAA
>seq-317-10__IGH_UNSEG
>seq-317-10__UNSEG
GGGCAGAGAGTGCTGGGCTGAGTCGTATTGCTTTGCTAGACTTCCATAGGAATGGGCTTCTTGCGGCGAGAAAAGGGTGCCCCACCCTCTACTTGGACTACACTACTCTGAAACCTTGTGGCAGAGGCAGAGAAAAATCTGCTTTA
>seq-317-11__TRD+_UNSEG
>seq-317-11__UNSEG
AGCATACTCATCAGGCCAGTATTTGACACATTTACTCTTTCCTCTCTCCGCTTCTTTCATTGTCATGACAATCAGTCTGGAGTTTTGGAAAACCATCCGCCAAAAGTCGTTCACTGTGTTTTGTAGGCAGCCTTGTGTGGCAATGTAAC
>seq-317-12__TRD_UNSEG
>seq-317-12__UNSEG
AGGGAAACCTCAAGTGCTCGCAGAAGCATCGGCAAGTCAGGCGTCCTGTTTCTCGGATGGATACCCATTACCATCTTGGACCTGGAAGAAGTGTTCAGACAAGTCT
>seq-317-13__TRG_UNSEG
>seq-317-13__UNSEG
TGGATTCTGGTTTACAGCAGCTTTACAGTGATAGTTAAATTAACTGGGGCTAGGGGAAGAGCAAGCAAAAAGGGAAGAAGGACTCCTAGGCCCTTTCTAGTAAATCCTTCAGCAACAAGGCTGGCTTGGTGCCCTCCAAGCATCTAATG
>seq-317-14__TRG_UNSEG
>seq-317-14__UNSEG
ACCCTTTACCCTGATGATCTGTATTATATTTTAATGTATATGTGAATATATTGAAAATAATTTGTTTTTTCCTGGTTTTTGTTTGGTTTTCGTTTTGCTTTTAGCCTCTACATGCTAGGATCACAGGAAGACTTTGTAAGGACA
>seq-317-15__IGK_UNSEG
>seq-317-15__UNSEG
CATTTAATCTGATGTGGCATTTTCGTCATCTGAAGCATGAGTGACAAGTTGGGAATGATGTGGTGATTTAGAATGCAGTATTGGCCAAGTCCAAGTTGTCAACTTAAGCGTCTGTTTACCAAAGACCGGGAACAGGGGCCCAAACA
>seq-317-16__TRD+_UNSEG
>seq-317-16__UNSEG
TTTAGTTGTAGATTTCAATGGGATACGATAGGACAGAAAAGATTTTTTAAAAAGCAGAAAGAGTGTTTCATGGTGAAAGTACTGGGGGAGGGTGGACAAAGCATGCACACATGCCAATTTGAAAATCAAGTGTGACTTACCTCACGT
>seq-317-17__IGK+_UNSEG
>seq-317-17__UNSEG
CGGGATAGAGGGCAGTGGCACTATGAAAGTCAGCCATCTGGTGAGGCAGCAGCTGTCCGATGGCGTGGAACACAAGGCCCCCCACACGGAACATGTGCAGCCGTTCTCCCCGCTGAATGATGCTAGCGATTTGCTTCACCTCGTCC
>seq-317-18__IGK+_UNSEG
>seq-317-18__UNSEG
TGTCACCATTCACCTTGGACAGTGGGGCAGATTGGTTCCAATTGGGTTTCTCATCCTTGTCCACCGAAGATCCCTTTGTCACTGCAGCTTCTCTAGTGTCAGCCTCACTGCTGTCCTCCGTGAGGTGACCTTCAAAG
>seq-317-19__IGK+_UNSEG
>seq-317-19__UNSEG
CACCACTTTTAGTACCAACACTCTTGGGTGATTTCATGGACCCTAAAGCAGACCTGACACTGATCCAGATTTGCAGTCCATTTTTAAGGACACCTGTCTTTATTTCCTCAAAGTCAAGCAGCTTTCTCTGGAAAATGAATGCTAATTAGT
>seq-317-20__IGK+_UNSEG
>seq-317-20__UNSEG
AGACAGGATCAGGGACAGTCACATTAAAACACCATAGCACCATTTTATTTAGACTATTTCAATTCATAAAAATGCTTCCTAGTCCATAAAACACACTTCAGTAACAAAAGGAAAGAGATACTGGTTGAGGCACGTCAGGGATATCA
>seq-317-21__IGK+_UNSEG
>seq-317-21__UNSEG
GACCAGGGAGCTTGGTGAGCAGCCGGGGACTCTGGGAAGGGCTGAGAGCCAGCACAGCTGAGTGCTGTTGCTGTTGTTGCTGCTGCTGCTGCTGTTGTTGCTGCTGCTTGTTCCGATATTCTGCCATGAGATTAGTGTGCTCCTTC
>seq-317-22__IGK+_UNSEG
>seq-317-22__UNSEG
GTTTTATATCGTTGTAAAATAATTTTTCTAATTTTTTATAGGCTATGGATGAAATGAATGGCAAAGAAATAGAAGGGGAAGAAATTGAAATAGTCTTAGCCAAGCCACCAGACAAGAAAAGGAAAGAGCGCCAAGCTGCTAGACAGGCCTC
>seq-317-23__TRD_UNSEG
>seq-317-23__UNSEG
TTTTCAATGATCTAGATCAATACTTTGTTTCTCACCCATTTCATAATCAGTTTCTGCTACTCTCCTCATTTTGGGGGGTGTTGTGATCCCTGATTCCATTTCTTGCCTTTTAGGAGCCTTTGTGTCTGATATCAAGTGATCAAAA
>seq-317-24__TRG_UNSEG
>seq-317-24__UNSEG
ACTCCAGTCTGGGTGAAAGAGTGAGACCTTGTCTCAAAAAAGGAAGTGAAAGGTAATTAAAAAAGAACTTACGAAGGAAGGTCTTTGGCAGCTCTCAAGCCCCAGCCTTTCTTTTCTGTGAGTATGACTTCCACATCTGCATGCTGTTT
>seq-317-25__IGK_UNSEG
>seq-317-25__UNSEG
CTCTTTATTTGTCCCCTTGCCTCCCTTTCCAATGGACTATTTTAGAAGAAATGGAGCTGTCACCCACATCAAGATTCAGAACACTGGTGATTACTATGACCTGTATGGAGGGGAGAAATTTGCCACTTTGGCTGAGTTGGTCCAG
>seq-317-26__IGK+_UNSEG
>seq-317-26__UNSEG
GATGTGAAAGAAAGGGCGAAGGGTTTTTTGAGTTTTTGTTTTTGAGGAAGGGGAGTTGGGTACTTCTGCCTCTCCTAGCATGATAGGCATTCTCATAGCCAGGGACAGATTTTCTCCTGCAGCCCAGGGTGCTAAGCAGACATCTCTGGGA
>seq-317-27__IGK_UNSEG
>seq-317-27__UNSEG
CTCCATGGCACGAAAGGTCCTGGCTGTGATGGAGTCTCCTTCCTGAAATCCAATTTGTACCGGTCTCAACAGCAGGTCACTTCACCTGGACCCAGCTGAATCACCAGCTCCTTTTATGATATTTGTGTTTGTTTCACAATTTCTCAAGG
>seq-317-28__TRD+_UNSEG
>seq-317-28__UNSEG
AAGAGAACACACTTACTCTCCACGTCGCGGACACAGACGCCATAGAGGTACACGATGTGTTTGTGGGAGACCTGTCTCATCATGCTGGCTGCCTCGAAGAAGGCCTGTGGGCGAGCAGGACATAGGAATGTC
>seq-317-29__IGK+_UNSEG
>seq-317-29__UNSEG
GTCGGCGGCGTCGGCAGCAGTGTCGACGGCAGCGGCGGCGGCGGGTGGGAAATGGCGGAGTATCTGGCCTCCATCTTCGGCACCGAGAAAGACAAGTGAGTGGGAGCCCCCCGCCGGGGGTTGGGCGCGATCGGGGCGCAGGGTGGTTG
>seq-317-30__TRG_UNSEG
>seq-317-30__UNSEG
TGGTTGCTTGATAGGTAGGTACTCACCTATTCTCACAGATCTCCTTTTGTCGGCCTTGGTTGGGACAACATAAGAAACTCCAGGTTTCATGTCCATGTACTCATTAGTACTATCGCTGCTGGGAGAGATGAAACAAGTCATGAC
>seq-317-31__TRD+_UNSEG
>seq-317-31__UNSEG
TTGTCATAAATGCTTTTTGCTGTCACTCTAGTGGTTACTATCCTCTGCCTTCCTCTCCTCCCCTACTCCCCCAGAGAACCAAGGCGTCTGGGGGATTGACTGGGGGGCAGAGGGGGTTTCCCCAGCAAAATCAAACACCTGTCTCCAGAG
>seq-317-32__IGK+_UNSEG
>seq-317-32__UNSEG
ACAGCTACTATATTAGGAGGGATTCCATTTTCACAATTGCAAAACAGATATTTCAGGAACATGGGCCACAAAATGTACTCCTTTCACAGTGTTCTCCTCTTCTGAACTGTGCAGTTATCTATTAAATTTTC
>seq-317-33__TRD_UNSEG
>seq-317-33__UNSEG
GCACTCTGGGCCCGGACCGCCAACCAGCAGACCTGAACGTCCACATCAGACAGGATGGTTTTTCAATGGGAAGAAAACAAAGATGGCGACAGGAAGGAAACTGAGCAGGGAGGTATTTGTACTTTGGAGGTACAAGGGATCTACCCCA
>seq-317-34__IGK+_UNSEG
>seq-317-34__UNSEG
CTCGCGTTATGTGCGACGAGAACTCGTACCGGTCCTGGCTGTGGTAGCCGCACATGTTGCACTCAAAAGGATCACGGAAGCCGTGGCAGCCCATGTGGATGGTGTACATGACGTGATCCAGGAAGAGCACCCGGCAGTGTTCGCAC
>seq-317-35__TRB_UNSEG
>seq-317-35__UNSEG
CCCTGGGTCCTCCCCACATCCATGGTACTAGGAAGGAGAGTATCAAACCTGTTTGTAGGCTCTGAACCTTATGCTCAGCTTCCTCCAGCTTTGAGGTGGTGGACATCTGTGTCTCCTGCAGCTTTCTGCAAAGGAAGAG
>seq-317-36__IGK+_UNSEG
>seq-317-36__UNSEG
TGACCTTAAGAACTTTGTCTGGTGGCTTTGCTGGAACATTGTCACTGTTTTCACTGTCATGCAGGGAGCCCAGCACTGTGGCCAGGATGGCAGAGACTTCCTTGTCATCATGGAGAAGTGCCAGCAGGGGACTGGGAAAAGCACTCTAC
>seq-317-37__TRG_UNSEG
>seq-317-37__UNSEG
GCTGCTTCTCAGCAATGGCCTGAGAATTCACTTCTGTGCTACTAGTAGATGGAGGATTTGATACCTGTCCTTCAATGCTTACAGCTGGCTGGGAAAGCTGGGCAGAAAAAAAAGGGAAGTAGGTGAGACACACGCAGAGTCTGCCAAC
>seq-317-38__IGK+_UNSEG
>seq-317-38__UNSEG
GTGATAATCTTCATATGCAGTGCTTCTGGAGGCCTGTCTAGCAGCTTGGCGCTCTTTCCTTTTCTTGTCTGGTGCCTTGGCTAAGACTATTTCAAGTTCTTCCCCTTCTGTTTCTTTGCCATTCATTTCATCCATAGCCATAACAG
>seq-317-39__IGK+_UNSEG
>seq-317-39__UNSEG
TGCAGCTTTTGTTTTCTGTATGTTGTTGGGGGATCAACTTTCACACATAGCAAGCACATGGCCTCCCTGATGTCAGGATGCCTTTGTTAGGATCTGTATTTGCCCTTAATTTTGTTGAAATCTTTTTTCCTTCTTCCTCTTGAAAA
>seq-317-40__TRD_UNSEG
>seq-317-40__UNSEG
GAGGGCTGAGCTTCCCCTCAAACCTATCGAACATCTCATTGTGATTGGGGACATTTGACACACAGGTTCCTTGTGCAGCTTCAGAAAAAGAAATAGAGAGGAGAGATGCTTTGTTGGAATCATTCAAAACGAGGTCTATTCAACAC
>seq-317-41__IGK+_UNSEG
>seq-317-41__UNSEG
CACTATAAGACCGACTTCGACAAGAATAAGATCTGGTATGAGCACCGGCTCATTGATGACATGGTGGCTCAGGTCCTCAAGTCTTCGGGTGGCTTTGTGTGGGCCTGCAAGAACTATGACGGAGATGTGCAGTCAGACATC
>seq-317-42__IGL_UNSEG
>seq-317-42__UNSEG
GGACAGACCAGCCTCCATCCACAGCTCCCTTTAGGGCACCAAAAACTTTATTGCCAGCGGCAGTTCTGGCAAGCTCTGCATCCAAAGAGTAGACAAAGGCACCCAGCTGACCATCTATGCTCTCCACACTGTATTCATCGCCAGTC
>seq-317-43__TRD_UNSEG
>seq-317-43__UNSEG
ATCCAGGACTGCTCTGTTGGGCCTGGCTGGCCATGACTTGACCTGGGCCACCAACTCCCATATTGAGGTTAGGGGAACTACCAGATCGCAGCAATTCTGACAGCTGTTTATGTTTAGAAGCTGCATCTTGTACC
>seq-317-44__TRA_UNSEG
>seq-317-44__UNSEG
GTACTAGACAGTTTCATCAAAACCAGTGCAACAGGTGGCTTGGGATCAATAAAAGCTGAGGTGATGGCAGATACTGCTGTAGCTTTGGCTTCTGGAAATGTGAAATTGGTTTCAAGCAAGGTAATCACTTTTCTTTTGCCTTCTGTACTAT
>seq-317-45__TRD_UNSEG
>seq-317-45__UNSEG
AGCGAGAGTCCCTGCAGTCCCTTTCGACTTGCATTTTTGCAGGAGCAGTATCATGAAGCCTAAACGCGATGGATATATGTTTTTGAAGGCAGAAAGCAAAATTATGTTTGCCACTTTGCAAAGGAGCTCACTGTGGTGTCTGTGTTCC
>seq-317-46__IGK+_UNSEG
>seq-317-46__UNSEG
ACTTGAAGGCATCACTTTTAAGAAAGCTTACAGTTGGGCCCTGTACCATCCCAAGTCCTTTGTAGCTCCTCTTGAACATGTTTGCCATACTTTTAAAAGGGTAGTTGAATAAATAGCATCACCATTCTTTGCTGTGGCACAGG
>seq-317-47__IGH_UNSEG
>seq-317-47__UNSEG
ATCCAAGAGTAGCGCTCATGTTTTTTGACTCAAGAAAATAGGAAGTTTACTAACTGGCTTCCAGGAAAGGCCAAGGAGAGAAAGCCAATGGGAAGGAGGGTGGGGCAGAGGGACCCACACCAGGAAACCGCTGGCAGGTGGGGGATGGGCC
>seq-317-48__TRG_UNSEG
>seq-317-48__UNSEG
GCATTACCTTGCCTATTTTTAATATTATTAAAGCCTTTCTCCTTCAGTAGTCTATTTTCTTAGAATAACAACTCTTTTATCTATTCTGAACTCTATTTTTTTTCTTTTTTAAGAGACAAGGTTTTGCTCTGTTGCCCAGCTTGGACTCGAA
>seq-317-49__TRD_UNSEG
>seq-317-49__UNSEG
AAAAAAGCACAAAAATGGTCGGTGGGGAACCATATAACAAAACTACATCTCAGGCAGCTCTTTCTCAAGGAAGATTCTAAGATTTTATTATGTGGCTAATTCTAAATTGGAAATGGAACATGCTGGTATGTGAAGCAATTGGTGCTAGGA
>seq-317-50__IGL_UNSEG
>seq-317-50__UNSEG
CCTCCAGCTGGTGTTGGGAGCTTCTTCTTCTTCTCCTTGGGTGGTCTGTCTGGAAGCCGTGGGACAGGTGAACTTGGTGGGGGCCACCAGCCAGGAGCAGAAGGCTCTGAAGGAGCAGGGAAGGAGGTGTCGTGCACCTGTTCTAGGA
>seq-317-51__IGK+_UNSEG
>seq-317-51__UNSEG
GGGCTGGAGTTTCCTGTCAGCCTGTAACTGACCTTGGCACCTGCTTTCCTCCTCCAGAAGAGAAGAATCCCTACAAAGAAGTGTACACGGACATGTGGGTGGAACCTGAGGCAGCTGCCTACGCACCACCTCCACCAGCCAAAAAGC
>seq-317-52__TRD_UNSEG
>seq-317-52__UNSEG
CTGTTAACTACTGTACAACCCGACTTCATAATGGTGCTTTCAAACAGCGAGATGAGTAAAAACATCAGCTTCCACGTTGCCTTCTGCGCAAAGGGTTTCACCAAGGATGGAGAAAGGGAGACAGCTTGCAGATGGCGCGT
......@@ -9,8 +9,6 @@ using namespace std;
void testSegmentationBug1(int delta_min, int delta_max) {
string buggy_sequences = "bugs/kmersegment.fa";
int k = 14;
bool rc = true;
Fasta seqV("../../germline/TRGV.fa");
Fasta seqJ("../../germline/TRGJ.fa");
......
......@@ -54,7 +54,7 @@ void testRandom() {
string sequence = "AA";
for (int i = 0; i < 10; i++) {
seqs.push_back(create_sequence("seq"+id, "seq"+id, sequence, ""));
seqs.push_back(create_sequence("seq" + string_of_int(id), "seq" + string_of_int(id), sequence, ""));
sequence += "A";
id++;
}
......
This diff is collapsed.
......@@ -274,6 +274,9 @@ Axis.prototype = {
if (output=="percent"){
text = ((min+(h*i))*100).toFixed(1) + "%"
}
if (output=="float-2"){
text = (min+(h*i)).toFixed(2)
}
if (this.reverse) pos = 1 - pos;
......
......@@ -51,11 +51,13 @@ function Clone(data, model, index) {
this.m.clones[index]=this
this.tag = this.getTag();
this.computeGCContent()
this.computeCoverage()
}
Clone.prototype = {
COVERAGE_WARN: 0.5,
/**
* return clone's most important name <br>
......@@ -352,6 +354,28 @@ Clone.prototype = {
return s
},
/*
* Compute coverage as the average value of non-zero coverages
*/
computeCoverage: function () {
if (typeof (this._coverage) == 'undefined') {
this.coverage = undefined
return
}
var sum = 0.0
var nb = 0
for (var i=0; i<this._coverage.length; i++) {
if (this._coverage[i] > 0) {
sum += parseFloat(this._coverage[i])
nb += 1
}
}
this.coverage = sum/nb
},
computeGCContent: function () {
if (typeof (this.sequence) == 'undefined') {
this.GCContent = '?'
......@@ -494,6 +518,16 @@ Clone.prototype = {
html += "<tr><td> sequence name </td><td colspan='" + time_length + "'>" + this.getSequenceName() + "</td></tr>"
html += "<tr><td> code </td><td colspan='" + time_length + "'>" + this.getCode() + "</td></tr>"
html += "<tr><td> length </td><td colspan='" + time_length + "'>" + this.getSequenceLength() + "</td></tr>"
//coverage info
if (typeof this.coverage != 'undefined') {
html += "<tr><td> average coverage </td><td colspan='" + time_length + "'><span "
if (this.coverage < this.COVERAGE_WARN)
html += "class='warning'"
html += ">" + this.coverage.toFixed(3) + "</span></td>"
}
// abundance info
html += "<tr><td> size (n-reads (total reads) )</td>"
for (var i = 0; i < time_length; i++) {
html += "<td>" + this.get('reads',this.m.samples.order[i]) +
......
......@@ -448,15 +448,20 @@ List.prototype = {
}
span_size.style.color = this.m.clone(cloneID).getColor();
span_size.appendChild(document.createTextNode(this.m.clone(cloneID).getStrSize()));
var span_info = document.createElement('span')
span_info.className = "infoBox";
span_info.onclick = function () {
self.displayInfoBox(cloneID);
}
span_info.appendChild(document.createTextNode("I"));
if (this.m.clone(cloneID).coverage < this.m.clone(cloneID).COVERAGE_WARN) {
span_info.className += " warning" ;
span_info.appendChild(document.createTextNode("!"));
} else {
span_info.appendChild(document.createTextNode("i"));
}
var span_cluster = document.createElement('span')
span_cluster.className = "clusterBox";
......
......@@ -120,6 +120,7 @@ function ScatterPlot(id, model) {
["otherSize", "size (other point)"],
["sequenceLength", "clone length"],
["GCContent", "GC content"],
["coverage", "clone coverage"],
["n", "N length"],
["lengthCDR3", "CDR3 length"]
];
......@@ -130,6 +131,8 @@ function ScatterPlot(id, model) {
"GCContent" : { "fct" : "GCContent", "output" : "percent" },
"n" : { "fct" : function(cloneID) {return self.m.clone(cloneID).getNlength()} },
"lengthCDR3" : { "fct" : function(cloneID) {return self.m.clone(cloneID).seg["cdr3"].length} },
"coverage": { "fct" : function(cloneID){return self.m.clone(cloneID).coverage},
"min" : 0, "max" : 1, "output" : "float-2", "log": false },
"Size" : { "fct" : function(cloneID){return self.m.clone(cloneID).getSizeZero()},
"min" : function(){return self.m.min_size},
"max" : 1, "output" : "percent", "log" :true },
......@@ -144,6 +147,7 @@ function ScatterPlot(id, model) {
"V/J (alleles)" : { "mode": "plot", "x" : "allele_v", "y": "allele_j"},
"V/N length" : { "mode": "plot", "x" : "gene_v", "y": "n"},
"clone length / GC content " : { "mode": "plot", "x": "sequenceLength", "y" : "GCContent"},
"clone coverage / GC content " : { "mode": "plot", "x": "coverage", "y" : "GCContent"},
// "V/abundance" : { "mode": "plot", "x" : "gene_v", "y": "Size"},
"V distribution" : { "mode": "bar", "x" : "gene_v", "y": "gene_j"},
"Clone length distribution" : { "mode": "bar", "x" : "sequenceLength", "y": "gene_v"},
......
import collections
import json
import sys
import time
if len(sys.argv) < 3:
......
......@@ -4,6 +4,9 @@
import sys
import os
import urllib
from collections import defaultdict
import re
IMGT_LICENSE = '''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
......@@ -16,6 +19,7 @@ IMGT_LICENSE = '''
print (IMGT_LICENSE)
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
......@@ -37,6 +41,56 @@ def check_directory_exists(path):
if not(os.path.isdir(path)):
os.mkdir(path)
def gene_matches(string, list_regex):
'''
>>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'IGHD'])
True
>>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'TRGD'])
False
>>> gene_matches('>M994641|IGHJ4*01|Toto', ['[A-Z]{3}J'])
True
>>> gene_matches('>M22153|TRDD2*01|Homo sapiens|F|', ['TRDD2'])
True
'''
for regex in list_regex:
if re.search(regex, string) <> None:
return True
return False
def get_gene_coord(imgt_line):
'''
>>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
>>> get_gene_coord(line) == {'X15272': {'from': 406, 'to': 705, 'imgt_name': 'TRGV4*01'}}
True
'''
elements = imgt_line.split('|')
assert len(elements) >= 6
start, end = elements[5].split('..')
return {elements[0][1:]: {'from': int(start),
'to': int(end),
'imgt_name': elements[1]}}
def get_gene_sequence(gene, other_gene_name, start, end):
'''
Return the gene sequences between positions start and end (included).
'''
fasta_string = urllib.urlopen(NCBI_API % (gene, start, end)).read()
return re.sub('(>g.\|)', r'\1'+other_gene_name+'|', fasta_string)
def retrieve_genes(filename, genes, additional_length):
file = verbose_open_w(filename)
for gene in genes:
start = genes[gene]['from']
end = genes[gene]['to']
if additional_length > 0:
end += additional_length
elif additional_length < 0:
start = max(1, start + additional_length)
file.write(get_gene_sequence(gene, genes[gene]['imgt_name'], start, end))
LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]
......@@ -44,6 +98,9 @@ SPECIAL_SEQUENCES = [
# Split sequences in several files
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}
DOWNSTREAM_REGIONS=['[A-Z]{3}J', 'TRDD3']
UPSTREAM_REGIONS=['IGHD', 'TRDD2']
SPECIES = {
"Homo sapiens": './',
"Mus musculus": 'mus-musculus/',
......@@ -51,6 +108,9 @@ SPECIES = {
"Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
}
downstream_data = defaultdict(dict)
upstream_data = defaultdict(dict)
for l in sys.stdin:
if ">" in l:
......@@ -69,6 +129,11 @@ for l in sys.stdin:
if system.startswith('IG') or system.startswith('TR'):
if gene_matches(l, DOWNSTREAM_REGIONS):
downstream_data[path+'/'+system].update(get_gene_coord(l))
if gene_matches(l, UPSTREAM_REGIONS):
upstream_data[path+'/'+system].update(get_gene_coord(l))
systems = get_split_files(seq, SPLIT_SEQUENCES)
if systems:
keys = [path + s for s in systems]
......@@ -89,3 +154,7 @@ for l in sys.stdin:
if current_special:
current_special.write(l)
for system in upstream_data:
retrieve_genes(system+"_upstream.fa", upstream_data[system], -LENGTH_UPSTREAM)
for system in downstream_data:
retrieve_genes(system+"_downstream.fa", downstream_data[system], LENGTH_DOWNSTREAM)
from SimpleXMLRPCServer import SimpleXMLRPCServer, SimpleXMLRPCRequestHandler
from SimpleXMLRPCServer import SimpleXMLRPCServer
from subprocess import *
import sys
......@@ -6,7 +6,7 @@ sys.path.insert(0, './web2py/applications/vidjil/modules')
import defs
def fuse(cmd, output_dir, filename):
import time, datetime, sys, os.path, random
import time, datetime, os.path
from subprocess import Popen, PIPE, STDOUT, os
fuse_log_file = open(output_dir+'/'+filename+'.fuse.log', 'w')
......
import sys
import glob
import os
......
......@@ -2,7 +2,6 @@
import gluon.contrib.simplejson, re
import os.path, subprocess
import vidjil_utils
from collections import defaultdict
MAX_LOG_LINES = 500
......
......@@ -152,7 +152,6 @@ def run_request():
# need patient, config
# need patient admin or read permission
def get_data():
import time
from subprocess import Popen, PIPE, STDOUT
if not auth.user :
res = {"redirect" : URL('default', 'user', args='login', scheme=True, host=True,
......
......@@ -3,6 +3,8 @@ import gluon.contrib.simplejson
import defs
import vidjil_utils
import os
from controller_utils import error_message
if request.env.http_origin:
response.headers['Access-Control-Allow-Origin'] = request.env.http_origin
response.headers['Access-Control-Allow-Credentials'] = 'true'
......@@ -11,13 +13,9 @@ if request.env.http_origin:
def add():
if not auth.can_modify_patient(request.vars['id'], auth.user_id):
res = {"success" : "false", "message" : "you need admin permission on this patient to add files"}
log.error(res)
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
return error_message("you need admin permission on this patient to add files")
elif not auth.can_upload_file(request.vars['id']):
res = {"success" : "false", "message" : "you don't have right to upload files"}
log.error(res)
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
return error_message("you don't have right to upload files")
else:
query = db((db.sequence_file.patient_id==request.vars['id'])).select()
if len(query) != 0 :
......@@ -48,9 +46,7 @@ def add_form():
query = db((db.sequence_file.patient_id==request.vars['patient_id'])).select()
for row in query :
if row.filename == request.vars['filename'] :
res = {"message": "this sequence file already exists for this patient"}
log.error(res)
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
return error_message("this sequence file already exists for this patient")
id = db.sequence_file.insert(sampling_date=request.vars['sampling_date'],
info=request.vars['file_info'],
......@@ -71,9 +67,7 @@ def add_form():
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
else :
res = {"success" : "false", "message" : error}
log.error(res)
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
return error_message(error)
def edit():
......@@ -83,9 +77,7 @@ def edit():