Commit 6666af77 authored by Mathieu Giraud's avatar Mathieu Giraud

merge - JUNCTION/CDR3 detection based on Cys104 and Phe118/Trp118 position

The JUNCTION/CDR3 detection (-3) works more reliably.
As in IMGT/JunctionAnalysis, the detection relies on the positions of Cys104 and Phe118/Trp118.
The detection is here in O(n), taking advantage of the already aligned V and J segments.
We use V gapped sequences from IMGT, and J gapped sequences that we compute,
based on the Phe/Trp-Gly-X-Gly pattern.

This was realized after prototyping and tests by @flothoni.
parents 403773ad 2bead3c9
......@@ -119,7 +119,9 @@ double Cost::toPValue(const int score)
}
DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost,
const bool reverse_x, const bool reverse_y,
const int marked_pos_j)
{
this -> x = reverse_x ? reverse(x) : x ;
this -> y = reverse_y ? reverse(y) : y ;
......@@ -127,6 +129,9 @@ DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost&
this -> reverse_x = reverse_x ;
this -> reverse_y = reverse_y ;
this -> marked_pos_j = marked_pos_j;
this -> marked_pos_i = 0;
m = x.size();
n = y.size();
// cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;
......@@ -419,6 +424,13 @@ void DynProg::backtrack()
int i=best_i+1;
int j=best_j+1;
// Retake good i/j when there were reversed strings
if (reverse_x)
i = m - i + 1 ;
if (reverse_y)
j = n - j + 1;
// Compute backtrack strings
ostringstream back_x;
......@@ -427,6 +439,13 @@ void DynProg::backtrack()
while (1) {
if ((!reverse_y && (j == marked_pos_j))
|| (reverse_y && (n-j+1 == marked_pos_j)))
{
marked_pos_i = reverse_x ? m-i+1 : i ;
}
if ((i<0) || (j<0))
{
cout << "Invalid backtrack: " << i << "," << j << endl ;
......
......@@ -119,9 +119,14 @@ class DynProg
int best_j ; /* Start at 1 */
int first_i ; /* Start at 1 */
int first_j ; /* Start at 1 */
string str_back ;
int marked_pos_i ; // To be computed (in x)
int marked_pos_j ; // Given (in y)
DynProg(const string &x, const string &y, DynProgMode mode, const Cost &c, const bool reverse_x=false, const bool reverse_y=false);
DynProg(const string &x, const string &y, DynProgMode mode, const Cost &c,
const bool reverse_x=false, const bool reverse_y=false,
const int marked_pos_j=0);
~DynProg();
void init();
......
......@@ -39,10 +39,11 @@ unsigned long long filesize(const char* filename)
return in.tellg();
}
void Fasta::init(int extract_field, string extract_separator)
void Fasta::init(int extract_field, string extract_separator, int mark_pos)
{
this -> extract_field = extract_field ;
this -> extract_separator = extract_separator ;
this -> mark_pos = mark_pos;
total_size = 0;
name = "";
basename = "";
......@@ -55,9 +56,9 @@ Fasta::Fasta(bool virtualfasta, string name)
basename = extract_basename(name);
}
Fasta::Fasta(int extract_field, string extract_separator)
Fasta::Fasta(int extract_field, string extract_separator, int mark_pos)
{
init(extract_field, extract_separator);
init(extract_field, extract_separator, mark_pos);
}
Fasta::Fasta(const string &input,
......@@ -168,18 +169,26 @@ OnlineFasta::~OnlineFasta() {
}
void OnlineFasta::init() {
mark_pos = 0;
nb_sequences_parsed = 0;
nb_sequences_returned = 0;
char_nb = 0;
line_nb = 0;
line = getInterestingLine();
current.seq = NULL;
current.marked_pos = 0;
current_gaps = 0;
line = getInterestingLine();
}
unsigned long long OnlineFasta::getPos() {
return char_nb;
}
void OnlineFasta::setMarkPos(int mark_pos) {
this -> mark_pos = mark_pos;
}
size_t OnlineFasta::getLineNb() {
return line_nb;
}
......@@ -211,6 +220,26 @@ void OnlineFasta::skipToNthSequence() {
return ;
}
void OnlineFasta::addLineToCurrentSequence(string line)
{
for (char& c : line)
{
if (c == ' ')
continue ;
if (c == '.') {
current_gaps++;
continue ;
}
current.sequence += c;
if (mark_pos) {
if (current.sequence.length() + current_gaps == mark_pos)
current.marked_pos = current.sequence.length();
}
}
}
void OnlineFasta::next() {
fasta_state state = FASTX_UNINIT;
......@@ -223,6 +252,8 @@ void OnlineFasta::next() {
if (current.seq) {
delete [] current.seq;
current.seq = NULL;
current.marked_pos = 0;
current_gaps = 0;
}
if (hasNextData()) {
......@@ -247,7 +278,7 @@ void OnlineFasta::next() {
switch(state) {
case FASTX_FASTA: case FASTX_FASTQ_ID:
// Sequence
current.sequence += line;
addLineToCurrentSequence(line);
break;
case FASTX_FASTQ_SEQ:
// FASTQ separator between sequence and quality
......@@ -313,6 +344,7 @@ istream& operator>>(istream& in, Fasta& fasta){
string line;
Sequence read;
OnlineFasta of(in, fasta.extract_field, fasta.extract_separator);
of.setMarkPos(fasta.mark_pos);
while (of.hasNext()) {
of.next();
......@@ -335,7 +367,14 @@ ostream &operator<<(ostream &out, const Sequence &seq) {
out << "@";
} else
out << ">";
out << seq.label << endl;
out << seq.label;
if (seq.marked_pos) {
out << " !@" << seq.marked_pos ;
}
out << endl;
out << seq.sequence << endl;
if (is_fastq) {
out << "+" << endl << seq.quality << endl;
......
......@@ -17,6 +17,7 @@ typedef struct read_t
string sequence; // Sequence: original string representation
string quality;
int* seq; // Sequence: seq representation
int marked_pos; // Some marked position in the sequence
} Sequence;
typedef enum {
......@@ -31,17 +32,18 @@ unsigned long long filesize(const char* filename);
class Fasta
{
void init(int extract_field, string extract_separator);
void init(int extract_field, string extract_separator, int mark_pos=0);
int total_size;
int extract_field;
int mark_pos;
string extract_separator;
vector<Sequence> reads;
// ostream *oout ;
public:
Fasta(int extract_field=0, string extract_separator="|");
Fasta(int extract_field=0, string extract_separator="|", int mark_pos=0);
/**
* Read all the sequences in the input filename and record them in the object.
*
......@@ -93,6 +95,8 @@ public:
class OnlineFasta {
private:
Sequence current;
int current_gaps;
istream *input;
int extract_field;
string extract_separator;
......@@ -101,6 +105,9 @@ class OnlineFasta {
size_t line_nb;
unsigned long long char_nb;
int mark_pos;
void addLineToCurrentSequence(string line);
int nb_sequences_parsed;
int nb_sequences_returned;
int nb_sequences_max;
......@@ -131,6 +138,11 @@ class OnlineFasta {
~OnlineFasta();
/**
* sets a position to be followed in gapped sequences
*/
void setMarkPos(int mark_pos);
/**
* @return the position in the file
*/
......
......@@ -44,6 +44,7 @@ Germline::Germline(string _code, char _shortcut,
f_reps_4.push_back(f_rep_4);
f_reps_3.push_back(f_rep_3);
/// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
rep_5 = Fasta(f_rep_5, 2, "|");
rep_4 = Fasta(f_rep_4, 2, "|");
rep_3 = Fasta(f_rep_3, 2, "|");
......@@ -64,9 +65,9 @@ Germline::Germline(string _code, char _shortcut,
f_reps_4 = _f_reps_4 ;
f_reps_3 = _f_reps_3 ;
rep_5 = Fasta(2, "|") ;
rep_5 = Fasta(2, "|", CYS104_IN_GAPPED_V);
rep_4 = Fasta(2, "|") ;
rep_3 = Fasta(2, "|") ;
rep_3 = Fasta(2, "|", PHE118_TRP118_IN_GAPPED_J);
for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
rep_5.add(*it);
......@@ -108,9 +109,9 @@ Germline::Germline(string code, char shortcut, string path, json json_recom, int
init(code, shortcut, delta_min, max_indexing);
rep_5 = Fasta(2, "|") ;
rep_5 = Fasta(2, "|", CYS104_IN_GAPPED_V) ;
rep_4 = Fasta(2, "|") ;
rep_3 = Fasta(2, "|") ;
rep_3 = Fasta(2, "|", PHE118_TRP118_IN_GAPPED_J) ;
for (json::iterator it = json_recom["5"].begin();
it != json_recom["5"].end(); ++it)
......
......@@ -17,6 +17,12 @@ enum SEGMENTATION_METHODS {
SEG_METHOD_MAX1U // Pseudo-germline, most frequent kmer affection and unknwon affectation (-4)
} ;
// JUNCTION/CDR3 extraction from gapped V/J sequences
#define CYS104_IN_GAPPED_V 310 // First nucleotide of Cys104
#define PHE118_TRP118_IN_GAPPED_J 38 // Last nucleotide of Phe118/Trp118
using namespace std;
using json = nlohmann::json;
......
......@@ -92,7 +92,7 @@ string codeFromBoxes(vector <AlignBox*> boxes, string sequence)
if (i>0) {
code += " " + string_of_int(boxes[i-1]->del_right) + "/"
// From box_left->end + 1 to box_right->start - 1
// From box_left->end + 1 to box_right->start - 1, both positions included
+ sequence.substr(boxes[i-1]->end + 1, boxes[i]->start - boxes[i-1]->end - 1)
+ "/" + string_of_int(boxes[i]->del_left) + " " ;
}
......@@ -254,6 +254,10 @@ string Segmenter::getInfoLine() const
if (evalue_right > NO_LIMIT_VALUE)
s += "/" + scientific_string_of_double(evalue_right);
if (CDR3start > 0)
s += " {" + string_of_int(JUNCTIONstart) + "(" + string_of_int(JUNCTIONend-JUNCTIONstart+1) + ")" + string_of_int(JUNCTIONend) + " "
+ "up"[JUNCTIONproductive] + " " + JUNCTIONaa + "}";
return s ;
}
......@@ -303,6 +307,9 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
box_D = new AlignBox();
box_J = new AlignBox();
CDR3start = -1;
CDR3end = -1;
label = seq.label ;
sequence = seq.sequence ;
info = "" ;
......@@ -715,7 +722,8 @@ void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id,
DynProg dp = DynProg(sequence_or_rc, rep.sequence(r),
dpMode, // DynProg::SemiGlobalTrans,
segment_cost, // DNA
reverse_both, reverse_both);
reverse_both, reverse_both,
rep.read(r).marked_pos);
bool onlyBottomTriangle = !local ;
int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
......@@ -733,6 +741,10 @@ void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id,
best_first_j = dp.first_j ;
box->ref_nb = r ;
box->ref_label = rep.label(r) ;
if (!local)
dp.backtrack();
box->marked_pos = dp.marked_pos_i ;
}
score_r.push_back(make_pair(score, r));
......@@ -1029,61 +1041,40 @@ void FineSegmenter::FineSegmentD(Germline *germline, bool several_D,
}
void FineSegmenter::findCDR3(){
string str = getSequence().sequence;
list<string> codon_start;
codon_start.push_back("TGT");
codon_start.push_back("TGC");
list<string> codon_end;
codon_end.push_back("TTT");
codon_end.push_back("TTC");
codon_end.push_back("TGG");
list<int> p_start;
list<int> p_end;
size_t loc;
std::list<string>::const_iterator it;
for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
loc = 0;
while ( loc != string::npos && loc < (size_t)box_V->end){
loc = str.find(*it, loc+3);
if (loc != string::npos && loc < (size_t)box_V->end) {
p_start.push_front(loc);
}
}
}
for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
loc = box_J->start;
while ( loc != string::npos){
loc = str.find(*it, loc+3);
if (loc != string::npos) {
p_end.push_back(loc);
}
}
JUNCTIONstart = box_V->marked_pos;
JUNCTIONend = box_J->marked_pos;
// There are two cases when we can not detect a JUNCTION/CDR3:
// - Germline V or J gene has no 'marked_pos'
// - Sequence may be too short on either side, and thus the backtrack did not find a suitable 'marked_pos'
if (JUNCTIONstart == 0 || JUNCTIONend == 0)
return;
// IMGT-CDR3 is, on each side, 3 nucleotides shorter than IMGT-JUNCTION
CDR3start = JUNCTIONstart + 3;
CDR3end = JUNCTIONend - 3;
CDR3nuc = subsequence(getSequence().sequence, CDR3start, CDR3end);
if (CDR3nuc.length() % 3 == 0)
{
CDR3aa = nuc_to_aa(CDR3nuc);
}
else
{
// start of codon fully included in the germline J
int CDR3startJfull = JUNCTIONend - ((JUNCTIONend - box_J->start) / 3) * 3 + 1 ;
CDR3start = -1;
CDR3end = -1;
std::list<int>::const_iterator it1;
for (it1 = p_start.begin(); it1 != p_start.end(); ++it1) {
std::list<int>::const_iterator it2;
for (it2 = p_end.begin(); it2 != p_end.end(); ++it2) {
if ( (*it2-*it1)%3 == 0){ //filter 3 : start/stop codon must be seprated by a multiple of 3
if ( fabs((*it2-*it1)-36 ) < fabs((CDR3end-CDR3start)-36) ){ //filter 4 : cdr3 length must be close to 12 AA
CDR3start = *it1;
CDR3end = *it2;
}
}
}
CDR3aa =
nuc_to_aa(subsequence(getSequence().sequence, CDR3start, CDR3startJfull-1)) +
nuc_to_aa(subsequence(getSequence().sequence, CDR3startJfull, CDR3end));
}
JUNCTIONaa = nuc_to_aa(subsequence(getSequence().sequence, JUNCTIONstart, CDR3start-1))
+ CDR3aa + nuc_to_aa(subsequence(getSequence().sequence, CDR3end+1, JUNCTIONend));
JUNCTIONproductive = (CDR3nuc.length() % 3 == 0) && (JUNCTIONaa.find('*') == string::npos);
}
json FineSegmenter::toJson(){
......@@ -1107,7 +1098,14 @@ json FineSegmenter::toJson(){
if (CDR3start >= 0) {
seg["cdr3"] = {
{"start", CDR3start},
{"stop", CDR3end}
{"stop", CDR3end},
{"aa", CDR3aa}
};
seg["junction"] = {
{"start", JUNCTIONstart},
{"stop", JUNCTIONend},
{"aa", JUNCTIONaa},
{"productive", JUNCTIONproductive}
};
}
}
......
......@@ -91,6 +91,9 @@ class AlignBox
string ref_label;
string ref;
/* Marked position, for Cys104 and Phe118/Trp118 */
int marked_pos;
/* Identifiers and scores of other possible reference sequence */
vector<pair<int, int> > score;
};
......@@ -119,7 +122,15 @@ class Segmenter {
protected:
string sequence;
string sequence_or_rc;
int JUNCTIONstart, JUNCTIONend;
string JUNCTIONaa;
bool JUNCTIONproductive;
int CDR3start, CDR3end;
string CDR3nuc;
string CDR3aa;
bool reversed, segmented, dSegmented;
int because;
......@@ -318,6 +329,10 @@ class FineSegmenter : public Segmenter
int extend_DD_on_Y, int extend_DD_on_Z,
double threshold = THRESHOLD_NB_EXPECTED_D, int multiplier=1);
/**
* find JUNCTION/CDR3, by using marked Cys104 and Phe118/Trp118 positions
* in the germline V and J genes and the backtrack of the DP matrix
*/
void findCDR3();
json toJson();
......
......@@ -136,6 +136,26 @@ int dna_to_int(const string &word, int size) {
return index_word;
}
string nuc_to_aa(const string &word) {
string aa;
int index_word = 0;
int i = 0;
for (; i < word.length() ; i++) {
index_word = (index_word << 2) | nuc_to_int(word[i]);
if (i % 3 == 2) {
aa += GENETIC_CODE[index_word];
index_word = 0 ;
}
}
if (i % 3)
aa += GENETIC_CODE_OUT_OF_FRAME ;
return aa;
}
Sequence create_sequence(string label_full, string label, string sequence, string quality) {
Sequence seq;
seq.label_full = label_full;
......@@ -214,6 +234,10 @@ int remove_trailing_whitespaces(string &str) {
return count;
}
string subsequence(const string &seq, int start, int end) {
return seq.substr(start - 1, end - start + 1);
}
string revcomp(const string &dna, bool do_revcomp) {
if (!do_revcomp)
......
......@@ -126,6 +126,19 @@ inline int nuc_to_int(char nuc) {
*/
int dna_to_int(const string &, int size);
#define GENETIC_CODE \
"KNKN" "TTTT" "RSRS" "IIMI" \
"QHQH" "PPPP" "RRRR" "LLLL" \
"EDED" "AAAA" "GGGG" "VVVV" \
"*Y*Y" "SSSS" "*CWC" "LFLF"
#define GENETIC_CODE_OUT_OF_FRAME '#'
/**
* Convert nucleotides to amino acids
*/
string nuc_to_aa(const string &nuc);
string extract_from_label(string str, int field, string separator);
/**
......@@ -146,6 +159,11 @@ string extract_basename(string path, bool remove_ext = true);
*/
int remove_trailing_whitespaces(string &str);
/**
* @return subsequence delimited by biological positions (starting from 1), including both positions
*/
string subsequence(const string &text, int start, int end);
/**
* @return reverse(complement(dna)) if do_revcomp, otherwise dna
*/
......
!NO_LAUNCHER:
!LAUNCH: (cd ../../germline ; md5sum *.fa || md5 -r *.fa)
$ Check md5 in germline/, sequences split from IMGT
$ Check md5 in germline/, sequences split and processed from IMGT
1:3a655c9d99ca04907a120f1b69febf2e IGHD.fa
1:0d87f672c582e7e15cdcaa0fb5e08f6f IGHJ.fa
1:b59eb57893dbab69d98eb712722bf603 IGHV.fa
1:c574f5bcc98c99d6462365eb2a9aecc6 IGKJ.fa
1:2746f12c8af2a9b42b166ef468dd08d9 IGKV.fa
1:a6bdc2aa82d151e30d10d302559355b6 IGLJ.fa
1:3184a810b22ff5557e99f99906c92fef IGLV.fa
1:5ee643edc326f1e46b0b5914a0b6d3d4 TRAJ.fa
1:a7eedfb8feda1107884ad9a363c90831 TRAV.fa
1:99648acbcf17a675fa7e48ae8c8836d2 IGHJ.fa
1:92aef873dde83c68070843a57904b15b IGHV.fa
1:15a49bc8fc8dcb78153b3e6c3df9f5e4 IGKJ.fa
1:a27659426126c5ab779a00bc9643d73e IGKV.fa
1:fb5f822dfa2eea38b0ece774bbdda17a IGLJ.fa
1:5f2039964e6623a5d88eb9b33be07a1a IGLV.fa
1:cc51221ca24540b7434473d1f27fdc72 TRAJ.fa
1:d249b9fe019f945d0643dd8d0e978f49 TRAV.fa
1:f66c70cda5b369aecd34ab1cd11e7415 TRBD.fa
1:cc20222cdb7a381ff5a34030bee91e08 TRBJ.fa
1:5751f4aea01aebe77575f27d0150a8f2 TRBV.fa
1:7eaa560f388d7bbf596e6577812a3c2c TRBJ.fa
1:2ba9bfe18bd6540f07009489d4881159 TRBV.fa
1:97e784cdfd5994c10ebbbedfcf332ef6 TRDD.fa
1:cfb6f64b2b6d9d8aa7a827ad6937fc82 TRDJ.fa
1:ccde1e004db86e39369ebf5bd09267e3 TRDV.fa
1:0fb74249e406e3689ffad1b5633bbc3f TRGJ.fa
1:64e5f218e9de3443f04c4f4e81468f0d TRGV.fa
1:014c76bb3c4d051f4174aaf768573789 TRDJ.fa
1:74b37f99e9a63b861c03d1a55b4ee58a TRDV.fa
1:45c015c36a2cb34bb26fe22b7ec7401a TRGJ.fa
1:40e7f196afc226db253d6ff5169f99bd TRGV.fa
$ Check md5 in germline/, other sequences
1:e47e9881c18535d719d1858e3781815d IGHD_upstream.fa
......@@ -35,3 +35,13 @@ $ Check md5 in germline/, other sequences
1:1147a04c4e8a8dd534aae65db1ae13ca IGK-KDE.fa
1:3631478c97b8660996850c91adb61409 TRDD2_upstream.fa
1:eadaa15ed34a836f5ecfd2380eadc03e TRDD3_downstream.fa
1:8707043df48f0ce114e287144a9450c5 IGHC=A1.fa
1:c7ff5d88e6177e33d904903880353833 IGHC=A2.fa
1:f4b1d85e86ea58d7beb67b6b09836947 IGHC=D.fa
1:f6b4a148186423e6325b21f97fbcf4e4 IGHC=E.fa
1:a4c7d9e8e62382375b5955b4c6293c5e IGHC=G1.fa
1:0fe7e446c5e7855716b9bc5d6d8ada69 IGHC=G2.fa
1:f0d339fa483fd0222a19583fb3488d28 IGHC=G3.fa
1:eb09ee85b569805670f0b8808340c1ea IGHC=G4.fa
1:ff0edc0d2932bf94096968e4453d435f IGHC=GP.fa
1:dbedebc8b08b4bc7a5c19a8036e42464 IGHC=M.fa
!LAUNCH: ../../vidjil -G ../../germline/TRG -c clones ../../data/segment_lec.fa
!LAUNCH: ../../vidjil -G ../../germline/TRG -c clones -A -3 ../../data/segment_lec.fa
$ Extract 50bp windows (TRG)
1:found . 50-windows
......@@ -12,3 +12,7 @@ $ Find the good statistics on TRG
$ Find the good length statistics
3: SEG.* 117.0
1: UNSEG too short w.* 40.0