...
  View open merge request
Commits (10)
......@@ -220,16 +220,10 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
results.nb_before_right++;
}
KmerAffect left_affect = before;
KmerAffect right_affect = after;
if (kms.multiple_in_one) {
left_affect = AFFECT_NOT_UNKNOWN;
right_affect = AFFECT_NOT_UNKNOWN;
}
left_evalue = kms.getProbabilityAtLeastOrAbove(left_affect,
left_evalue = kms.getProbabilityAtLeastOrAbove(before,
results.nb_before_left,
1 + results.last_pos_max);
right_evalue = kms.getProbabilityAtLeastOrAbove(right_affect,
right_evalue = kms.getProbabilityAtLeastOrAbove(after,
results.nb_after_right,
seq.size() - 1 - results.first_pos_max);
......@@ -253,11 +247,7 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
double KmerAffectAnalyser::getProbabilityAtLeastOrAbove(const KmerAffect &kmer, int at_least) const {
KmerAffect affect = kmer;
if (kms.multiple_in_one) {
affect = AFFECT_NOT_UNKNOWN;
}
return kms.getProbabilityAtLeastOrAbove(affect, at_least, seq.size());
return kms.getProbabilityAtLeastOrAbove(kmer, at_least, seq.size());
}
pair <double, double> KmerAffectAnalyser::getLeftRightProbabilityAtLeastOrAbove() const {
......
......@@ -5,6 +5,10 @@
#include <fstream>
#include <ctype.h>
map<string, char> Germline::filename_shortcut = map<string, char>();
map<char, char> Germline::shortcut_conversion = map<char, char>();
set<char> Germline::used_shortcuts = set<char>();
void Germline::init(string _code, char _shortcut,
string seed,
int max_indexing, bool build_automaton)
......@@ -14,6 +18,9 @@ void Germline::init(string _code, char _shortcut,
shortcut = _shortcut ;
index = 0 ;
this->max_indexing = max_indexing;
multigermline = nullptr;
overriden_filter = false;
old_filter_5 = nullptr;
this->seed = expand_seed(seed);
......@@ -21,9 +28,34 @@ void Germline::init(string _code, char _shortcut,
affect_4 = "" ;
affect_3 = "J" ;
affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
char vShortcut = toupper(shortcut);
char dShortcut = 14+shortcut;
char jShortcut = tolower(shortcut);
vShortcut = get_new_shortcut_when_conflict(vShortcut, rep_5);
dShortcut = get_new_shortcut_when_conflict(dShortcut, rep_4);
jShortcut = get_new_shortcut_when_conflict(jShortcut, rep_3);
used_shortcuts.insert(vShortcut);
used_shortcuts.insert(jShortcut);
if (filename_shortcut.count(rep_5.name) == 0)
for (auto filename: rep_5.filenames)
filename_shortcut[filename] = vShortcut;
if (filename_shortcut.count(rep_3.name) == 0)
for (auto filename: rep_3.filenames)
filename_shortcut[filename] = jShortcut;
if (rep_4.name.size() > 0) {
used_shortcuts.insert(dShortcut);
if (filename_shortcut.count(rep_4.name) == 0)
for(auto filename: rep_4.filenames)
filename_shortcut[filename] = dShortcut;
}
affect_5 = string(1, vShortcut) + "-" + code + "V";
affect_4 = string(1, dShortcut) + "-" + code + "D";
affect_3 = string(1, jShortcut) + "-" + code + "J";
filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, this->seed) : nullptr;
}
......@@ -119,8 +151,6 @@ Germline::Germline(string code, char shortcut, string path, json json_recom,
rep_5.add(path + filename);
}
init(code, shortcut, seed, max_indexing, build_automaton);
if (json_recom.find("4") != json_recom.end()) {
for (json::iterator it = json_recom["4"].begin();
it != json_recom["4"].end(); ++it)
......@@ -154,6 +184,8 @@ Germline::Germline(string code, char shortcut, string path, json json_recom,
rep_4.add(path + filename);
}
}
init(code, shortcut, seed, max_indexing, build_automaton);
}
int Germline::getMaxIndexing(){
......@@ -182,40 +214,87 @@ void Germline::set_index(IKmerStore<KmerAffect> *_index)
index->refs++ ;
}
MultiGermline *Germline::get_multigermline() const {
return this->multigermline;
}
void Germline::set_multigermline(MultiGermline *multi) {
this->multigermline = multi;
}
void Germline::update_index(IKmerStore<KmerAffect> *_index)
{
if (!_index) _index = index ;
_index->insert(rep_5, affect_5, max_indexing, seed);
_index->insert(rep_4, affect_4, 0, seed);
_index->insert(rep_3, affect_3, -max_indexing, seed);
_index->insert(rep_5, affect_5, this, max_indexing, seed);
_index->insert(rep_4, affect_4, this, 0, seed);
_index->insert(rep_3, affect_3, this, -max_indexing, seed);
}
void Germline::mark_as_ambiguous(Germline *other)
{
index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing, seed);
index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, this, max_indexing, seed);
if (other->affect_4.size())
index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, 0, seed);
index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, this, 0, seed);
index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing, seed);
index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, this, -max_indexing, seed);
}
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
rep_5 = index->getLabel(left);
rep_3 = index->getLabel(right);
Germline *left_germline = index->getLabel(left);
rep_5 = (left_germline) ? left_germline->rep_5 : BIOREADER_AMBIGUOUS;
Germline *right_germline = index->getLabel(right);
rep_3 = (right_germline) ? right_germline->rep_3 : BIOREADER_AMBIGUOUS;
if (multigermline) {
Germline *g = multigermline->get_germline(rep_5);
if (! overriden_filter) {
old_filter_5 = filter_5;
overriden_filter = true;
}
if (g)
filter_5 = g->getFilter_5();
else
filter_5 = nullptr;
}
}
FilterWithACAutomaton* Germline::getFilter_5(){
return this->filter_5;
}
char Germline::get_new_shortcut_when_conflict(char shortcut, BioReader &reader) {
char original_shortcut = shortcut;
if (used_shortcuts.count(shortcut) == 0) {
shortcut_conversion[shortcut] = shortcut;
return shortcut;
}
bool common_files = false;
for (auto filename: reader.filenames) {
if (filename_shortcut.count(filename) > 0) {
common_files = true;
break;
}
}
if (! common_files && reader.filenames.size() > 0) {
// We have the same shortcut, but different files
do {
shortcut = ((shortcut + 30) % 120) + 8;
} while (used_shortcuts.count(shortcut) > 0);
}
shortcut_conversion[shortcut] = original_shortcut;
return shortcut;
}
Germline::~Germline()
{
if(filter_5){
if(filter_5 && ! overriden_filter){
delete filter_5;
} else if (old_filter_5 && overriden_filter) {
delete old_filter_5;
}
if (index)
{
......@@ -387,7 +466,6 @@ void MultiGermline::build_with_one_index(string seed, bool set_index)
index = KmerStoreFactory<KmerAffect>::createIndex(indexType, expand_seed(seed), rc);
index->refs = 1;
insert_in_one_index(index, set_index);
index->multiple_in_one = true;
}
void MultiGermline::finish() {
......@@ -395,10 +473,19 @@ void MultiGermline::finish() {
index->finish_building();
}
for (auto germline: germlines) {
germline->set_multigermline(this);
rep_germlines[germline->rep_5.name] = germline;
rep_germlines[germline->rep_3.name] = germline;
germline->finish();
}
}
Germline *MultiGermline::get_germline(BioReader rep) {
if (rep_germlines.count(rep.name) == 0)
return nullptr;
return rep_germlines[rep.name];
}
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
......
......@@ -14,6 +14,8 @@
#include "bioreader.hpp"
#include "filter.h"
#include <climits>
#include <map>
#include <set>
enum SEGMENTATION_METHODS {
SEG_METHOD_53, // Regular or incomplete germlines, 5'-3'
......@@ -37,9 +39,17 @@ enum SEGMENTATION_METHODS {
using namespace std;
using json = nlohmann::json;
class MultiGermline;
class Germline {
private:
FilterWithACAutomaton* filter_5;
static map<string, char> filename_shortcut; /* Association between filename and shortcut */
static map<char, char> shortcut_conversion;
static set<char> used_shortcuts;
FilterWithACAutomaton* filter_5, *old_filter_5;
MultiGermline *multigermline;
bool overriden_filter;
int max_indexing;
......@@ -91,6 +101,12 @@ class Germline {
void new_index(IndexTypes type);
void set_index(IKmerStore<KmerAffect> *index);
MultiGermline *get_multigermline() const;
/**
* Sets the MultiGermline of this germline
*/
void set_multigermline(MultiGermline *multi);
void update_index(IKmerStore<KmerAffect> *_index = NULL);
void mark_as_ambiguous(Germline *other);
......@@ -104,6 +120,8 @@ class Germline {
*/
void override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right);
static char get_new_shortcut_when_conflict(char, BioReader &);
list <string> f_reps_5 ;
list <string> f_reps_4 ;
list <string> f_reps_3 ;
......@@ -133,6 +151,7 @@ enum GERMLINES_FILTER { GERMLINES_ALL,
class MultiGermline {
private:
IndexTypes indexType;
map <string, Germline*> rep_germlines;
public:
bool one_index_per_germline;
list <Germline*> germlines;
......@@ -150,6 +169,8 @@ class MultiGermline {
void insert(Germline *germline);
void add_germline(Germline *germline);
Germline *get_germline(BioReader rep);
/**
* Build from a json .g germline file
* path: path, such as 'germline/'
......
......@@ -10,6 +10,8 @@
#include "bioreader.hpp"
#include "tools.h"
class Germline;
using namespace std;
typedef
......@@ -85,9 +87,8 @@ public:
static int last_id;
int id; // id of this index
int refs; // number of germlines using this index
bool multiple_in_one;
list< pair <T, BioReader> > labels;
map<T, Germline *> labels;
IKmerStore();
......@@ -97,7 +98,7 @@ public:
* @param seed: the seed to use for indexing. By default it will be the seed of the index
* @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
*/
void insert(BioReader& input, const string& label="", int keep_only = 0, string seed = "");
void insert(BioReader& input, const string& label="", Germline *germline=NULL, int keep_only = 0, string seed = "");
/**
* @param input: A list of FASTA files
......@@ -105,7 +106,7 @@ public:
* @param seed: the seed to use for indexing. By default it will be the seed of the index
* @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
*/
void insert(list<BioReader>& input, const string& label="", int keep_only = 0, string seed = "");
void insert(list<BioReader>& input, const string& label="", Germline *germline=NULL, int keep_only = 0, string seed = "");
/**
* @param input: A sequence to be cut in k-mers
......@@ -173,7 +174,7 @@ public:
* @param kmer: a kmer
* @return one label associated with the kmer
*/
BioReader getLabel(T kmer) const;
Germline *getLabel(T kmer) const;
/**
* @return whether the index differentiate kmer types
......@@ -213,7 +214,6 @@ IKmerStore<T>::IKmerStore() {
id = ++last_id;
refs = 0;
finished_building = false;
multiple_in_one = false;
}
template<class T> int IKmerStore<T>::last_id = 0;
......@@ -277,26 +277,31 @@ IKmerStore<T>::~IKmerStore(){}
template<class T>
void IKmerStore<T>::insert(list<BioReader>& input,
const string &label,
Germline *germline,
int keep_only,
string seed){
for(list<BioReader>::iterator it = input.begin() ; it != input.end() ; it++){
insert(*it, label, keep_only, seed);
insert(*it, label, germline, keep_only, seed);
}
}
template<class T>
void IKmerStore<T>::insert(BioReader& input,
const string &label,
Germline *germline,
int keep_only,
string seed){
for (int r = 0; r < input.size(); r++) {
insert(input.sequence(r), label, true, keep_only, seed);
}
labels.push_back(make_pair(T(label, 1, seed.size()), input)) ;
T current_label = T(label, 1, seed.size());
if (labels.count(current_label) == 0) {
labels[current_label] = germline;
if (revcomp_indexed && ! T::hasRevcompSymetry()) {
labels.push_back(make_pair(T(label, -1, seed.size()), input)) ;
if (revcomp_indexed && ! T::hasRevcompSymetry()) {
labels[T(label, -1, seed.size())] = germline ;
}
}
}
......@@ -443,17 +448,16 @@ string IKmerStore<T>::getSeed() const {
}
template<class T>
BioReader IKmerStore<T>::getLabel(T kmer) const {
for (typename list< pair<T, BioReader> >::const_iterator it = labels.begin(); it != labels.end(); ++it)
if (it->first == kmer)
return it->second ;
Germline *IKmerStore<T>::getLabel(T kmer) const {
if (labels.count(kmer) > 0)
return labels.at(kmer);
// Nothing interesting found
// Try by ignoring length if the index is not able to deal with different lengths
if (! hasDifferentKmerTypes() && kmer.getLength() != (unsigned char)~0) {
kmer.setLength(~0);
return getLabel(kmer);
}
return BIOREADER_AMBIGUOUS ;
return NULL ;
}
template<class T>
......
......@@ -593,7 +593,22 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
}
} // endif Pseudo-germline
Germline *left_g = segmented_germline->index->getLabel(before);
Germline *right_g = segmented_germline->index->getLabel(after);
int order = left_g->code.compare(right_g->code);
// If germlines differ take the longest one
// if one is prefix of the other.
// (works for IGH/IGH+ for instance, what about TRA+D?)
if (order < 0) {
if (right_g->code.compare(0, left_g->code.size(), left_g->code) == 0)
segmented_germline = right_g;
} else if (order > 0) {
if (left_g->code.compare(0, right_g->code.size(), right_g->code) == 0)
segmented_germline = left_g;
} else {
segmented_germline = left_g;
}
computeSegmentation(strand, before, after, threshold, multiplier);
}
......@@ -626,6 +641,9 @@ KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermlin
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
if (germline->code != "unexpected") {
continue;
}
double incomplete_multiplier = 1;
if (germline->code.find("+") != string::npos) {
......@@ -1020,9 +1038,9 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
code = "Unexpected ";
code += left.toStringSigns() + germline->index->getLabel(left).basename;
code += left.toStringSigns() + germline->index->getLabel(left)->rep_5.basename;
code += "/";
code += right.toStringSigns() + germline->index->getLabel(right).basename;
code += right.toStringSigns() + germline->index->getLabel(right)->rep_3.basename;
info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";
if (germline->seg_method == SEG_METHOD_MAX1U)
......
......@@ -301,8 +301,8 @@ def header_igblast_results(ff_fasta, ff_igblast):
### Vidjil
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c designations -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c designations -1 -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -1 -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
def should_results_from_vidjil_output(f_log):
'''
......
......@@ -959,12 +959,12 @@ int main (int argc, char **argv)
IKmerStore<KmerAffect> *index = multigermline->index ;
// Initialize statistics, with two additional categories
index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), BIOREADER_AMBIGUOUS));
index->labels.push_back(make_pair(KmerAffect::getUnknown(), BIOREADER_UNKNOWN));
index->labels[KmerAffect::getAmbiguous()] = NULL;
index->labels[KmerAffect::getUnknown()] = NULL;
for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
for (auto it: index->labels)
{
char key = affect_char(it->first.affect) ;
char key = affect_char(it.first.affect) ;
stats_kmer[key] = 0 ;
stats_max[key] = 0 ;
}
......@@ -1016,12 +1016,12 @@ int main (int argc, char **argv)
<< endl ;
cout << "\t" << " max" << "\t\t" << " kmers" << "\n" ;
for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
for (auto it: index->labels)
{
if (it->first.getStrand() == -1)
if (it.first.getStrand() == -1)
continue ;
char key = affect_char(it->first.affect) ;
char key = affect_char(it.first.affect) ;
cout << setw(12) << stats_max[key] << " " ;
cout << setw(6) << fixed << setprecision(2) << (float) stats_max[key] / nb_reads * 100 << "%" ;
......@@ -1031,7 +1031,7 @@ int main (int argc, char **argv)
cout << setw(12) << stats_kmer[key] << " " ;
cout << setw(6) << fixed << setprecision(2) << (float) stats_kmer[key] / total_length * 100 << "%" ;
cout << " " << key << " " << it->second.name << endl ;
cout << " " << key << " " << it.second->code << endl ;
}
if (__only_on_exit__clean_memory) { delete multigermline; } return 0;
......
......@@ -5,6 +5,7 @@ import json
import fasta
import random
import argparse
import os.path
random.seed(33328778554)
......@@ -245,36 +246,37 @@ def list_int(s):
if __name__ == '__main__':
DESCRIPTION='Script generating fake V(D)J recombinations'
parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument('-g', '--germlines', type=file, default='germlines.data', help='path to the germlines.data file')
parser.add_argument('-g', '--germlines', type=file, default='homo-sapiens.g', help='path to the germlines.data file')
parser.add_argument('--deletions', '-d', type=list_int, default = [(lambda: 5)], help='List -- separated by colons -- of the number of deletions at junctions (or single value, if the number is the same everywhere).')
parser.add_argument('--insertions', '-i', type=list_int, default = [(lambda: 3)], help='List -- separated by colons -- of the number of insertions at junctions (or single value, if the number is the same everywhere')
parser.add_argument('--random-deletions', '-D', type=list_random_tuple, help='List of random deletions at junctions under the format mean,standard_deviation (or single value, if the number is the same everywhere')
parser.add_argument('--random-insertions', '-I', type=list_random_tuple, help='List of the number of insertions at junctions under the format mean,standard_deviation (or single value, if the number is the same everywhere')
parser.add_argument('-n', '--nb-recombinations', type=int, default=5, help='Number of times each recombination (with insertions/deletions) is generated')
parser.add_argument('-e', '--error', type=float, default = 0., help='Probability of error at the nucleotide level')
parser.add_argument('-e', '--error', type=float, default = 0., help='Probability of substitution at the nucleotide level')
parser.add_argument('-b', '--basename', default='generated', help='Basename used for generated filenames')
args = parser.parse_args()
germlines_json = args.germlines.read().replace('germline_data = ', '')
germlines = json.loads(germlines_json)
for code in germlines:
g = germlines[code]
for code in germlines["systems"]:
g = germlines["systems"][code]
print("--- %s - %-4s - %s" % (g['shortcut'], code, g['description']))
basepath = germlines["path"] + os.path.sep
# Read germlines
nb_recomb = 0
for recomb in g['recombinations']:
labels = ['V']
files = [recomb['5']]
files = [[basepath + f for f in recomb['5']]]
if '4' in recomb:
labels.append('D')
files.append(recomb['4'])
files.append([basepath + f for f in recomb['4']])
labels.append('J')
files.append(recomb['3'])
files.append([basepath + f for f in recomb['3']])
repertoire = vdj_repertoire.files(labels, files)
print(" 5: %3d sequences\n" % repertoire.nb_sequences('V'),
......@@ -286,14 +288,14 @@ if __name__ == '__main__':
code_in_filename = code_in_filename + '-%d' % (nb_recomb+1)
# Generate recombinations
recombination0 = vdj_recombination()
generate_to_file(repertoire, recombination0, code, '../data/gen/0-removes-%s.should-vdj.fa' % code_in_filename, 1)
# recombination0 = vdj_recombination()
# generate_to_file(repertoire, recombination0, code, '../data/gen/0-removes-%s.should-vdj.fa' % code_in_filename, 1)
deletions = args.deletions if args.random_deletions is None else args.random_deletions
insertions = args.insertions if args.random_insertions is None else args.random_insertions
recombination5 = vdj_recombination(deletions=deletions, insertions=insertions, processing = [(lambda s: mutate_sequence(s, args.error))])
generate_to_file(repertoire, recombination5, code, '../data/gen/5-removes-%s.should-vdj.fa' % code_in_filename, args.nb_recombinations)
generate_to_file(repertoire, recombination5, code, '../data/gen/%s-%s.should-vdj.fa' % (args.basename, code_in_filename), args.nb_recombinations)
print()