germline.cpp 11.9 KB
Newer Older
1
#include "filter.h"
2
#include "germline.h"
3
#include "automaton.hpp"
4
#include <fstream>
5
#include <ctype.h>
6

7
void Germline::init(string _code, char _shortcut,
8
                    string seed,
9
                    int max_indexing, bool build_automaton)
10
{
11
  seg_method = SEG_METHOD_53 ;
12 13
  code = _code ;
  shortcut = _shortcut ;
14
  index = 0 ;
15
  this->max_indexing = max_indexing;
16
  this->seed = seed;
17 18
  if (this->seed.size() == 0)
    this->seed = DEFAULT_GERMLINE_SEED;
19

20 21 22
  affect_5 = "V" ;
  affect_4 = "" ;
  affect_3 = "J" ;
23 24

  affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
25
  affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
26
  affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
27
  filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, seed) : nullptr;
28 29
}

30 31

Germline::Germline(string _code, char _shortcut,
32
                   string seed, int max_indexing, bool build_automaton)
33
{
34
  init(_code, _shortcut, seed, max_indexing, build_automaton);
35 36 37
}


38
Germline::Germline(string _code, char _shortcut,
39 40
                   string f_rep_5, string f_rep_4, string f_rep_3,
                   string seed, int max_indexing, bool build_automaton)
41 42
{

43 44 45 46
  f_reps_5.push_back(f_rep_5);
  f_reps_4.push_back(f_rep_4);
  f_reps_3.push_back(f_rep_3);

47
  /// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
48 49 50
  rep_5 = BioReader(f_rep_5, 2, "|");
  rep_4 = BioReader(f_rep_4, 2, "|");
  rep_3 = BioReader(f_rep_3, 2, "|");
51 52

  init(_code, _shortcut, seed, max_indexing, build_automaton);
53 54 55

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
56 57 58
}


59
Germline::Germline(string _code, char _shortcut,
60 61
                   list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
                   string seed, int max_indexing, bool build_automaton)
62 63 64 65 66 67
{

  f_reps_5 = _f_reps_5 ;
  f_reps_4 = _f_reps_4 ;
  f_reps_3 = _f_reps_3 ;

68 69
  bool regular = (code.find("+") == string::npos);

70 71 72
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0);
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0);
73

74
  for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
75
    rep_5.add(*it);
76 77 78

  init(_code, _shortcut, seed, max_indexing, build_automaton);

79
  for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
80
    rep_4.add(*it);
81 82

  for (list<string>::const_iterator it = f_reps_3.begin(); it != f_reps_3.end(); ++it)
83
    rep_3.add(*it);
84 85 86

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
87 88 89
}


90
Germline::Germline(string _code, char _shortcut, 
91
           BioReader _rep_5, BioReader _rep_4, BioReader _rep_3,
92
                   string seed, int max_indexing, bool build_automaton)
93 94 95 96
{
  rep_5 = _rep_5 ;
  rep_4 = _rep_4 ;
  rep_3 = _rep_3 ;
97 98

  init(_code, _shortcut, seed, max_indexing, build_automaton);
99 100 101

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
102 103
}

104
Germline::Germline(string code, char shortcut, string path, json json_recom,
105
                   string seed, int max_indexing, bool build_automaton)
106
{
107 108

  bool regular = (code.find("+") == string::npos);
109
  
110 111 112
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0) ;
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0) ;
113 114 115 116 117 118 119 120

  for (json::iterator it = json_recom["5"].begin();
       it != json_recom["5"].end(); ++it) 
  {
    string filename = *it;
    f_reps_5.push_back(path + filename);
    rep_5.add(path + filename);
  }
121 122 123

  init(code, shortcut, seed, max_indexing, build_automaton);

124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
  if (json_recom.find("4") != json_recom.end()) {
    for (json::iterator it = json_recom["4"].begin();
        it != json_recom["4"].end(); ++it) 
    {
        string filename = *it;
        f_reps_4.push_back(path + filename);
        rep_4.add(path + filename);
    }
  }
  
  for (json::iterator it = json_recom["3"].begin();
       it != json_recom["3"].end(); ++it) 
  {
    string filename = *it;
    f_reps_3.push_back(path + filename);
    rep_3.add(path + filename);
  }
141 142 143

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
144

145 146 147 148 149 150 151 152 153 154 155 156
  // SEG_METHOD_ONE
  if (json_recom.find("1") != json_recom.end())
    {
      seg_method = SEG_METHOD_ONE ;
      for (json::iterator it = json_recom["1"].begin();
           it != json_recom["1"].end(); ++it)
        {
          string filename = *it;
          f_reps_4.push_back(path + filename);
          rep_4.add(path + filename);
        }
    }
157 158
}

159 160 161 162
int Germline::getMaxIndexing(){
  return this->max_indexing;
}

163 164 165 166 167
void Germline::finish() {
  if (index)
    index->finish_building();
}

168
void Germline::new_index(IndexTypes type)
169
{
170 171
  assert(! seed.empty());

172
  bool rc = true ;
173
  index = KmerStoreFactory<KmerAffect>::createIndex(type, seed, rc);
174
  index->refs = 1;
175

176 177 178
  update_index();
}

179
void Germline::set_index(IKmerStore<KmerAffect> *_index)
180 181
{
  index = _index;
182
  index->refs++ ;
183 184 185
}


186
void Germline::update_index(IKmerStore<KmerAffect> *_index)
187
{
188
  if (!_index) _index = index ;
189

190 191 192
  _index->insert(rep_5, affect_5, max_indexing, seed);
  _index->insert(rep_4, affect_4, 0, seed);
  _index->insert(rep_3, affect_3, -max_indexing, seed);
193 194
}

195 196
void Germline::mark_as_ambiguous(Germline *other)
{
197
  index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing, seed);
198 199

  if (other->affect_4.size())
200
    index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, 0, seed);
201

202
  index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing, seed);
203 204
}

205 206 207 208 209
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
  rep_5 = index->getLabel(left);
  rep_3 = index->getLabel(right);
}
210

211 212
FilterWithACAutomaton* Germline::getFilter_5(){
  return this->filter_5;
213 214
}

215 216
Germline::~Germline()
{
217 218
  if(filter_5){
    delete filter_5;
219
  }
220
  if (index)
221 222 223 224
    {
      if (--(index->refs) == 0)
        delete index;
    }
225 226 227 228
}

ostream &operator<<(ostream &out, const Germline &germline)
{
229
  out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
230
      << " ";
231

232
  size_t seed_span = germline.seed.size();
233 234
  size_t seed_w = seed_weight(germline.seed);

235
  if (germline.index) {
236
    out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
237
    out << fixed << setprecision(3) << setw(8)
238 239
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, seed_span)) << "%" << " "
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_3, 1, seed_span)) << "%";
240
    out << " l" << germline.seed.length() << " k" << seed_w << " " << germline.seed ;
241
  }
242 243

  out << endl;
244 245 246 247
  return out;
}


248
MultiGermline::MultiGermline(IndexTypes indexType, bool _one_index_per_germline):indexType(indexType)
249
{
250
  ref = "custom germlines" ;
251
  species = "custom germlines" ;
252
  species_taxon_id = 0 ;
253
  index = NULL;
254
  one_index_per_germline = _one_index_per_germline;
255 256
}

Mikael Salson's avatar
Mikael Salson committed
257
MultiGermline::~MultiGermline() {
258 259 260 261
  if (index && --(index->refs) == 0) {
    delete index;
  }
  for (list<Germline*>::iterator it = germlines.begin(); it != germlines.end(); ++it)
Mikael Salson's avatar
Mikael Salson committed
262 263 264 265
    {
      delete *it ;
    }
}
266

267 268 269 270 271
void MultiGermline::insert(Germline *germline)
{
  germlines.push_back(germline);
}

272
void MultiGermline::add_germline(Germline *germline)
273
{
274
  if (one_index_per_germline)
275
    germline->new_index(indexType);
276 277 278
  germlines.push_back(germline);
}

279
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter,
280
                                    string default_seed, int default_max_indexing, bool build_automaton)
281
{
282 283 284 285 286 287 288 289 290 291 292

  //extract json_filename and systems_filter
  string json_filename = json_filename_and_filter;
  string systems_filter = "";

  size_t pos_lastcolon = json_filename_and_filter.find_last_of(':');
  if (pos_lastcolon != std::string::npos) {
    json_filename = json_filename_and_filter.substr(0, pos_lastcolon);
    systems_filter = "," + json_filename_and_filter.substr(pos_lastcolon+1) + "," ;
  }

293

294 295 296 297 298 299 300 301 302 303 304
  //open and parse .g file
  json germlines ;

  try {
    ifstream germline_data(path + "/" + json_filename);

    string content( (std::istreambuf_iterator<char>(germline_data) ),
                    (std::istreambuf_iterator<char>()    ) );

    germlines = json::parse(content);

305
  } catch (const invalid_argument &e) {
306 307 308
    cerr << ERROR_STRING << "Vidjil cannot open .g file " << path + "/" + json_filename << ": " << e.what() << endl;
    exit(1);
  }
309

310
  ref = germlines["ref"].get<std::string>();
311 312 313
  species = germlines["species"].get<std::string>();
  species_taxon_id = germlines["species_taxon_id"];

314 315
  path += "/" + germlines["path"].get<std::string>();

316
  json j = germlines["systems"];
317 318 319
  
  //for each germline
  for (json::iterator it = j.begin(); it != j.end(); ++it) {
320
    int max_indexing = default_max_indexing;
321
      
322 323 324
    json json_value = it.value();
    json recom = json_value["recombinations"];
    char shortcut = json_value["shortcut"].dump()[1];
325
    string code = it.key();
326
    json json_parameters = json_value["parameters"];
327 328 329
    string seed = json_parameters["seed"];

    if (default_max_indexing == 0) {
330 331
      if (json_parameters.count("trim_sequences") > 0) {
        max_indexing = json_parameters["trim_sequences"];
332 333
      }
    }
334

335 336 337 338 339 340 341 342
    if (systems_filter.size())
      {
        // match 'TRG' inside 'IGH,TRG'
        // TODO: code a more flexible match, regex ?
        if (systems_filter.find("," + code + ",") == string::npos)
          continue ;
      }

343 344 345 346 347 348 349 350 351 352 353 354 355
    switch (filter) {
    case GERMLINES_REGULAR:
      if (code.find("+") != string::npos) continue ;
      break ;

    case GERMLINES_INCOMPLETE:
      if (code.find("+") == string::npos) continue ;
      break ;

    default:
      break ;
    }

356
    seed = (default_seed.size() == 0) ? seed : default_seed;
357

358 359
    //for each set of recombination 3/4/5
    for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
360
      add_germline(new Germline(code, shortcut, path + "/", *it2,
361
                                seed, max_indexing, build_automaton));
362 363
    }
  }
364

365 366
}

367
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
368
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
369 370 371 372
{
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
373

374 375
      if (germline->rep_4.size())
	germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
376

377 378 379 380
      germline->update_index(_index);

      if (set_index)
        germline->set_index(_index);
381 382 383
    }
}

384
void MultiGermline::build_with_one_index(string seed, bool set_index)
385 386
{
  bool rc = true ;
387
  index = KmerStoreFactory<KmerAffect>::createIndex(indexType, seed, rc);
388
  index->refs = 1;
389
  insert_in_one_index(index, set_index);
390 391
}

392 393 394
void MultiGermline::finish() {
  if (index) {
    index->finish_building();
395 396 397
  }
  for (auto germline: germlines) {
    germline->finish();
398 399 400
  }
}

401 402 403
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
404
  string VdJa = "TRA+D";
405
  
406 407 408 409
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
      cout << *germline << ":" ;
410 411 412 413

      // Skip VdJa
      if (!(germline->code.compare(VdJa)))
        continue;
414 415 416 417 418 419 420
      
      for (list<Germline*>::const_iterator it2 = germlines.begin(); it2 != germlines.end(); ++it2)
      {
        Germline *germline2 = *it2 ;
        if (germline2 == germline)
          continue ;

421 422 423
        // Skip germlines on a same system, such as 'D' (TRD) and 'd' (TRD+)
        if (toupper(germline2->shortcut) == toupper(germline->shortcut))
          continue;
424 425 426 427

        // Skip VdJa
        if (!(germline2->code.compare(VdJa)))
          continue;
428
        
429 430 431 432 433 434 435
        germline->mark_as_ambiguous(germline2);
      }

      cout << endl;
    }
}

436 437 438

ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
439 440
  out << multigermline.species << " (" << multigermline.species_taxon_id << ")" << endl ;

441 442 443 444 445 446 447 448
  for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
    {
      Germline *germline = *it ;
      out << "   " << *germline ;
    }

  return out;
}