germline.cpp 13.3 KB
Newer Older
1
#include "filter.h"
2
#include "germline.h"
3
#include "automaton.hpp"
4
#include "../lib/json.hpp"
5
#include <fstream>
6
#include <ctype.h>
7

8
void Germline::init(string _code, char _shortcut,
9
                    string seed_5, string seed_4, string seed_3,
10
                    int max_indexing, bool build_automaton)
11
{
12
  seg_method = SEG_METHOD_53 ;
13
14
  code = _code ;
  shortcut = _shortcut ;
15
  index = 0 ;
16
  this->max_indexing = max_indexing;
17

18
19
20
  this->seed_5 = expand_seed(seed_5);
  this->seed_4 = expand_seed(seed_4);
  this->seed_3 = expand_seed(seed_3);
21

22
23
24
  affect_5 = "V" ;
  affect_4 = "" ;
  affect_3 = "J" ;
25
26

  affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
27
  affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
28
  affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
29
  filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, this->seed_5) : nullptr;
30
31
}

32
33

Germline::Germline(string _code, char _shortcut,
34
                   string seed_5, string seed_4, string seed_3, int max_indexing, bool build_automaton)
35
{
36
  init(_code, _shortcut, seed_5, seed_4, seed_3, max_indexing, build_automaton);
37
38
39
}


40
Germline::Germline(string _code, char _shortcut,
41
                   string f_rep_5, string f_rep_4, string f_rep_3,
42
                   string seed_5, string seed_4, string seed_3, int max_indexing, bool build_automaton)
43
44
{

45
46
47
48
  f_reps_5.push_back(f_rep_5);
  f_reps_4.push_back(f_rep_4);
  f_reps_3.push_back(f_rep_3);

49
  /// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
50
51
52
  rep_5 = BioReader(f_rep_5, 2, "|");
  rep_4 = BioReader(f_rep_4, 2, "|");
  rep_3 = BioReader(f_rep_3, 2, "|");
53

54
  init(_code, _shortcut, seed_5, seed_4, seed_3, max_indexing, build_automaton);
55
56
57

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
58
59
60
}


61
Germline::Germline(string _code, char _shortcut,
62
                   list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
63
                   string seed_5, string seed_4, string seed_3, int max_indexing, bool build_automaton)
64
65
66
67
68
69
{

  f_reps_5 = _f_reps_5 ;
  f_reps_4 = _f_reps_4 ;
  f_reps_3 = _f_reps_3 ;

70
71
  bool regular = (code.find("+") == string::npos);

72
73
74
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0);
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0);
75

76
  for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
77
    rep_5.add(*it);
78

79
  init(_code, _shortcut, seed_5, seed_4, seed_3, max_indexing, build_automaton);
80

81
  for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
82
    rep_4.add(*it);
83
84

  for (list<string>::const_iterator it = f_reps_3.begin(); it != f_reps_3.end(); ++it)
85
    rep_3.add(*it);
86
87
88

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
89
90
91
}


92
Germline::Germline(string _code, char _shortcut, 
93
           BioReader _rep_5, BioReader _rep_4, BioReader _rep_3,
94
                   string seed_5, string seed_4, string seed_3, int max_indexing, bool build_automaton)
95
96
97
98
{
  rep_5 = _rep_5 ;
  rep_4 = _rep_4 ;
  rep_3 = _rep_3 ;
99

100
  init(_code, _shortcut, seed_5, seed_4, seed_3, max_indexing, build_automaton);
101
102
103

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
104
105
}

106
Germline::Germline(string code, char shortcut, string path, json json_recom,
107
                   string seed_5, string seed_4, string seed_3, int max_indexing, bool build_automaton)
108
{
109
110

  bool regular = (code.find("+") == string::npos);
111
  
112
113
114
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0) ;
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0) ;
115
116
117
118
119
120
121
122

  for (json::iterator it = json_recom["5"].begin();
       it != json_recom["5"].end(); ++it) 
  {
    string filename = *it;
    f_reps_5.push_back(path + filename);
    rep_5.add(path + filename);
  }
123

124
  init(code, shortcut, seed_5, seed_4, seed_3, max_indexing, build_automaton);
125

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  if (json_recom.find("4") != json_recom.end()) {
    for (json::iterator it = json_recom["4"].begin();
        it != json_recom["4"].end(); ++it) 
    {
        string filename = *it;
        f_reps_4.push_back(path + filename);
        rep_4.add(path + filename);
    }
  }
  
  for (json::iterator it = json_recom["3"].begin();
       it != json_recom["3"].end(); ++it) 
  {
    string filename = *it;
    f_reps_3.push_back(path + filename);
    rep_3.add(path + filename);
  }
143
144
145

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
146

147
148
149
150
151
152
153
154
155
156
157
158
  // SEG_METHOD_ONE
  if (json_recom.find("1") != json_recom.end())
    {
      seg_method = SEG_METHOD_ONE ;
      for (json::iterator it = json_recom["1"].begin();
           it != json_recom["1"].end(); ++it)
        {
          string filename = *it;
          f_reps_4.push_back(path + filename);
          rep_4.add(path + filename);
        }
    }
159
160
}

161
162
163
164
int Germline::getMaxIndexing(){
  return this->max_indexing;
}

165
166
167
168
169
void Germline::finish() {
  if (index)
    index->finish_building();
}

170
void Germline::new_index(IndexTypes type)
171
{
172
173
174
  assert(! seed_5.empty() && (seed_5.find(SEED_YES) != std::string::npos));
  assert(! seed_4.empty() && (seed_4.find(SEED_YES) != std::string::npos));
  assert(! seed_3.empty() && (seed_3.find(SEED_YES) != std::string::npos));
175

176
  bool rc = true ;
177
  index = KmerStoreFactory<KmerAffect>::createIndex(type, seed_5, rc);
178
  index->refs = 1;
179

180
181
182
  update_index();
}

183
void Germline::set_index(IKmerStore<KmerAffect> *_index)
184
{
185
186
187
188
  if (index != _index) {
    index = _index;
    index->refs++ ;
  }
189
190
191
}


192
void Germline::update_index(IKmerStore<KmerAffect> *_index)
193
{
194
  if (!_index) _index = index ;
195

196
197
198
  _index->insert(rep_5, affect_5, max_indexing, seed_5);
  _index->insert(rep_4, affect_4, 0, seed_4);
  _index->insert(rep_3, affect_3, -max_indexing, seed_3);
199
200
}

201
202
void Germline::mark_as_ambiguous(Germline *other)
{
203
  index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing, seed_5);
204
205

  if (other->affect_4.size())
206
    index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, 0, seed_4);
207

208
  index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing, seed_3);
209
210
}

211
212
213
214
215
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
  rep_5 = index->getLabel(left);
  rep_3 = index->getLabel(right);
}
216

217
218
FilterWithACAutomaton* Germline::getFilter_5(){
  return this->filter_5;
219
220
}

221
222
Germline::~Germline()
{
223
224
  if(filter_5){
    delete filter_5;
225
  }
226
  if (index)
227
228
229
230
    {
      if (--(index->refs) == 0)
        delete index;
    }
231
232
233
234
}

ostream &operator<<(ostream &out, const Germline &germline)
{
235
  out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
236
      << " ";
237

238
239
240
241
  size_t seed_5_span = germline.seed_5.size();
  size_t seed_5_w = seed_weight(germline.seed_5);
  size_t seed_3_span = germline.seed_3.size();
  size_t seed_3_w = seed_weight(germline.seed_3);
242

243
  if (germline.index) {
244
    out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
245
    out << fixed << setprecision(3) << setw(8)
246
247
248
249
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, seed_5_span)) << "%" << " "
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_3, 1, seed_3_span)) << "%";
    out << " l" << germline.seed_5.length() << " k" << seed_5_w << " " << germline.seed_5 ;
    out << " l" << germline.seed_3.length() << " k" << seed_3_w << " " << germline.seed_3 ;
250
  }
251
252

  out << endl;
253
254
255
256
  return out;
}


257
MultiGermline::MultiGermline(IndexTypes indexType, bool _one_index_per_germline):indexType(indexType)
258
{
259
  ref = "custom germlines" ;
260
  species = "custom germlines" ;
261
  species_taxon_id = 0 ;
262
  index = NULL;
263
  one_index_per_germline = _one_index_per_germline;
264
265
}

Mikaël Salson's avatar
Mikaël Salson committed
266
MultiGermline::~MultiGermline() {
267
268
269
270
  if (index && --(index->refs) == 0) {
    delete index;
  }
  for (list<Germline*>::iterator it = germlines.begin(); it != germlines.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
271
272
273
274
    {
      delete *it ;
    }
}
275

276
277
278
279
280
void MultiGermline::insert(Germline *germline)
{
  germlines.push_back(germline);
}

281
void MultiGermline::add_germline(Germline *germline)
282
{
283
  if (one_index_per_germline)
284
    germline->new_index(indexType);
285
286
287
  germlines.push_back(germline);
}

288
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter,
289
                                    string default_seed, int default_max_indexing, bool build_automaton)
290
{
291
292
293
294
295
296
297
298
299
300
301

  //extract json_filename and systems_filter
  string json_filename = json_filename_and_filter;
  string systems_filter = "";

  size_t pos_lastcolon = json_filename_and_filter.find_last_of(':');
  if (pos_lastcolon != std::string::npos) {
    json_filename = json_filename_and_filter.substr(0, pos_lastcolon);
    systems_filter = "," + json_filename_and_filter.substr(pos_lastcolon+1) + "," ;
  }

302

303
304
305
306
307
308
309
310
311
312
313
  //open and parse .g file
  json germlines ;

  try {
    ifstream germline_data(path + "/" + json_filename);

    string content( (std::istreambuf_iterator<char>(germline_data) ),
                    (std::istreambuf_iterator<char>()    ) );

    germlines = json::parse(content);

314
  } catch (const invalid_argument &e) {
315
316
317
    cerr << ERROR_STRING << "Vidjil cannot open .g file " << path + "/" + json_filename << ": " << e.what() << endl;
    exit(1);
  }
318

319
  ref = germlines["ref"].get<std::string>();
320
321
322
  species = germlines["species"].get<std::string>();
  species_taxon_id = germlines["species_taxon_id"];

323
324
  path += "/" + germlines["path"].get<std::string>();

325
  json j = germlines["systems"];
326
327
328
  
  //for each germline
  for (json::iterator it = j.begin(); it != j.end(); ++it) {
329
    int max_indexing = default_max_indexing;
330
      
331
332
333
    json json_value = it.value();
    json recom = json_value["recombinations"];
    char shortcut = json_value["shortcut"].dump()[1];
334
    string code = it.key();
335
    json json_parameters = json_value["parameters"];
336
337
    string seed;
    string seed_5, seed_4, seed_3;
338

339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
    if (json_parameters.find("seed") != json_parameters.end()) {
      seed = json_parameters["seed"];
    }
    if (default_seed.size() > 0)
      seed = default_seed;
    if (seed.size() > 0)
      seed_5 = seed_4 = seed_3 = seed;
    if (json_parameters.find("seed_5") != json_parameters.end()) {
      seed_5 = json_parameters["seed_5"];
    }
    if (json_parameters.find("seed_4") != json_parameters.end()) {
      seed_4 = json_parameters["seed_4"];
    }
    if (json_parameters.find("seed_3") != json_parameters.end()) {
      seed_3 = json_parameters["seed_3"];
    }
    
356
    if (default_max_indexing == 0) {
357
358
      if (json_parameters.count("trim_sequences") > 0) {
        max_indexing = json_parameters["trim_sequences"];
359
360
      }
    }
361

362
363
364
365
366
367
368
369
    if (systems_filter.size())
      {
        // match 'TRG' inside 'IGH,TRG'
        // TODO: code a more flexible match, regex ?
        if (systems_filter.find("," + code + ",") == string::npos)
          continue ;
      }

370
371
372
373
374
375
376
377
378
379
380
381
382
    switch (filter) {
    case GERMLINES_REGULAR:
      if (code.find("+") != string::npos) continue ;
      break ;

    case GERMLINES_INCOMPLETE:
      if (code.find("+") == string::npos) continue ;
      break ;

    default:
      break ;
    }

383
384
    //for each set of recombination 3/4/5
    for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
385
      add_germline(new Germline(code, shortcut, path + "/", *it2,
386
                                seed_5, seed_4, seed_3, max_indexing, build_automaton));
387
388
    }
  }
389

390
391
}

392
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
393
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
394
395
396
397
{
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
398

399
400
      if (germline->rep_4.size())
	germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
401

402
403
404
405
      germline->update_index(_index);

      if (set_index)
        germline->set_index(_index);
406
407
408
    }
}

409
void MultiGermline::build_with_one_index(string seed, bool set_index)
410
411
{
  bool rc = true ;
412
  index = KmerStoreFactory<KmerAffect>::createIndex(indexType, expand_seed(seed), rc);
413
  index->refs = 1;
414
  insert_in_one_index(index, set_index);
415
  index->multiple_in_one = true;
416
417
}

418
419
420
void MultiGermline::finish() {
  if (index) {
    index->finish_building();
421
422
423
  }
  for (auto germline: germlines) {
    germline->finish();
424
425
426
  }
}

427
428
429
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
430
  string VdJa = "TRA+D";
431
  
432
433
434
435
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
      cout << *germline << ":" ;
436
437
438
439

      // Skip VdJa
      if (!(germline->code.compare(VdJa)))
        continue;
440
441
442
443
444
445
446
      
      for (list<Germline*>::const_iterator it2 = germlines.begin(); it2 != germlines.end(); ++it2)
      {
        Germline *germline2 = *it2 ;
        if (germline2 == germline)
          continue ;

447
448
449
        // Skip germlines on a same system, such as 'D' (TRD) and 'd' (TRD+)
        if (toupper(germline2->shortcut) == toupper(germline->shortcut))
          continue;
450
451
452
453

        // Skip VdJa
        if (!(germline2->code.compare(VdJa)))
          continue;
454
        
455
456
457
458
459
460
461
        germline->mark_as_ambiguous(germline2);
      }

      cout << endl;
    }
}

462
463
464

ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
465
466
  out << multigermline.species << " (" << multigermline.species_taxon_id << ")" << endl ;

467
468
469
470
471
472
473
474
  for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
    {
      Germline *germline = *it ;
      out << "   " << *germline ;
    }

  return out;
}