Attention une mise à jour du serveur va être effectuée le lundi 17 mai entre 13h et 13h30. Cette mise à jour va générer une interruption du service de quelques minutes.

germline.cpp 11.5 KB
Newer Older
1 2

#include "germline.h"
3
#include "automaton.hpp"
4
#include <fstream>
5
#include <ctype.h>
6

7
void Germline::init(string _code, char _shortcut,
8
                    string seed,
9
                    int max_indexing)
10
{
11
  seg_method = SEG_METHOD_53 ;
12 13
  code = _code ;
  shortcut = _shortcut ;
14
  index = 0 ;
15
  this->max_indexing = max_indexing;
16
  this->seed = seed;
17 18
  if (this->seed.size() == 0)
    this->seed = DEFAULT_GERMLINE_SEED;
19

20 21 22
  affect_5 = "V" ;
  affect_4 = "" ;
  affect_3 = "J" ;
23 24

  affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
25
  affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
26
  affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
27 28
}

29 30

Germline::Germline(string _code, char _shortcut,
31
		   string seed, int max_indexing)
32
{
33
  init(_code, _shortcut, seed, max_indexing);
34 35 36
}


37 38
Germline::Germline(string _code, char _shortcut,
		   string f_rep_5, string f_rep_4, string f_rep_3,
39
		   string seed, int max_indexing)
40
{
41
  init(_code, _shortcut, seed, max_indexing);
42

43 44 45 46
  f_reps_5.push_back(f_rep_5);
  f_reps_4.push_back(f_rep_4);
  f_reps_3.push_back(f_rep_3);

47
  /// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
48 49 50
  rep_5 = BioReader(f_rep_5, 2, "|");
  rep_4 = BioReader(f_rep_4, 2, "|");
  rep_3 = BioReader(f_rep_3, 2, "|");
51 52 53

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
54 55 56
}


57 58
Germline::Germline(string _code, char _shortcut,
		   list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
59
		   string seed, int max_indexing)
60
{
61
  init(_code, _shortcut, seed, max_indexing);
62 63 64 65 66

  f_reps_5 = _f_reps_5 ;
  f_reps_4 = _f_reps_4 ;
  f_reps_3 = _f_reps_3 ;

67 68
  bool regular = (code.find("+") == string::npos);

69 70 71
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0);
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0);
72

73
  for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
74
    rep_5.add(*it);
75 76
  
  for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
77
    rep_4.add(*it);
78 79

  for (list<string>::const_iterator it = f_reps_3.begin(); it != f_reps_3.end(); ++it)
80
    rep_3.add(*it);
81 82 83

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
84 85 86
}


87
Germline::Germline(string _code, char _shortcut, 
88
           BioReader _rep_5, BioReader _rep_4, BioReader _rep_3,
89
                   string seed, int max_indexing)
90
{
91
  init(_code, _shortcut, seed, max_indexing);
92 93 94 95

  rep_5 = _rep_5 ;
  rep_4 = _rep_4 ;
  rep_3 = _rep_3 ;
96 97 98

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
99 100
}

101
Germline::Germline(string code, char shortcut, string path, json json_recom,
102
                   string seed, int max_indexing)
103
{
104
  init(code, shortcut, seed, max_indexing);
105 106

  bool regular = (code.find("+") == string::npos);
107
  
108 109 110
  rep_5 = BioReader(2, "|", regular ? CYS104_IN_GAPPED_V : 0) ;
  rep_4 = BioReader(2, "|") ;
  rep_3 = BioReader(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0) ;
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136

  for (json::iterator it = json_recom["5"].begin();
       it != json_recom["5"].end(); ++it) 
  {
    string filename = *it;
    f_reps_5.push_back(path + filename);
    rep_5.add(path + filename);
  }
  
  if (json_recom.find("4") != json_recom.end()) {
    for (json::iterator it = json_recom["4"].begin();
        it != json_recom["4"].end(); ++it) 
    {
        string filename = *it;
        f_reps_4.push_back(path + filename);
        rep_4.add(path + filename);
    }
  }
  
  for (json::iterator it = json_recom["3"].begin();
       it != json_recom["3"].end(); ++it) 
  {
    string filename = *it;
    f_reps_3.push_back(path + filename);
    rep_3.add(path + filename);
  }
137 138 139

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
140

141 142 143 144 145 146 147 148 149 150 151 152
  // SEG_METHOD_ONE
  if (json_recom.find("1") != json_recom.end())
    {
      seg_method = SEG_METHOD_ONE ;
      for (json::iterator it = json_recom["1"].begin();
           it != json_recom["1"].end(); ++it)
        {
          string filename = *it;
          f_reps_4.push_back(path + filename);
          rep_4.add(path + filename);
        }
    }
153 154
}

155 156 157 158 159
void Germline::finish() {
  if (index)
    index->finish_building();
}

160
void Germline::new_index(IndexTypes type)
161
{
162 163
  assert(! seed.empty());

164
  bool rc = true ;
165
  index = KmerStoreFactory<KmerAffect>::createIndex(type, seed, rc);
166
  index->refs = 1;
167

168 169 170
  update_index();
}

171
void Germline::set_index(IKmerStore<KmerAffect> *_index)
172 173
{
  index = _index;
174
  index->refs++ ;
175 176 177
}


178
void Germline::update_index(IKmerStore<KmerAffect> *_index)
179
{
180
  if (!_index) _index = index ;
181

182 183 184
  _index->insert(rep_5, affect_5, max_indexing, seed);
  _index->insert(rep_4, affect_4, 0, seed);
  _index->insert(rep_3, affect_3, -max_indexing, seed);
185 186
}

187 188
void Germline::mark_as_ambiguous(Germline *other)
{
189
  index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing, seed);
190 191

  if (other->affect_4.size())
192
    index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, 0, seed);
193

194
  index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing, seed);
195 196
}

197 198 199 200 201
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
  rep_5 = index->getLabel(left);
  rep_3 = index->getLabel(right);
}
202

203 204
Germline::~Germline()
{
205
  if (index)
206 207 208 209
    {
      if (--(index->refs) == 0)
        delete index;
    }
210 211 212 213
}

ostream &operator<<(ostream &out, const Germline &germline)
{
214
  out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
215
      << " ";
216

217
  size_t seed_span = germline.seed.size();
218 219
  size_t seed_w = seed_weight(germline.seed);

220
  if (germline.index) {
221
    out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
222
    out << fixed << setprecision(3) << setw(8)
223 224
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, seed_span)) << "%" << " "
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_3, 1, seed_span)) << "%";
225
    out << " l" << germline.seed.length() << " k" << seed_w << " " << germline.seed ;
226
  }
227 228

  out << endl;
229 230 231 232
  return out;
}


233
MultiGermline::MultiGermline(IndexTypes indexType, bool _one_index_per_germline):indexType(indexType)
234
{
235
  ref = "custom germlines" ;
236
  species = "custom germlines" ;
237
  species_taxon_id = 0 ;
238
  index = NULL;
239
  one_index_per_germline = _one_index_per_germline;
240 241
}

Mikaël Salson's avatar
Mikaël Salson committed
242
MultiGermline::~MultiGermline() {
243 244 245 246
  if (index && --(index->refs) == 0) {
    delete index;
  }
  for (list<Germline*>::iterator it = germlines.begin(); it != germlines.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
247 248 249 250
    {
      delete *it ;
    }
}
251

252 253 254 255 256
void MultiGermline::insert(Germline *germline)
{
  germlines.push_back(germline);
}

257
void MultiGermline::add_germline(Germline *germline)
258
{
259
  if (one_index_per_germline)
260
    germline->new_index(indexType);
261 262 263
  germlines.push_back(germline);
}

264
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter,
265
                                    string default_seed, int default_max_indexing)
266
{
267 268 269 270 271 272 273 274 275 276 277

  //extract json_filename and systems_filter
  string json_filename = json_filename_and_filter;
  string systems_filter = "";

  size_t pos_lastcolon = json_filename_and_filter.find_last_of(':');
  if (pos_lastcolon != std::string::npos) {
    json_filename = json_filename_and_filter.substr(0, pos_lastcolon);
    systems_filter = "," + json_filename_and_filter.substr(pos_lastcolon+1) + "," ;
  }

278

279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
  //open and parse .g file
  json germlines ;

  try {
    ifstream germline_data(path + "/" + json_filename);

    string content( (std::istreambuf_iterator<char>(germline_data) ),
                    (std::istreambuf_iterator<char>()    ) );

    germlines = json::parse(content);

  } catch (const invalid_argument e) {
    cerr << ERROR_STRING << "Vidjil cannot open .g file " << path + "/" + json_filename << ": " << e.what() << endl;
    exit(1);
  }
294

295
  ref = germlines["ref"].get<std::string>();
296 297 298
  species = germlines["species"].get<std::string>();
  species_taxon_id = germlines["species_taxon_id"];

299 300
  path += "/" + germlines["path"].get<std::string>();

301
  json j = germlines["systems"];
302 303 304
  
  //for each germline
  for (json::iterator it = j.begin(); it != j.end(); ++it) {
305
    int max_indexing = default_max_indexing;
306
      
307 308 309
    json json_value = it.value();
    json recom = json_value["recombinations"];
    char shortcut = json_value["shortcut"].dump()[1];
310
    string code = it.key();
311
    json json_parameters = json_value["parameters"];
312 313 314
    string seed = json_parameters["seed"];

    if (default_max_indexing == 0) {
315 316
      if (json_parameters.count("trim_sequences") > 0) {
        max_indexing = json_parameters["trim_sequences"];
317 318
      }
    }
319

320 321 322 323 324 325 326 327
    if (systems_filter.size())
      {
        // match 'TRG' inside 'IGH,TRG'
        // TODO: code a more flexible match, regex ?
        if (systems_filter.find("," + code + ",") == string::npos)
          continue ;
      }

328 329 330 331 332 333 334 335 336 337 338 339 340
    switch (filter) {
    case GERMLINES_REGULAR:
      if (code.find("+") != string::npos) continue ;
      break ;

    case GERMLINES_INCOMPLETE:
      if (code.find("+") == string::npos) continue ;
      break ;

    default:
      break ;
    }

341 342 343 344 345
    map<string, string> seedMap;
    seedMap["13s"] = SEED_S13;
    seedMap["12s"] = SEED_S12;
    seedMap["10s"] = SEED_S10;
    seedMap["9s"] = SEED_9;
346

347
    seed = (default_seed.size() == 0) ? seedMap[seed] : default_seed;
348 349 350
    
    //for each set of recombination 3/4/5
    for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
351 352
      add_germline(new Germline(code, shortcut, path + "/", *it2,
                                seed, max_indexing));
353 354 355 356 357
    }
  }
  
}

358
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
359
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
360 361 362 363
{
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
364

365 366
      if (germline->rep_4.size())
	germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
367

368 369 370 371
      germline->update_index(_index);

      if (set_index)
        germline->set_index(_index);
372 373 374
    }
}

375
void MultiGermline::build_with_one_index(string seed, bool set_index)
376 377
{
  bool rc = true ;
378
  index = KmerStoreFactory<KmerAffect>::createIndex(indexType, seed, rc);
379
  index->refs = 1;
380
  insert_in_one_index(index, set_index);
381 382
}

383 384 385
void MultiGermline::finish() {
  if (index) {
    index->finish_building();
386 387 388
  }
  for (auto germline: germlines) {
    germline->finish();
389 390 391
  }
}

392 393 394
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
395
  string VdJa = "TRA+D";
396
  
397 398 399 400
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
      cout << *germline << ":" ;
401 402 403 404

      // Skip VdJa
      if (!(germline->code.compare(VdJa)))
        continue;
405 406 407 408 409 410 411
      
      for (list<Germline*>::const_iterator it2 = germlines.begin(); it2 != germlines.end(); ++it2)
      {
        Germline *germline2 = *it2 ;
        if (germline2 == germline)
          continue ;

412 413 414
        // Skip germlines on a same system, such as 'D' (TRD) and 'd' (TRD+)
        if (toupper(germline2->shortcut) == toupper(germline->shortcut))
          continue;
415 416 417 418

        // Skip VdJa
        if (!(germline2->code.compare(VdJa)))
          continue;
419
        
420 421 422 423 424 425 426
        germline->mark_as_ambiguous(germline2);
      }

      cout << endl;
    }
}

427 428 429

ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
430 431
  out << multigermline.species << " (" << multigermline.species_taxon_id << ")" << endl ;

432 433 434 435 436 437 438 439
  for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
    {
      Germline *germline = *it ;
      out << "   " << *germline ;
    }

  return out;
}