germline.cpp 9.56 KB
Newer Older
1 2

#include "germline.h"
3
#include "automaton.hpp"
4
#include <fstream>
5
#include <ctype.h>
6

7
void Germline::init(string _code, char _shortcut,
8
                    int _delta_min, string seed,
9
                    int max_indexing)
10
{
11
  seg_method = SEG_METHOD_53 ;
12 13
  code = _code ;
  shortcut = _shortcut ;
14
  index = 0 ;
15
  this->max_indexing = max_indexing;
16
  this->seed = seed;
17

18 19 20
  affect_5 = "V" ;
  affect_4 = "" ;
  affect_3 = "J" ;
21 22

  affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
23
  affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
24 25
  affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
     
26 27 28
  delta_min = _delta_min ;
}

29 30

Germline::Germline(string _code, char _shortcut,
31
		   int _delta_min, string seed,
32
                    int max_indexing)
33
{
34
  init(_code, _shortcut, _delta_min, seed, max_indexing);
35 36 37
}


38 39
Germline::Germline(string _code, char _shortcut,
		   string f_rep_5, string f_rep_4, string f_rep_3,
40
		   int _delta_min, string seed,
41
                    int max_indexing)
42
{
43
  init(_code, _shortcut, _delta_min, seed, max_indexing);
44

45 46 47 48
  f_reps_5.push_back(f_rep_5);
  f_reps_4.push_back(f_rep_4);
  f_reps_3.push_back(f_rep_3);

49
  /// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
50 51 52
  rep_5 = Fasta(f_rep_5, 2, "|");
  rep_4 = Fasta(f_rep_4, 2, "|");
  rep_3 = Fasta(f_rep_3, 2, "|");
53 54 55

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
56 57 58
}


59 60
Germline::Germline(string _code, char _shortcut,
		   list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
61
		   int _delta_min, string seed,
62
                    int max_indexing)
63
{
64
  init(_code, _shortcut, _delta_min, seed, max_indexing);
65 66 67 68 69

  f_reps_5 = _f_reps_5 ;
  f_reps_4 = _f_reps_4 ;
  f_reps_3 = _f_reps_3 ;

70 71 72
  bool regular = (code.find("+") == string::npos);

  rep_5 = Fasta(2, "|", regular ? CYS104_IN_GAPPED_V : 0);
73
  rep_4 = Fasta(2, "|") ;
74
  rep_3 = Fasta(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0);
75

76
  for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
77
    rep_5.add(*it);
78 79
  
  for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
80
    rep_4.add(*it);
81 82

  for (list<string>::const_iterator it = f_reps_3.begin(); it != f_reps_3.end(); ++it)
83
    rep_3.add(*it);
84 85 86

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
87 88 89
}


90 91
Germline::Germline(string _code, char _shortcut, 
           Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
92
		   int _delta_min, string seed,
93
                    int max_indexing)
94
{
95
  init(_code, _shortcut, _delta_min, seed, max_indexing);
96 97 98 99

  rep_5 = _rep_5 ;
  rep_4 = _rep_4 ;
  rep_3 = _rep_3 ;
100 101 102

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
103 104
}

105 106
Germline::Germline(string code, char shortcut, string path, json json_recom,
                   string seed, int max_indexing)
107 108
{
    
109
  int delta_min = -10;
110 111
  
  if (json_recom.find("4") != json_recom.end()) {
112
      delta_min = 0;
113 114
  }
  
115
  init(code, shortcut, delta_min, seed, max_indexing);
116 117

  bool regular = (code.find("+") == string::npos);
118
  
119
  rep_5 = Fasta(2, "|", regular ? CYS104_IN_GAPPED_V : 0) ;
120
  rep_4 = Fasta(2, "|") ;
121
  rep_3 = Fasta(2, "|", regular ? PHE118_TRP118_IN_GAPPED_J : 0) ;
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147

  for (json::iterator it = json_recom["5"].begin();
       it != json_recom["5"].end(); ++it) 
  {
    string filename = *it;
    f_reps_5.push_back(path + filename);
    rep_5.add(path + filename);
  }
  
  if (json_recom.find("4") != json_recom.end()) {
    for (json::iterator it = json_recom["4"].begin();
        it != json_recom["4"].end(); ++it) 
    {
        string filename = *it;
        f_reps_4.push_back(path + filename);
        rep_4.add(path + filename);
    }
  }
  
  for (json::iterator it = json_recom["3"].begin();
       it != json_recom["3"].end(); ++it) 
  {
    string filename = *it;
    f_reps_3.push_back(path + filename);
    rep_3.add(path + filename);
  }
148 149 150

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
151 152
}

153 154 155 156 157
void Germline::finish() {
  if (index)
    index->finish_building();
}

158
void Germline::new_index()
159
{
160 161
  assert(! seed.empty());

162
  bool rc = true ;
163
  index = new PointerACAutomaton<KmerAffect>(seed, rc);
164
  index->refs = 1;
165

166 167 168
  update_index();
}

169
void Germline::set_index(IKmerStore<KmerAffect> *_index)
170 171
{
  index = _index;
172
  index->refs++ ;
173 174 175
}


176
void Germline::update_index(IKmerStore<KmerAffect> *_index)
177
{
178
  if (!_index) _index = index ;
179

180 181 182
  _index->insert(rep_5, affect_5, max_indexing, seed);
  _index->insert(rep_4, affect_4, 0, seed);
  _index->insert(rep_3, affect_3, -max_indexing, seed);
183 184
}

185 186
void Germline::mark_as_ambiguous(Germline *other)
{
187
  index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing, seed);
188 189

  if (other->affect_4.size())
190
    index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL, 0, seed);
191

192
  index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing, seed);
193 194 195
}


196 197 198 199 200
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
  rep_5 = index->getLabel(left);
  rep_3 = index->getLabel(right);
}
201

202 203
Germline::~Germline()
{
204
  if (index)
205 206 207 208
    {
      if (--(index->refs) == 0)
        delete index;
    }
209 210 211 212
}

ostream &operator<<(ostream &out, const Germline &germline)
{
213
  out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
214
      << setw(3) << germline.delta_min
215
      << " ";
216

217 218
  size_t seed_w = seed_weight(germline.seed);

219
  if (germline.index) {
220
    out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
221 222 223 224
    out << fixed << setprecision(3) << setw(8)
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, seed_w)) << "%" << " "
        << 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_3, 1, seed_w)) << "%";
    out << " l" << germline.seed.length() << " k" << seed_w << " " << germline.seed ;
225
  }
226 227

  out << endl;
228 229 230 231
  return out;
}


232
MultiGermline::MultiGermline(bool _one_index_per_germline)
233
{
234
  index = NULL;
235
  one_index_per_germline = _one_index_per_germline;
236 237
}

Mikaël Salson's avatar
Mikaël Salson committed
238 239 240 241 242 243
MultiGermline::~MultiGermline() {
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      delete *it ;
    }
}
244

245 246 247 248 249
void MultiGermline::insert(Germline *germline)
{
  germlines.push_back(germline);
}

250
void MultiGermline::add_germline(Germline *germline)
251
{
252
  if (one_index_per_germline)
253
    germline->new_index();
254 255 256
  germlines.push_back(germline);
}

257
void MultiGermline::build_from_json(string path, string json_filename, int filter, int max_indexing)
258 259
{
  //parse germlines.data
260
  ifstream germline_data(path + "/" + json_filename);
261 262 263 264 265 266 267 268 269 270 271 272
  string content( (std::istreambuf_iterator<char>(germline_data) ),
                  (std::istreambuf_iterator<char>()    ) );

  json j = json::parse(content);
  
  //for each germline
  for (json::iterator it = j.begin(); it != j.end(); ++it) {
      
    json recom = it.value()["recombinations"];
    char shortcut = it.value()["shortcut"].dump()[1];
    string code = it.key();
    string seed = it.value()["parameters"]["seed"];
273 274 275 276 277 278 279 280 281 282 283 284 285 286

    switch (filter) {
    case GERMLINES_REGULAR:
      if (code.find("+") != string::npos) continue ;
      break ;

    case GERMLINES_INCOMPLETE:
      if (code.find("+") == string::npos) continue ;
      break ;

    default:
      break ;
    }

287 288 289 290 291 292 293 294
    map<string, string> seedMap;
    seedMap["13s"] = SEED_S13;
    seedMap["12s"] = SEED_S12;
    seedMap["10s"] = SEED_S10;
    seedMap["9s"] = SEED_9;
    
    //for each set of recombination 3/4/5
    for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
295 296
      add_germline(new Germline(code, shortcut, path + "/", *it2 , seedMap[seed],
                                max_indexing));
297 298 299 300 301
    }
  }
  
}

302

303
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
304
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
305 306 307 308
{
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
309

310 311
      if (germline->rep_4.size())
	germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
312

313 314 315 316
      germline->update_index(_index);

      if (set_index)
        germline->set_index(_index);
317 318 319
    }
}

320
void MultiGermline::build_with_one_index(string seed, bool set_index)
321 322
{
  bool rc = true ;
323
  index = new PointerACAutomaton<KmerAffect>(seed, rc);
324
  insert_in_one_index(index, set_index);
325 326
}

327 328 329 330 331 332 333 334 335 336
void MultiGermline::finish() {
  if (index) {
    index->finish_building();
  } else {
    for (auto germline: germlines) {
      germline->finish();
    }
  }
}

337 338 339
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
340
  string VdJa = "TRA+D";
341
  
342 343 344 345
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
      cout << *germline << ":" ;
346 347 348 349

      // Skip VdJa
      if (!(germline->code.compare(VdJa)))
        continue;
350 351 352 353 354 355 356
      
      for (list<Germline*>::const_iterator it2 = germlines.begin(); it2 != germlines.end(); ++it2)
      {
        Germline *germline2 = *it2 ;
        if (germline2 == germline)
          continue ;

357 358 359
        // Skip germlines on a same system, such as 'D' (TRD) and 'd' (TRD+)
        if (toupper(germline2->shortcut) == toupper(germline->shortcut))
          continue;
360 361 362 363

        // Skip VdJa
        if (!(germline2->code.compare(VdJa)))
          continue;
364
        
365 366 367 368 369 370 371
        germline->mark_as_ambiguous(germline2);
      }

      cout << endl;
    }
}

372 373 374 375 376 377 378 379 380 381 382

ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
  for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
    {
      Germline *germline = *it ;
      out << "   " << *germline ;
    }

  return out;
}