germline.cpp 8.82 KB
Newer Older
1 2

#include "germline.h"
3
#include <fstream>
4
#include <ctype.h>
5

6
void Germline::init(string _code, char _shortcut,
7
                    int _delta_min,
8
                    int max_indexing)
9
{
10
  seg_method = SEG_METHOD_53 ;
11 12
  code = _code ;
  shortcut = _shortcut ;
13
  index = 0 ;
14
  this->max_indexing = max_indexing;
15

16 17 18
  affect_5 = "V" ;
  affect_4 = "" ;
  affect_3 = "J" ;
19 20

  affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
21
  affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
22 23
  affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
     
24 25 26
  delta_min = _delta_min ;
}

27 28

Germline::Germline(string _code, char _shortcut,
29
		   int _delta_min,
30
                    int max_indexing)
31
{
32
  init(_code, _shortcut, _delta_min, max_indexing);
33 34 35
}


36 37
Germline::Germline(string _code, char _shortcut,
		   string f_rep_5, string f_rep_4, string f_rep_3,
38
		   int _delta_min,
39
                    int max_indexing)
40
{
41
  init(_code, _shortcut, _delta_min, max_indexing);
42

43 44 45 46
  f_reps_5.push_back(f_rep_5);
  f_reps_4.push_back(f_rep_4);
  f_reps_3.push_back(f_rep_3);

47
  /// no CYS104_IN_GAPPED_V / PHE118_TRP118_IN_GAPPED_J here ?
48 49 50
  rep_5 = Fasta(f_rep_5, 2, "|");
  rep_4 = Fasta(f_rep_4, 2, "|");
  rep_3 = Fasta(f_rep_3, 2, "|");
51 52 53

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
54 55 56
}


57 58
Germline::Germline(string _code, char _shortcut,
		   list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
59
		   int _delta_min,
60
                    int max_indexing)
61
{
62
  init(_code, _shortcut, _delta_min, max_indexing);
63 64 65 66 67

  f_reps_5 = _f_reps_5 ;
  f_reps_4 = _f_reps_4 ;
  f_reps_3 = _f_reps_3 ;

68
  rep_5 = Fasta(2, "|", CYS104_IN_GAPPED_V);
69
  rep_4 = Fasta(2, "|") ;
70
  rep_3 = Fasta(2, "|", PHE118_TRP118_IN_GAPPED_J);
71

72
  for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
73
    rep_5.add(*it);
74 75
  
  for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
76
    rep_4.add(*it);
77 78

  for (list<string>::const_iterator it = f_reps_3.begin(); it != f_reps_3.end(); ++it)
79
    rep_3.add(*it);
80 81 82

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
83 84 85
}


86 87
Germline::Germline(string _code, char _shortcut, 
           Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
88
		   int _delta_min,
89
                    int max_indexing)
90
{
91
  init(_code, _shortcut, _delta_min, max_indexing);
92 93 94 95

  rep_5 = _rep_5 ;
  rep_4 = _rep_4 ;
  rep_3 = _rep_3 ;
96 97 98

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
99 100
}

101 102 103
Germline::Germline(string code, char shortcut, string path, json json_recom, int max_indexing)
{
    
104
  int delta_min = -10;
105 106
  
  if (json_recom.find("4") != json_recom.end()) {
107
      delta_min = 0;
108 109 110 111
  }
  
  init(code, shortcut, delta_min, max_indexing);
  
112
  rep_5 = Fasta(2, "|", CYS104_IN_GAPPED_V) ;
113
  rep_4 = Fasta(2, "|") ;
114
  rep_3 = Fasta(2, "|", PHE118_TRP118_IN_GAPPED_J) ;
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140

  for (json::iterator it = json_recom["5"].begin();
       it != json_recom["5"].end(); ++it) 
  {
    string filename = *it;
    f_reps_5.push_back(path + filename);
    rep_5.add(path + filename);
  }
  
  if (json_recom.find("4") != json_recom.end()) {
    for (json::iterator it = json_recom["4"].begin();
        it != json_recom["4"].end(); ++it) 
    {
        string filename = *it;
        f_reps_4.push_back(path + filename);
        rep_4.add(path + filename);
    }
  }
  
  for (json::iterator it = json_recom["3"].begin();
       it != json_recom["3"].end(); ++it) 
  {
    string filename = *it;
    f_reps_3.push_back(path + filename);
    rep_3.add(path + filename);
  }
141 142 143

  if (rep_4.size())
    seg_method = SEG_METHOD_543 ;
144 145
}

146
void Germline::new_index(string seed)
147
{
148 149
  bool rc = true ;
  index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
150
  index->refs = 1;
151

152 153 154
  update_index();
}

155
void Germline::set_index(IKmerStore<KmerAffect> *_index)
156 157
{
  index = _index;
158
  index->refs++ ;
159 160 161
}


162
void Germline::update_index(IKmerStore<KmerAffect> *_index)
163
{
164
  if (!_index) _index = index ;
165

166 167
  _index->insert(rep_5, affect_5, max_indexing);
  _index->insert(rep_4, affect_4);
168
  _index->insert(rep_3, affect_3, -max_indexing);
169 170
}

171 172
void Germline::mark_as_ambiguous(Germline *other)
{
173
  index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing);
174 175 176 177

  if (other->affect_4.size())
    index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL);

178
  index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing);
179 180 181
}


182 183 184 185 186
void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
{
  rep_5 = index->getLabel(left);
  rep_3 = index->getLabel(right);
}
187

188 189
Germline::~Germline()
{
190
  if (index)
191 192 193 194
    {
      if (--(index->refs) == 0)
        delete index;
    }
195 196 197 198
}

ostream &operator<<(ostream &out, const Germline &germline)
{
199
  out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
200
      << setw(3) << germline.delta_min
201
      << " ";
202

203
  if (germline.index) {
204
    out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
205
    out << fixed << setprecision(3) << setw(8) << 100 * germline.index->getIndexLoad() << "%";
206
    out << " l" << germline.index->getS() << " k" << germline.index->getK() << " " << germline.index->getSeed() ; // TODO: there should be a << for index
207
  }
208 209

  out << endl;
210 211 212 213
  return out;
}


214
MultiGermline::MultiGermline(bool _one_index_per_germline)
215
{
216
  index = NULL;
217
  one_index_per_germline = _one_index_per_germline;
218 219
}

Mikaël Salson's avatar
Mikaël Salson committed
220 221 222 223 224 225
MultiGermline::~MultiGermline() {
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      delete *it ;
    }
}
226

227 228 229 230 231
void MultiGermline::insert(Germline *germline)
{
  germlines.push_back(germline);
}

232 233
void MultiGermline::add_germline(Germline *germline, string seed)
{
234 235
  if (one_index_per_germline)
    germline->new_index(seed);
236 237 238
  germlines.push_back(germline);
}

239
void MultiGermline::build_from_json(string path, string json_filename, int filter, int max_indexing)
240 241
{
  //parse germlines.data
242
  ifstream germline_data(path + "/" + json_filename);
243 244 245 246 247 248 249 250 251 252 253 254
  string content( (std::istreambuf_iterator<char>(germline_data) ),
                  (std::istreambuf_iterator<char>()    ) );

  json j = json::parse(content);
  
  //for each germline
  for (json::iterator it = j.begin(); it != j.end(); ++it) {
      
    json recom = it.value()["recombinations"];
    char shortcut = it.value()["shortcut"].dump()[1];
    string code = it.key();
    string seed = it.value()["parameters"]["seed"];
255 256 257 258 259 260 261 262 263 264 265 266 267 268

    switch (filter) {
    case GERMLINES_REGULAR:
      if (code.find("+") != string::npos) continue ;
      break ;

    case GERMLINES_INCOMPLETE:
      if (code.find("+") == string::npos) continue ;
      break ;

    default:
      break ;
    }

269 270 271 272 273 274 275 276
    map<string, string> seedMap;
    seedMap["13s"] = SEED_S13;
    seedMap["12s"] = SEED_S12;
    seedMap["10s"] = SEED_S10;
    seedMap["9s"] = SEED_9;
    
    //for each set of recombination 3/4/5
    for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
277
      add_germline(new Germline(code, shortcut, path + "/", *it2 , max_indexing), seedMap[seed]);
278 279 280 281 282
    }
  }
  
}

283

284
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
285
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
286 287 288 289
{
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
290

291 292
      if (germline->rep_4.size())
	germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
293

294 295 296 297
      germline->update_index(_index);

      if (set_index)
        germline->set_index(_index);
298 299 300
    }
}

301
void MultiGermline::build_with_one_index(string seed, bool set_index)
302 303 304
{
  bool rc = true ;
  index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
305
  insert_in_one_index(index, set_index);
306 307
}

308 309 310
/* Mark k-mers common to several germlines as ambiguous */
void MultiGermline::mark_cross_germlines_as_ambiguous()
{
311
  string VdJa = "TRA+D";
312
  
313 314 315 316
  for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
    {
      Germline *germline = *it ;
      cout << *germline << ":" ;
317 318 319 320

      // Skip VdJa
      if (!(germline->code.compare(VdJa)))
        continue;
321 322 323 324 325 326 327
      
      for (list<Germline*>::const_iterator it2 = germlines.begin(); it2 != germlines.end(); ++it2)
      {
        Germline *germline2 = *it2 ;
        if (germline2 == germline)
          continue ;

328 329 330
        // Skip germlines on a same system, such as 'D' (TRD) and 'd' (TRD+)
        if (toupper(germline2->shortcut) == toupper(germline->shortcut))
          continue;
331 332 333 334

        // Skip VdJa
        if (!(germline2->code.compare(VdJa)))
          continue;
335
        
336 337 338 339 340 341 342
        germline->mark_as_ambiguous(germline2);
      }

      cout << endl;
    }
}

343 344 345 346 347 348 349 350 351 352 353

ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
  for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
    {
      Germline *germline = *it ;
      out << "   " << *germline ;
    }

  return out;
}