vidjil.cpp 41.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21 22 23 24 25 26 27 28 29 30 31 32 33 34

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
35
#include "core/json.h"
36
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
37 38 39
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
40
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
41 42 43 44 45 46 47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50 51 52 53 54 55 56 57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

58 59 60
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

61
#define VIDJIL_JSON_VERSION "2014.10"
Mikaël Salson's avatar
Mikaël Salson committed
62

Mathieu Giraud's avatar
Mathieu Giraud committed
63 64
//$$ #define (mainly default options)

65
#define DEFAULT_MULTIGERMLINE "germline"
Mathieu Giraud's avatar
Mathieu Giraud committed
66 67
#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
68
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
69
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
70

Mathieu Giraud's avatar
Mathieu Giraud committed
71
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
72
#define DEFAULT_MIN_READS_CLONE 5
73
#define DEFAULT_MAX_REPRESENTATIVES 100
74 75
#define DEFAULT_MAX_CLONES 20
#define DEFAULT_RATIO_READS_CLONE 0.0
76
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
77

Mathieu Giraud's avatar
Mathieu Giraud committed
78
#define COMMAND_WINDOWS "windows"
79
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
80
#define COMMAND_SEGMENT "segment"
81
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
82
 
83
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
84

85 86 87
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
88
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
89
#define CLONE_FILENAME "clone.fa-"
90 91 92 93 94 95
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
#define UNSEGMENTED_FILENAME ".unsegmented.fa"
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
96 97 98

// "tests/data/leukemia.fa" 

99
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
100
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
101
#define DEFAULT_W_D    60
102
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
103 104

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
105
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
106 107

#define DEFAULT_DELTA_MIN_D  0
108
#define DEFAULT_DELTA_MAX_D  80
Mikaël Salson's avatar
Mikaël Salson committed
109

Mikaël Salson's avatar
Mikaël Salson committed
110
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
111 112
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mikaël Salson's avatar
Mikaël Salson committed
113 114 115 116 117 118
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

119 120 121
// error
#define ERROR_STRING "[error] "

122
// warn
123
#define WARN_MAX_CLONES 100
124
#define WARN_PERCENT_SEGMENTED 40
125

Mikaël Salson's avatar
Mikaël Salson committed
126 127 128
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
129

Mikaël Salson's avatar
Mikaël Salson committed
130 131 132

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
133
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
134

Mathieu Giraud's avatar
Mathieu Giraud committed
135
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
136 137 138 139 140

extern int optind, optopt, opterr;

void usage(char *progname)
{
141
  cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
142 143

  cerr << "Command selection" << endl
144 145
       << "  -c <command> \t" << COMMAND_WINDOWS << "  \t window extracting" << endl 
       << "  \t\t" << COMMAND_CLONES  << "  \t clone gathering (default)" << endl 
146
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation (not recommended)" << endl
147
       << "  \t\t" << COMMAND_GERMLINES << "  \t discover all germlines" << endl
Mikaël Salson's avatar
Mikaël Salson committed
148 149 150 151
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
152
       << "  -D <file>     D germline multi-fasta file (implies -d)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
153
       << "  -J <file>     J germline multi-fasta file" << endl
154
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
155
       << "  -g <path>     multiple germlines (experimental)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
156 157
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
158
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
159
#ifndef NO_SPACED_SEEDS
160
       << "  (use either -s or -k option, but not both)" << endl
161 162
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
163
#endif
164
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
165 166 167
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
168 169 170
       << "  -m <int>      minimal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MIN << ") (default with -d: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -d: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ") (default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
171 172
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
173 174
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
175 176
       << endl

177
       << "Limits to report a clone (or a window)" << endl
178 179
       << "  -r <nb>       minimal number of reads supporting a clone (default: " << DEFAULT_MIN_READS_CLONE << ")" << endl
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << DEFAULT_RATIO_READS_CLONE << ")" << endl
180 181
       << endl

182
       << "Limits to further analyze some clones" << endl
183
       << "  -y <nb>       maximal number of clones computed with a representative ('" << NO_LIMIT << "': no limit) (default: " << DEFAULT_MAX_REPRESENTATIVES << ")" << endl
184
       << "  -z <nb>       maximal number of clones to be segmented ('" << NO_LIMIT << "': no limit, do not use) (default: " << DEFAULT_MAX_CLONES << ")" << endl
185
       << "  -A            reports and segments all clones (-r 0 -% 0 -y " << NO_LIMIT << " -z " << NO_LIMIT << "), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
186 187
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
188
       << "Fine segmentation options (second pass, see warning in doc/algo.org)" << endl
189
       << "  -d            segment into V(D)J components instead of VJ (and resets -m, -M and -w options)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
190 191 192
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

193 194 195 196 197 198 199 200 201
       << "Additional clustering" << endl
       << "  -e <file>     manual clustering -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

202
       << "Debug" << endl
203 204
       << "  -U            output segmented (in " << SEGMENTED_FILENAME << " file) sequences" << endl
       << "  -u            output unsegmented (in " << UNSEGMENTED_FILENAME << " file) sequences" << endl
205
       << "                and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
206
       << "Output" << endl
207
       << "  -o <dir>      output directory (default: " << DEFAULT_OUT_DIR << ")" <<  endl
208
       << "  -b <string>   output basename (by default basename of the input file)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
209
    
210
       << "  -a            output all sequences by cluster (" << CLONE_FILENAME << "*), to be used only on small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
211 212 213
       << "  -v            verbose mode" << endl
       << endl        

214 215 216
       << "The full help is available in the doc/algo.org file."
       << endl

Mikaël Salson's avatar
Mikaël Salson committed
217
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
218
       << "Examples (see doc/algo.org)" << endl
219
       << "  " << progname << " -c clones   -G germline/IGH  -r 5       -d data/Stanford_S22.fasta" << endl
220
       << "  " << progname << " -c segment  -G germline/IGH             -d data/Stanford_S22.fasta   # (only for testing)" << endl
221
       << "  " << progname << " -c germlines                               data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
222 223 224 225 226 227
    ;
  exit(1);
}

int main (int argc, char **argv)
{
228
  cout << "# Vidjil -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
229 230
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
231 232 233 234 235
       << endl
       << "# Vidjil is free software, and you are welcome to redistribute it" << endl
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
Mikaël Salson's avatar
Mikaël Salson committed
236 237
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
238 239
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
240
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
241 242 243 244 245
  
  list <string> f_reps_V ;
  list <string> f_reps_D ;
  list <string> f_reps_J ;

Mikaël Salson's avatar
Mikaël Salson committed
246 247
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
248
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
249

250
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
251 252 253 254
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
255 256 257
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
258 259 260 261 262 263 264 265 266 267 268
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
269
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
270

271
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
272 273 274
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
275 276
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
277
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
278
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
279

Mikaël Salson's avatar
Mikaël Salson committed
280 281 282 283 284
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine

  bool output_sequences_by_cluster = false;
285
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
286
  bool output_unsegmented = false;
287
  bool multi_germline = false;
288
  string multi_germline_file = DEFAULT_MULTIGERMLINE;
Mikaël Salson's avatar
Mikaël Salson committed
289 290 291

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
292
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
293 294 295

  char c ;

296 297
  int options_s_k = 0 ;

298 299 300
  //JsonArray which contains the Levenshtein distances
  JsonArray jsonLevenshtein;

Mathieu Giraud's avatar
Mathieu Giraud committed
301 302
  //$$ options: getopt

303
  while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:f:l:dc:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
304 305 306 307 308

    switch (c)
      {
      case 'h':
        usage(argv[0]);
309

Mikaël Salson's avatar
Mikaël Salson committed
310
      case 'c':
311 312
        if (!strcmp(COMMAND_CLONES,optarg))
          command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
313 314
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
315 316
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
317 318
        else if (!strcmp(COMMAND_GERMLINES,optarg))
          command = CMD_GERMLINES;
Mikaël Salson's avatar
Mikaël Salson committed
319 320 321 322 323
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
324

Mikaël Salson's avatar
Mikaël Salson committed
325 326 327 328

      // Germline

      case 'V':
329
	f_reps_V.push_back(optarg);
330
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
331 332 333
	break;

      case 'D':
334
	f_reps_D.push_back(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
335
        segment_D = 1;
336 337 338
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
339 340 341
	break;
        
      case 'J':
342
	f_reps_J.push_back(optarg);
343
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
344 345
	break;

346
      case 'g':
347 348
	multi_germline = true;
	multi_germline_file = string(optarg);
349 350
	break;
	
Mikaël Salson's avatar
Mikaël Salson committed
351
      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
352
	germline_system = string(optarg);
353 354 355
	f_reps_V.push_back((germline_system + "V.fa").c_str()) ;
	f_reps_D.push_back((germline_system + "D.fa").c_str()) ;
	f_reps_J.push_back((germline_system + "J.fa").c_str()) ;
356
	germline_system = extract_basename(germline_system);
Mikaël Salson's avatar
Mikaël Salson committed
357 358 359 360 361
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

362 363 364 365 366 367 368 369 370 371
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
	options_s_k++ ;
#else
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
        break;

Mikaël Salson's avatar
Mikaël Salson committed
372 373 374
      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
375
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
376 377 378 379 380 381 382 383 384 385 386 387 388 389
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

390 391
      // Output 

Mikaël Salson's avatar
Mikaël Salson committed
392 393 394 395
      case 'o':
        out_dir = optarg ;
        break;

396 397
      case 'b':
        f_basename = optarg;
Mikaël Salson's avatar
Mikaël Salson committed
398 399
        break;

400 401 402 403 404 405 406 407
      case 'a':
	output_sequences_by_cluster = true;
	break;

      case 'v':
	verbose += 1 ;
	break;

408 409
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
410 411 412 413
      case '%':
	ratio_reads_clone = atof(optarg);
	break;

414
      case 'r':
Mikaël Salson's avatar
Mikaël Salson committed
415 416 417
	min_reads_clone = atoi(optarg);
        break;

418
      case 'y':
419 420 421 422 423
	if (!strcmp(NO_LIMIT, optarg))
	  {
	    max_representatives = -1;
	    break;
	  }
424 425 426
	max_representatives = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
427
      case 'z':
428 429 430 431 432
	if (!strcmp(NO_LIMIT, optarg))
	  {
	    max_clones = -1;
	    break;
	  }
Mikaël Salson's avatar
Mikaël Salson committed
433 434
	max_clones = atoi(optarg);
        break;
435 436 437

      case 'A': // --all
	ratio_reads_clone = 0 ;
438 439 440
	min_reads_clone = 1 ;
	max_representatives = -1 ;
	max_clones = -1 ;
441 442
	break ;

443 444 445 446
      case 'l':
	windows_labels_file = optarg; 
	break;

447
      // Clustering
448 449 450 451

      case 'e':
	forced_edges = optarg;
	break;
Mikaël Salson's avatar
Mikaël Salson committed
452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
473 474 475 476 477 478 479 480 481
      // Fine segmentation

      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
	default_w = DEFAULT_W_D ;
	break;

482
      case 'f':
Mikaël Salson's avatar
Mikaël Salson committed
483 484
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
485 486 487 488

      case 'u':
        output_unsegmented = true;
        break;
489 490 491
      case 'U':
        output_segmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
492 493
      }

494 495 496

  //$$ options: post-processing+display

Mikaël Salson's avatar
Mikaël Salson committed
497 498 499 500 501
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
502 503
  if (options_s_k > 1)
    {
504
      cout << ERROR_STRING << "Use at most one -s or -k option." << endl ;
505 506 507
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
508 509
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
510 511 512 513 514 515 516 517 518 519 520 521 522
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
523
      cout << ERROR_STRING << "Wrong number of arguments." << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
524 525
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
526

527
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
528 529 530 531 532 533 534 535 536 537 538 539 540 541

  // Default repertoires

  if (f_reps_V.empty())
    f_reps_V.push_back(DEFAULT_V_REP) ;

  if (f_reps_D.empty())
    f_reps_D.push_back(DEFAULT_D_REP) ;

  if (f_reps_J.empty())
    f_reps_J.push_back(DEFAULT_J_REP) ;

  if (!segment_D)
    f_reps_D.clear();
542

543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560
  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
561
    cout << ERROR_STRING << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
562 563 564 565 566
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
567 568 569 570
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
571
      cout << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
572 573 574 575
      exit(1);
    }
#endif

576 577 578

  if (w < seed_weight(seed))
    {
579
      cout << ERROR_STRING << "Too small -w. The window size should be at least equal to the seed size (" << seed_weight(seed) << ")." << endl;
580 581 582
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
583 584 585 586
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
587
    cout << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
588 589 590
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
591 592
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
593
    cout << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
594 595 596
    exit(2);
  }

597 598 599 600 601
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
602 603 604
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
605
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
606 607

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
608
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
609
    break;
610
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
611 612 613
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
614 615
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
616 617
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
618
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
619
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
620
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
621
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
622
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
623 624 625 626 627 628 629 630 631 632 633 634

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
635
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
636 637 638 639 640 641


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
642
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
643
#else
644
  cout << "# development version" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
645 646
#endif

647 648 649
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
#endif
650

651

652 653
  //////////////////////////////////
  // Warning for non-optimal use
654

655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-A' option or with a large '-z' option" << endl ;
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-c segment' option" << endl ;
    }
  
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
    {
      cout << "* Vidjil purpose is to extract very quickly windows overlapping the CDR3" << endl
	   << "* and to gather reads into clones (-c clones), and not to provide an accurate V(D)J segmentation." << endl
	   << "* The following segmentations are slow to compute and are provided only for convenience." << endl
	   << "* They should be checked with other softwares such as IgBlast, iHHMune-align or IMGT/V-QUEST." << endl 
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
676

677 678 679
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
680

681
  MultiGermline *multigermline = new MultiGermline();
682

683 684 685 686 687 688 689 690
    {
      cout << "Build Kmer indexes" << endl ;
    
      if (multi_germline)
	{
	  multigermline->build_default_set(multi_germline_file);
	}
      else if (command == CMD_GERMLINES)
691
	{
692 693
	  multigermline->load_standard_set(multi_germline_file);
	  multigermline->build_with_one_index(seed);
694
	}
695 696 697 698 699
      else
	{
	  // Custom germline
	  Germline *germline;
	  germline = new Germline(germline_system, 'X',
700 701 702
                                  f_reps_V, f_reps_D, f_reps_J, 
                                  delta_min, delta_max);

703 704 705

	  if (command == CMD_WINDOWS || command == CMD_CLONES)
	    germline->new_index(seed);
706

707 708 709
	  multigermline->insert(germline);
	}
    }
710

711 712 713 714 715 716 717 718

  //////////////////////////////////
  //$$ Read sequence files
 
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
719
  } catch (const invalid_argument e) {
720
    cout << ERROR_STRING << "Vidjil cannot open reads file " << f_reads << ": " << e.what() << endl;
721 722 723 724 725 726
    exit(1);
  }

  out_dir += "/";


727 728 729 730 731
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
732
      map <char, int> stats_kmer, stats_max;
733
      IKmerStore<KmerAffect> *index = multigermline->index ;
734 735

      // Initialize statistics, with two additional categories
736 737
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), AFFECT_AMBIGUOUS_SYMBOL));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), AFFECT_UNKNOWN_SYMBOL));
738 739
      
      for (list< pair <KmerAffect, string> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
740
	{
741 742 743
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
744 745
	}
      
746
      // init forbidden for .max()
747 748 749
      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());
750
      
751 752 753
      // Loop through all reads

      int nb_reads = 0 ;
754 755 756
      int total_length = 0 ;
      int s = index->getS();

757 758 759 760 761
      while (reads->hasNext())
	{
	  reads->next();
	  nb_reads++;
	  string seq = reads->getSequence().sequence;
762
	  total_length += seq.length() - s + 1;
763

764
	  KmerAffectAnalyser *kaa = new KmerAffectAnalyser(*index, seq);
765 766 767

	  for (int i = 0; i < kaa->count(); i++) 
	    { 
768 769
	      KmerAffect ksa = kaa->getAffectation(i);
	      stats_kmer[affect_char(ksa.affect)]++ ;
770
	    }
771

772
          delete kaa;
773

774
	  CountKmerAffectAnalyser ckaa(*index, seq);
775 776
	  ckaa.setAllowedOverlap(k-1);

777
	  stats_max[affect_char(ckaa.max(forbidden).affect)]++ ;
778

779 780
	}

781 782
      delete reads;

783 784
      // Display statistics

785 786 787 788
      cout << "  <== "
	   << nb_reads << " reads, "
	   << total_length << " kmers"
	   << endl ;
789
      cout << "\t" << " max" << "\t\t" << "        kmers" << "\n" ;
790 791

      for (list< pair <KmerAffect, string> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
792
	{
793 794 795 796
	  char key = affect_char(it->first.affect) ;
	  
	  cout << setw(12) << stats_max[key] << " " ;
	  cout << setw(6) << fixed << setprecision(2) <<  (float) stats_max[key] / nb_reads * 100 << "%" ;
797

798
	  cout << "     " ;
799

800 801
	  cout << setw(12) << stats_kmer[key] << " " ;
	  cout << setw(6) << fixed << setprecision(2) <<  (float) stats_kmer[key] / total_length * 100 << "%" ;
802

803
	  cout << "     " << key << " " << it->second << endl ;
804
	}
805 806
      
      delete index;
807 808 809 810
      exit(0);
    }


Mikaël Salson's avatar
Mikaël Salson committed
811 812 813
  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
814
  if (command == CMD_CLONES || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
815 816

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
817 818 819 820 821 822
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
823 824 825


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
826
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
827

Mathieu Giraud's avatar
Mathieu Giraud committed
828 829
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
830

831
    ofstream *out_segmented = NULL;
Mathieu Giraud's avatar
Mathieu Giraud committed
832
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
833
 
Mathieu Giraud's avatar
Mathieu Giraud committed
834
    WindowExtractor we;
835 836
 
    if (output_segmented) {
837
      string f_segmented = out_dir + f_basename + SEGMENTED_FILENAME ;
838 839 840 841 842
      cout << "  ==> " << f_segmented << endl ;
      out_segmented = new ofstream(f_segmented.c_str());
      we.setSegmentedOutput(out_segmented);
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
843
    if (output_unsegmented) {
844
      string f_unsegmented = out_dir + f_basename +  UNSEGMENTED_FILENAME ;
Mathieu Giraud's avatar
Mathieu Giraud committed
845 846 847 848
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
849

850 851

    WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels);
852
    windowsStorage->setIdToAll();
Mathieu Giraud's avatar
Mathieu Giraud committed
853
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
854 855


Mathieu Giraud's avatar
Mathieu Giraud committed
856
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
857

858 859
        
    ostringstream stream_segmentation_info;
Mikaël Salson's avatar
Mikaël Salson committed
860

Mathieu Giraud's avatar
Mathieu Giraud committed
861 862
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
863

864
    stream_segmentation_info << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
865
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
866 867
	<< endl ;

868
    // nb_segmented is the main denominator for the following
Mathieu Giraud's avatar
Mathieu Giraud committed
869
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
870
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
871

872
    stream_segmentation_info << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
873
	<< " in " << nb_segmented << " segments"
874
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
875 876
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
877
    // warn if there are too few segmented sequences
878
    if (ratio_segmented < WARN_PERCENT_SEGMENTED)
879
      {
880 881
        stream_segmentation_info << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
        stream_segmentation_info << "  ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
882 883
      }

884
    stream_segmentation_info << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
885

886
    multigermline->out_stats(stream_segmentation_info);
887 888
    stream_segmentation_info << endl;
    we.out_stats(stream_segmentation_info);
Mikaël Salson's avatar
Mikaël Salson committed
889
    
890
    cout << stream_segmentation_info.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
891
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
892
    
Mikaël Salson's avatar
Mikaël Salson committed
893 894

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
895
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
896
	
Mathieu Giraud's avatar
Mathieu Giraud committed
897 898
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
899 900

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
901
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
902 903
	//////////////////////////////////

904
	string f_all_windows = out_dir + f_basename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
905
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
906

Mathieu Giraud's avatar
Mathieu Giraud committed
907
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
908
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
909 910 911


    //////////////////////////////////
912 913 914 915 916 917 918
    //$$ min_reads_clone (ou label)

    int min_reads_clone_ratio = (int) (ratio_reads_clone * nb_segmented / 100.0);
    cout << "Considering only labeled windows and windows with >= " << min_reads_clone << " reads"
	 << " and with a ratio >= " << ratio_reads_clone << " (" << min_reads_clone_ratio << ")" << endl ;

    int min_reads_clone_final = max(min_reads_clone, min_reads_clone_ratio);
Mathieu Giraud's avatar
Mathieu Giraud committed
919

920
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_clone_final);
Mikaël Salson's avatar
Mikaël Salson committed
921
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
922 923
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
924

925 926 927 928
    if (windowsStorage->size() == 0)
      {
	cout << "  ! No windows with current parameters." << endl;
      }
929

930 931 932 933 934 935
    //////////////////////////////////
    //$$ Clustering
    windowsStorage->sort();
    list<pair <junction, int> > sort_clones = windowsStorage->getSortedList();
    cout << "  ==> " << sort_clones.size() << " clones" << endl ;
    
936
    list <list <junction> > clones_windows;
937
    comp_matrix comp=comp_matrix(sort_clones);
938
      
939
    if (command == CMD_CLONES) {
Mikaël Salson's avatar
Mikaël Salson committed
940 941 942

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
943
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
944 945 946

	if (load_comp==1) 
	  {
947
	    comp.load((out_dir+f_basename + "." + comp_filename).c_str());
Mikaël Salson's avatar
Mikaël Salson committed
948 949 950
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
951
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
952 953 954 955
	  }
	
	if (save_comp==1)
	  {
956
	    comp.save(( out_dir+f_basename + "." + comp_filename).c_str());
Mikaël Salson's avatar
Mikaël Salson committed
957 958
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
959
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
960
	comp.stat_cluster(clones_windows, cout );
Mikaël Salson's avatar
Mikaël Salson committed
961
	comp.del();
962
	cout << "  ==> " << clones_windows.size() << " clusters" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
963 964
      } 
    else
965 966
      { 
	cout << "No clustering" << endl ; 
Mikaël Salson's avatar
Mikaël Salson committed
967 968
      }

969 970 971 972 973 974 975 976 977

    // TODO: output clones_windows (.data, other places ?)

    // TODO: Are these constraints checked somewhere ? keepInterestingWindows ?
    // if (labeled 
    //     || ((clone_nb_reads >= min_reads_clone) 
    //		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))

    if (sort_clones.size() == 0)
978 979
      {
	cout << "  ! No clones with current parameters." << endl;
980
	cout << "  ! See the 'Limits to report a clone' options (-r, -%, -z, -A)." << endl;
981
      }
982
    else
Mikaël Salson's avatar
Mikaël Salson committed
983 984
      {

985 986
    cout << endl;

Mikaël Salson's avatar
Mikaël Salson committed
987
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
988
    //$$ Output clones
989

990
    if (max_clones > 0)
991
      cout << "Detailed analysis of at most " << max_clones<< " clones" ;
992
    else
993 994
      cout << "Detailed analysis of all clones" ;
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
995 996

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
997
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
998 999 1000 1001

    list <Sequence> representatives ;
    list <string> representatives_labels ;

1002
    // VirtualReadScore *scorer = new KmerAffectReadScore(*(germline->index));
1003
    int last_num_clone_on_stdout = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
1004 1005
    int num_clone = 0 ;

1006
    ofstream out_edges((out_dir+f_basename + EDGES_FILENAME).c_str());
Mikaël Salson's avatar
Mikaël Salson committed
1007
    int nb_edges = 0 ;
1008
    cout << "  ==> suggested edges in " << out_dir+ f_basename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
1009 1010
        << endl ;

1011
    string f_clones = out_dir + f_basename + CLONES_FILENAME ;
1012 1013
    cout << "  ==> " << f_clones << "   \t(main result file)" << endl ;
    ofstream out_clones(f_clones.c_str()) ;
Mikaël Salson's avatar
Mikaël Salson committed
1014

1015
    cout << "  ==> " << out_seqdir + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ; 
1016
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1017

1018 1019

    for (list <pair<junction,int> >::const_iterator it = sort_clones.begin();
Mikaël Salson's avatar
Mikaël Salson committed
1020
         it != sort_clones.end(); ++it) {
1021
      junction win = it->first;
Mikaël Salson's avatar
Mikaël Salson committed
1022 1023 1024 1025
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
1026

1027
      Germline *segmented_germline = windowsStorage->getGermline(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
1028
      
1029 1030 1031
      //$$ Computing labels

      // Clone label
1032
      ostringstream oss;
1033 1034
      oss << "clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone
	  << "--" << segmented_germline->code
1035 1036 1037 1038
	  << "--" << setfill('0') << setw(WIDTH_NB_READS) << clone_nb_reads 
	  << "--" << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%" ;
      string clone_id = oss.str();

Mikaël Salson's avatar
Mikaël Salson committed
1039

1040 1041 1042 1043 1044
      // Clone label -- Human readable information (is it really useful ?)
      ostringstream oss_human;
      oss_human << "#### Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone 
		<< " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" 
		<< " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  
1045
	;
1046
      string clone_id_human = oss_human.str();
1047 1048

      // Window label
1049
      string window_str = ">" + clone_id + "--window" + " " + windowsStorage->getLabel(it->first) + '\n' + it->first + '\n' ;
1050

1051
      //$$ If max_representatives is reached, we stop here but still outputs the window
1052

1053
      if ((max_representatives >= 0) && (num_clone >=