vidjil.cpp 41.1 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41 42 43 44 45 46 47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50 51 52 53 54 55 56 57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60 61 62 63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69 70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
71
#define RATIO_READS_CLONE 0.0
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74 75 76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78 79

#define OUT_DIR "./out/" 
80
#define CLONES_FILENAME "clones.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
81
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
82
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
83 84
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
86 87 88
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
89
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
90 91 92

// "tests/data/leukemia.fa" 

93
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
94
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
95
#define DEFAULT_W_D    60
96
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
97 98

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
99
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
100 101

#define DEFAULT_DELTA_MIN_D  0
Mathieu Giraud's avatar
Mathieu Giraud committed
102
#define DEFAULT_DELTA_MAX_D  60
Mikaël Salson's avatar
Mikaël Salson committed
103

Mikaël Salson's avatar
Mikaël Salson committed
104
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
105
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
106
#define MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE 1.0
Mathieu Giraud's avatar
Mathieu Giraud committed
107

Mikaël Salson's avatar
Mikaël Salson committed
108 109 110 111 112 113
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

114
// warn
115
#define WARN_PERCENT_SEGMENTED 40
116

Mikaël Salson's avatar
Mikaël Salson committed
117 118 119
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
120
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
121 122 123

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
124
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
125

Mathieu Giraud's avatar
Mathieu Giraud committed
126
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
127 128 129 130 131 132 133 134

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
135
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
136
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
137
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation (not recommended)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
138 139 140 141 142 143 144 145 146
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
147
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
148
#ifndef NO_SPACED_SEEDS
149
       << "  (use either -s or -k option, but not both)" << endl
150 151
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
152
#endif
153
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
154 155 156
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
157
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
158 159
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
160 161
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
162 163
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
164 165
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
166 167 168 169 170 171 172 173 174 175 176 177 178
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
179
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
180
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
181
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
182 183
       << endl

184
       << "Fine segmentation options (second pass, see warning in doc/README)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
185 186 187 188 189 190
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

191 192 193
       << "Debug" << endl
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
       << "                and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
194 195 196 197 198
       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
199
       << "  -x            do not compute representative sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
200 201 202 203 204
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
205 206
       << "  " << progname << "             -G germline/IGH             -d data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 5 -R 5  -d data/Stanford_S22.fasta" << endl
207
       << "  " << progname << " -c segment  -G germline/IGH             -d data/Stanford_S22.fasta   # (only for testing)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
208 209 210 211 212 213 214
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
215 216
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
217 218
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
219 220
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
221
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
222 223 224 225 226 227 228 229 230 231 232 233
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
234 235 236
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
237 238 239 240 241 242 243 244 245 246 247
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
248
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
249

Mikaël Salson's avatar
Mikaël Salson committed
250
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
251
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
252 253 254 255
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
256
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
257
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
258

Mikaël Salson's avatar
Mikaël Salson committed
259 260 261 262 263 264 265
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine
  int delta_max_kmer = 50 ; // TODO 

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
266
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
267
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
268 269 270

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
271
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
272
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
273 274 275

  char c ;

276 277
  int options_s_k = 0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
278 279
  //$$ options: getopt

280
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
281 282 283 284 285 286 287 288 289 290 291 292

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
293
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
294 295 296 297 298
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
299
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
300
	break;
Mikaël Salson's avatar
Mikaël Salson committed
301 302 303
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
304 305 306 307 308 309 310 311
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
312 313
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
339 340 341 342
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
343 344 345 346 347 348 349 350
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
351
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

374 375
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
376
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
377
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
378 379 380 381 382 383 384 385 386 387
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
388 389 390
      case 'z':
	max_clones = atoi(optarg);
        break;
391 392 393 394 395 396 397 398 399 400

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
401 402 403 404
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
405
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
406
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
407
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
436 437 438 439

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
440 441
      }

Mikaël Salson's avatar
Mikaël Salson committed
442 443 444 445 446
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
447 448 449 450 451 452
  if (options_s_k > 1)
    {
      cout << "use at most one -s or -k option." << endl ;
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
453 454
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
471 472

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
473

474
  size_t min_cover_representative = (size_t) (MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE * min_reads_clone) ;
475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499

  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
    cout << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
500 501 502 503 504 505 506 507 508
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
509 510 511 512 513 514 515 516
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
517 518 519 520 521 522
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
523 524 525
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
526
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
527

Mikaël Salson's avatar
Mikaël Salson committed
528 529
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
530 531

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
532
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
533 534 535 536 537 538 539
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
540
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
541
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
542
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
543
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
544
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
545 546 547 548 549 550 551 552 553 554 555 556

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
557
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
558 559 560 561 562 563


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
564
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
565
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
566 567
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
568 569 570
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
571
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
572 573 574 575

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
576 577 578 579
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
580 581 582 583 584
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
585
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
586 587 588 589 590 591 592 593 594
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
595
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
596 597

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
598 599 600 601 602 603
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
604 605 606


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
607 608
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
609 610 611 612 613 614 615 616 617

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
618
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
619

Mathieu Giraud's avatar
Mathieu Giraud committed
620 621
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
622

Mathieu Giraud's avatar
Mathieu Giraud committed
623
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
624
    cout << "  ==> " << f_segmented << "\t(result of window detection)" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
625 626
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
627
 
Mathieu Giraud's avatar
Mathieu Giraud committed
628 629 630 631 632 633 634 635 636 637 638
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
                                                delta_max_kmer, windows_labels);
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
639 640


Mathieu Giraud's avatar
Mathieu Giraud committed
641
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
642

Mikaël Salson's avatar
Mikaël Salson committed
643

Mathieu Giraud's avatar
Mathieu Giraud committed
644 645
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
646

Mathieu Giraud's avatar
Mathieu Giraud committed
647
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
648
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
649 650
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
651
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
652
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
653
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
654

Mathieu Giraud's avatar
Mathieu Giraud committed
655
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
656
	<< " in " << nb_segmented << " segments"
657
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
658 659
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
660
    // warn if there are too few segmented sequences
661
    if (ratio_segmented < WARN_PERCENT_SEGMENTED)
662 663 664 665 666
      {
        cout << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
        cout << "  ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
667
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
668

Mikaël Salson's avatar
Mikaël Salson committed
669
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
670
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
671 672
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
673

Mathieu Giraud's avatar
Mathieu Giraud committed
674 675 676 677 678
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
679 680
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
681
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
682
    
Mikaël Salson's avatar
Mikaël Salson committed
683 684

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
685
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
686
	
Mathieu Giraud's avatar
Mathieu Giraud committed
687 688
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
689 690

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
691
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
692 693
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
694
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
695
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
696

Mathieu Giraud's avatar
Mathieu Giraud committed
697
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
698
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
699

Mathieu Giraud's avatar
Mathieu Giraud committed
700 701
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
702 703 704 705 706


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
707 708
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
709

Mathieu Giraud's avatar
Mathieu Giraud committed
710
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
711
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
712 713
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
714

715 716 717 718
    if (windowsStorage->size() == 0)
      {
	cout << "  ! No windows with current parameters." << endl;
      }
Mikaël Salson's avatar
Mikaël Salson committed
719
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
720
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
721

Mathieu Giraud's avatar
Mathieu Giraud committed
722
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
723
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
724 725 726

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
727
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
728 729 730 731 732 733 734

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
735
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
736 737 738 739 740 741 742
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
743 744
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
745 746 747 748
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
749
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
750
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
751 752
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
753
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
754
 
755 756 757 758 759 760 761 762
    if (clones_windows.size() == 0)
      {
	cout << "  ! No clones with current parameters." << endl;
	cout << "  ! See the 'Limits to report a clone' options (-R, -%, -z, -A)." << endl;
      }
    else // clones_windows.size() > 0
      { 

Mathieu Giraud's avatar
Mathieu Giraud committed
763
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
764
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
765
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
766 767 768

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
769
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
770 771 772
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
773 774 775
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
776
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
777
	  
Mikaël Salson's avatar
Mikaël Salson committed
778
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
779
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
780
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
781
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

798 799
    cout << endl;

Mikaël Salson's avatar
Mikaël Salson committed
800
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
801
    //$$ Output clones
802 803 804 805 806 807
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
808 809

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
810
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
811 812 813 814 815 816

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;
817
    int clones_without_representative = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
818 819 820

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
821
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
822 823
        << endl ;

824 825 826
    string f_clones = out_dir + prefix_filename + CLONES_FILENAME ;
    cout << "  ==> " << f_clones << "   \t(main result file)" << endl ;
    ofstream out_clones(f_clones.c_str()) ;
Mikaël Salson's avatar
Mikaël Salson committed
827

828 829
    cout << "  ==> " << out_seqdir + prefix_filename + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ; 
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
830 831 832 833 834 835 836 837

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
838

839
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
840 841
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
842
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
843 844 845 846

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
847 848

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
849 850 851 852 853 854
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
855
      
Mathieu Giraud's avatar
Mathieu Giraud committed
856 857 858
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
859

Mathieu Giraud's avatar
Mathieu Giraud committed
860
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
861
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
862
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
863 864 865

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
866
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
867
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
868 869

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
870
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
871
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
872
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
873
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
874

Mathieu Giraud's avatar
Mathieu Giraud committed
875
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
876 877

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
878

Mathieu Giraud's avatar
Mathieu Giraud committed
879 880
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
881 882
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
883 884
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
885 886 887

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
888 889 890
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
891 892 893 894 895
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
896
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
897 898 899 900
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
901
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
902 903 904 905

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
906
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
907
      
Mathieu Giraud's avatar
Mathieu Giraud committed
908 909
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
910

Mikaël Salson's avatar
Mikaël Salson committed
911 912
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
913

Mikaël Salson's avatar
Mikaël Salson committed
914 915 916 917 918

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
919 920
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
921 922
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
923
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
924 925
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
926 927 928 929 930 931
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

932 933 934 935 936 937 938

        if (representative == NULL_SEQUENCE) {
	  clones_without_representative++ ;
	  if (verbose)
	    cout << "# (no representative sequence with current parameters)" ;

        } else {
Mathieu Giraud's avatar
Mathieu Giraud committed
939
	//$$ There is one representative, FineSegmenter
940 941


Mathieu Giraud's avatar
Mathieu Giraud committed
942 943
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
944
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
945
	
Mikaël Salson's avatar
Mikaël Salson committed
946 947
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
948
	
Mathieu Giraud's avatar
Mathieu Giraud committed
949 950 951 952
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
953 954 955 956 957 958
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
959

Mathieu Giraud's avatar
Mathieu Giraud committed
960 961
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
962

Mathieu Giraud's avatar
Mathieu Giraud committed
963 964
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
965

Mathieu Giraud's avatar
Mathieu Giraud committed
966 967 968 969
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
970 971
            

Mathieu Giraud's avatar
Mathieu Giraud committed
972
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
973
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
974 975 976 977
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
978

Mathieu Giraud's avatar
Mathieu Giraud committed
979
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
980
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
981 982 983
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
984
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
985 986 987 988
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
989
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
990
              string start_J = "";