vidjil.cpp 39.1 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41 42 43 44 45 46 47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50 51 52 53 54 55 56 57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60 61 62 63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69 70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
71
#define RATIO_READS_CLONE 0.0
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74 75 76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78 79 80

#define OUT_DIR "./out/" 
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
81
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
82 83
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
84
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85 86 87
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
88
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
89 90 91

// "tests/data/leukemia.fa" 

92
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
93
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
94
#define DEFAULT_W_D    60
95
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
96 97

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
98
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
99 100

#define DEFAULT_DELTA_MIN_D  0
Mathieu Giraud's avatar
Mathieu Giraud committed
101
#define DEFAULT_DELTA_MAX_D  60
Mikaël Salson's avatar
Mikaël Salson committed
102

Mikaël Salson's avatar
Mikaël Salson committed
103
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
104 105 106
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_MIN_COVER_REPRESENTATIVE 5

Mikaël Salson's avatar
Mikaël Salson committed
107 108 109 110 111 112
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

113 114 115
// warn
#define WARN_RATIO_SEGMENTED 0.4

Mikaël Salson's avatar
Mikaël Salson committed
116 117 118
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
119
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
120 121 122

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
123
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
124

Mathieu Giraud's avatar
Mathieu Giraud committed
125
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
126 127 128 129 130 131 132 133

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
134
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
135 136 137 138 139 140 141 142 143 144 145
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation" << endl
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
146
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
147
#ifndef NO_SPACED_SEEDS
148 149
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
150
#endif
151
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
152
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
153 154
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
155 156
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
157 158
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
159 160
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
161 162 163 164 165 166 167 168 169 170 171 172 173
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
174
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
175
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
176
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
177 178 179 180 181 182 183 184 185
       << endl

       << "Fine segmentation options" << endl
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

186 187 188
       << "Debug" << endl
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
       << "                and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
189 190 191 192 193
       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
194
       << "  -x            do not compute representative sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
195 196 197 198 199
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
200 201 202
       << "  " << progname << "             -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 1 -R 1 -% 0  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c segment  -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
203 204 205 206 207 208 209
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
210 211
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
212 213
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
214 215
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
216
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
217 218 219 220 221 222 223 224 225 226 227 228
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
229 230 231
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
232 233 234 235 236 237 238 239 240 241 242
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
243
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
244

Mikaël Salson's avatar
Mikaël Salson committed
245
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
246
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
247 248 249 250
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
251 252
  size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE;
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
253
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
254

Mikaël Salson's avatar
Mikaël Salson committed
255 256 257 258 259 260 261
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine
  int delta_max_kmer = 50 ; // TODO 

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
262
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
263
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
264 265 266

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
267
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
268
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
269 270 271

  char c ;

Mathieu Giraud's avatar
Mathieu Giraud committed
272 273
  //$$ options: getopt

274
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
275 276 277 278 279 280 281 282 283 284 285 286

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
287
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
288 289 290 291 292
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
293
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
294
	break;
Mikaël Salson's avatar
Mikaël Salson committed
295 296 297
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
298 299 300 301 302 303 304 305
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
306 307
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
333 334 335 336
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

367 368
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
369
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
370
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
371 372 373 374 375 376 377 378 379 380
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
381 382 383
      case 'z':
	max_clones = atoi(optarg);
        break;
384 385 386 387 388 389 390 391 392 393

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
394 395 396 397 398
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
399
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
428 429 430 431

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
432 433
      }

Mikaël Salson's avatar
Mikaël Salson committed
434 435 436 437 438 439 440
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
457 458

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
459 460


461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485

  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
    cout << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
486 487 488 489 490 491 492 493 494
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
495 496 497 498 499 500 501 502
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
503 504 505 506 507 508
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
509 510 511
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
512
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
513

Mikaël Salson's avatar
Mikaël Salson committed
514 515
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
516 517

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
518
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
519 520 521 522 523 524 525
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
526
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
527
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
528
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
529
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
530
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
531 532 533 534 535 536 537 538 539 540 541 542

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
543
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
544 545 546 547 548 549


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
550
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
551
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
552 553
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
554 555 556
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
557
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
558 559 560 561

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
562 563 564 565
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
566 567 568 569 570
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
571
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
572 573 574 575 576 577 578 579 580
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
581
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
582 583

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
584 585 586 587 588 589
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
590 591 592


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
593 594
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
595 596 597 598 599 600 601 602 603

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
604
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
605

Mathieu Giraud's avatar
Mathieu Giraud committed
606 607
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
608

Mathieu Giraud's avatar
Mathieu Giraud committed
609 610 611 612
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
    cout << "  ==> " << f_segmented << endl ;
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
613
 
Mathieu Giraud's avatar
Mathieu Giraud committed
614 615 616 617 618 619 620 621 622 623 624
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
                                                delta_max_kmer, windows_labels);
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
625 626


Mathieu Giraud's avatar
Mathieu Giraud committed
627
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
628

Mikaël Salson's avatar
Mikaël Salson committed
629

Mathieu Giraud's avatar
Mathieu Giraud committed
630 631
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
632

Mathieu Giraud's avatar
Mathieu Giraud committed
633
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
634
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
635 636
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
637
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
638
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
639
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
640

Mathieu Giraud's avatar
Mathieu Giraud committed
641
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
642
	<< " in " << nb_segmented << " segments"
643
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
644 645
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
646 647 648 649 650 651 652
    // warn if there are too few segmented sequences
    if (ratio_segmented < WARN_RATIO_SEGMENTED)
      {
        cout << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
        cout << "  ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
653
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
654

Mikaël Salson's avatar
Mikaël Salson committed
655
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
656
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
657 658
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
659

Mathieu Giraud's avatar
Mathieu Giraud committed
660 661 662 663 664
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
665 666
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
667
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
668
    
Mikaël Salson's avatar
Mikaël Salson committed
669 670

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
671
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
672
	
Mathieu Giraud's avatar
Mathieu Giraud committed
673 674
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
675 676

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
677
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
678 679
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
680
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
681
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
682

Mathieu Giraud's avatar
Mathieu Giraud committed
683
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
684
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
685

Mathieu Giraud's avatar
Mathieu Giraud committed
686 687
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
688 689 690 691 692


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
693 694
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
695

Mathieu Giraud's avatar
Mathieu Giraud committed
696
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
697
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
698 699
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
700 701

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
702
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
703

Mathieu Giraud's avatar
Mathieu Giraud committed
704
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
705
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
706 707 708

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
709
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
710 711 712 713 714 715 716

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
717
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
718 719 720 721 722 723 724
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
725 726
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
727 728 729 730
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
731
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
732
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
733 734
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
735
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
736
 
Mathieu Giraud's avatar
Mathieu Giraud committed
737
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
738
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
739
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
740 741 742

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
743
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
744 745 746
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
747 748 749
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
750
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
751
	  
Mikaël Salson's avatar
Mikaël Salson committed
752
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
753
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
754
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
755
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
773
    //$$ Output clones
774 775 776 777 778 779
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
780 781

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
782
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
783 784 785 786 787 788

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;
789
    int clones_without_representative = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
790 791 792

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
793
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
794 795
        << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
796
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
797

Mathieu Giraud's avatar
Mathieu Giraud committed
798
    cout << "Output clones in " << out_dir + prefix_filename << endl ; 
Mikaël Salson's avatar
Mikaël Salson committed
799 800 801 802 803 804 805 806

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
807

808
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
809 810
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
811
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
812 813 814 815

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
816 817

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
818 819 820 821 822 823
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
824
      
Mathieu Giraud's avatar
Mathieu Giraud committed
825 826 827
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
828

Mathieu Giraud's avatar
Mathieu Giraud committed
829
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
830
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
831
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
832 833 834

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
835
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
836
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
837 838

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
839
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
840
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
841
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
842
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
843

Mathieu Giraud's avatar
Mathieu Giraud committed
844
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
845 846

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
847

Mathieu Giraud's avatar
Mathieu Giraud committed
848 849
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
850 851
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
852 853
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
854 855 856

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
857 858 859
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
860 861 862 863 864
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
865
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
866 867 868 869
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
870
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
871 872 873 874

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
875
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
876
      
Mathieu Giraud's avatar
Mathieu Giraud committed
877 878
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
879

Mikaël Salson's avatar
Mikaël Salson committed
880 881
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
882

Mikaël Salson's avatar
Mikaël Salson committed
883 884 885 886 887

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
888 889
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
890 891
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
892
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
893 894
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
895 896 897 898 899 900
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

901 902 903 904 905 906 907

        if (representative == NULL_SEQUENCE) {
	  clones_without_representative++ ;
	  if (verbose)
	    cout << "# (no representative sequence with current parameters)" ;

        } else {
Mathieu Giraud's avatar
Mathieu Giraud committed
908
	//$$ There is one representative, FineSegmenter
909 910


Mathieu Giraud's avatar
Mathieu Giraud committed
911 912
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
913
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
914
	
Mikaël Salson's avatar
Mikaël Salson committed
915 916
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
917
	
Mathieu Giraud's avatar
Mathieu Giraud committed
918 919 920 921
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
922 923 924 925 926 927
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
928

Mathieu Giraud's avatar
Mathieu Giraud committed
929 930
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
931

Mathieu Giraud's avatar
Mathieu Giraud committed
932 933
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
934

Mathieu Giraud's avatar
Mathieu Giraud committed
935 936 937 938
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
939 940
            

Mathieu Giraud's avatar
Mathieu Giraud committed
941
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
942
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
943 944 945 946
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
947

Mathieu Giraud's avatar
Mathieu Giraud committed
948
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
949
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
950 951 952
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
953
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
954 955 956 957
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
958
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
959
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
960
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
961 962 963
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
964
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
965 966 967
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
968
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
969 970
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
971

Mathieu Giraud's avatar
Mathieu Giraud committed
972
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
973
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
974 975 976 977 978 979 980 981 982
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
983
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
984 985 986
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
987

Mathieu Giraud's avatar
Mathieu Giraud committed
988 989
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
990

Mathieu Giraud's avatar
Mathieu Giraud committed
991 992
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
993
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
994

Mathieu Giraud's avatar
Mathieu Giraud committed
995 996 997 998 999
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1000
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
1001 1002 1003 1004 1005
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
1006

Mathieu Giraud's avatar
Mathieu Giraud committed
1007 1008 1009
              out_clone << seg ;
              out_clone << endl ;
          }