vidjil.cpp 38 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41 42 43 44 45 46 47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50 51 52 53 54 55 56 57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60 61 62 63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69 70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
Mikaël Salson's avatar
Mikaël Salson committed
71 72
#define RATIO_READS_CLONE 0.1

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74 75 76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78 79 80

#define OUT_DIR "./out/" 
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
81
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
82 83
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
84
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85 86 87
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
88
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
89 90 91

// "tests/data/leukemia.fa" 

92
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
93
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
94
#define DEFAULT_W_D    60
95
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
96 97

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
98
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
99 100

#define DEFAULT_DELTA_MIN_D  0
Mathieu Giraud's avatar
Mathieu Giraud committed
101
#define DEFAULT_DELTA_MAX_D  60
Mikaël Salson's avatar
Mikaël Salson committed
102

Mikaël Salson's avatar
Mikaël Salson committed
103
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
104 105 106
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_MIN_COVER_REPRESENTATIVE 5

Mikaël Salson's avatar
Mikaël Salson committed
107 108 109 110 111 112 113 114 115
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
116
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
117 118 119

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
120
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
121

Mathieu Giraud's avatar
Mathieu Giraud committed
122
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
123 124 125 126 127 128 129 130

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
131
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
132 133 134 135 136 137 138 139 140 141 142
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation" << endl
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
143
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
144
#ifndef NO_SPACED_SEEDS
145 146
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
147
#endif
148
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
149
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
150 151
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
152 153
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
154 155
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
156 157
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
158 159 160 161 162 163 164 165 166 167 168 169 170
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
171
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
172
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
173
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
       << endl

       << "Fine segmentation options" << endl
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
       << "  -x            no detailed analysis of each cluster" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
189
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
190 191 192 193 194
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
195 196 197
       << "  " << progname << "             -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 1 -R 1 -% 0  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c segment  -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
198 199 200 201 202 203 204
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
205 206
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
207 208
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
209 210
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
211
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
212 213 214 215 216 217 218 219 220 221 222 223
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
224 225 226
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
227 228 229 230 231 232 233 234 235 236 237
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
238
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
239

Mikaël Salson's avatar
Mikaël Salson committed
240
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
241
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
242 243 244 245
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
246 247
  size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE;
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
248
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
249

Mikaël Salson's avatar
Mikaël Salson committed
250 251 252 253 254 255 256
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine
  int delta_max_kmer = 50 ; // TODO 

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
257
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
258
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
259 260 261

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
262
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
263
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
264 265 266

  char c ;

Mathieu Giraud's avatar
Mathieu Giraud committed
267 268
  //$$ options: getopt

269
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
270 271 272 273 274 275 276 277 278 279 280 281

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
282
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
283 284 285 286 287
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
288
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
289
	break;
Mikaël Salson's avatar
Mikaël Salson committed
290 291 292
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
293 294 295 296 297 298 299 300
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
301 302
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
328 329 330 331
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

362 363
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
364
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
365
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
366 367 368 369 370 371 372 373 374 375
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
376 377 378
      case 'z':
	max_clones = atoi(optarg);
        break;
379 380 381 382 383 384 385 386 387 388

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
389 390 391 392 393
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
394
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
423 424 425 426

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
427 428
      }

Mikaël Salson's avatar
Mikaël Salson committed
429 430 431 432 433 434 435
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
452 453

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
454 455


456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480

  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
    cout << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
481 482 483 484 485 486 487 488 489
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
490 491 492 493 494 495 496 497
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
498 499 500 501 502 503
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
504 505 506
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
507
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
508

Mikaël Salson's avatar
Mikaël Salson committed
509 510
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
511 512

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
513
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
514 515 516 517 518 519 520
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
521
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
522
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
523
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
524
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
525
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
526 527 528 529 530 531 532 533 534 535 536 537

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
538
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
539 540 541 542 543 544


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
545
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
546
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
547 548
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
549 550 551
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
552
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
553 554 555 556

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
557 558 559 560
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
561 562 563 564 565
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
566
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
567 568 569 570 571 572 573 574 575
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
576
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
577 578

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
579 580 581 582 583 584
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
585 586 587


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
588 589
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
590 591 592 593 594 595 596 597 598

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
599
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
600

Mathieu Giraud's avatar
Mathieu Giraud committed
601 602
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
603

Mathieu Giraud's avatar
Mathieu Giraud committed
604 605 606 607
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
    cout << "  ==> " << f_segmented << endl ;
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
608
 
Mathieu Giraud's avatar
Mathieu Giraud committed
609 610 611 612 613 614 615 616 617 618 619
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
                                                delta_max_kmer, windows_labels);
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
620 621


Mathieu Giraud's avatar
Mathieu Giraud committed
622
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
623

Mikaël Salson's avatar
Mikaël Salson committed
624

Mathieu Giraud's avatar
Mathieu Giraud committed
625 626
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
627

Mathieu Giraud's avatar
Mathieu Giraud committed
628
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
629
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
630 631
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
632
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
633
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
634

Mathieu Giraud's avatar
Mathieu Giraud committed
635
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
636 637
	<< " in " << nb_segmented << " segments"
	<< " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
638 639
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
Mathieu Giraud's avatar
Mathieu Giraud committed
640
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
641

Mikaël Salson's avatar
Mikaël Salson committed
642
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
643
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
644 645
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
646

Mathieu Giraud's avatar
Mathieu Giraud committed
647 648 649 650 651
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
652 653
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
654
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
655
    
Mikaël Salson's avatar
Mikaël Salson committed
656 657

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
658
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
659
	
Mathieu Giraud's avatar
Mathieu Giraud committed
660 661
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
662 663

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
664
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
665 666
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
667
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
668
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
669

Mathieu Giraud's avatar
Mathieu Giraud committed
670
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
671
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
672

Mathieu Giraud's avatar
Mathieu Giraud committed
673 674
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
675 676 677 678 679


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
680 681
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
682

Mathieu Giraud's avatar
Mathieu Giraud committed
683
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
684
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
685 686
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
687 688

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
689
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
690

Mathieu Giraud's avatar
Mathieu Giraud committed
691
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
692
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
693 694 695

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
696
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
697 698 699 700 701 702 703

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
704
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
705 706 707 708 709 710 711
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
712 713
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
714 715 716 717
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
718
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
719
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
720 721
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
722
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
723
 
Mathieu Giraud's avatar
Mathieu Giraud committed
724
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
725
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
726
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
727 728 729

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
730
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
731 732 733
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
734 735 736
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
737
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
738
	  
Mikaël Salson's avatar
Mikaël Salson committed
739
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
740
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
741
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
742
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
760
    //$$ Output clones
761 762 763 764 765 766
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
767 768

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
769
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
770 771 772 773 774 775 776 777 778

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
779
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
780 781
        << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
782
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
783

Mathieu Giraud's avatar
Mathieu Giraud committed
784
    cout << "Output clones in " << out_dir + prefix_filename << endl ; 
Mikaël Salson's avatar
Mikaël Salson committed
785 786 787 788 789 790 791 792

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
793

794
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
795 796
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
797
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
798 799 800 801

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
802 803

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
804 805 806 807 808 809
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
810
      
Mathieu Giraud's avatar
Mathieu Giraud committed
811 812 813
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
814

Mathieu Giraud's avatar
Mathieu Giraud committed
815
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
816
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
817
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
818 819 820

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
821
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
822
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
823 824

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
825
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
826
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
827
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
828
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
829

Mathieu Giraud's avatar
Mathieu Giraud committed
830
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
831 832

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
833

Mathieu Giraud's avatar
Mathieu Giraud committed
834 835
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
836 837
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
838 839
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
840 841 842

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
843 844 845
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
846 847 848 849 850
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
851
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
852 853 854 855
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
856
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
857 858 859 860

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
861
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
862
      
Mathieu Giraud's avatar
Mathieu Giraud committed
863 864
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
865

Mikaël Salson's avatar
Mikaël Salson committed
866 867
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
868

Mikaël Salson's avatar
Mikaël Salson committed
869 870 871 872 873

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
874 875
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
876 877
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
878
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
879 880
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
881 882 883 884 885 886 887 888
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

	//$$ There is one representative, FineSegmenter
        if (representative != NULL_SEQUENCE) {
Mathieu Giraud's avatar
Mathieu Giraud committed
889 890
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
891
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
892
	
Mikaël Salson's avatar
Mikaël Salson committed
893 894
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
895
	
Mathieu Giraud's avatar
Mathieu Giraud committed
896 897 898 899
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
900 901 902 903 904 905
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
906

Mathieu Giraud's avatar
Mathieu Giraud committed
907 908
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
909

Mathieu Giraud's avatar
Mathieu Giraud committed
910 911
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
912

Mathieu Giraud's avatar
Mathieu Giraud committed
913 914 915 916
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
917 918
            

Mathieu Giraud's avatar
Mathieu Giraud committed
919
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
920
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
921 922 923 924
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
925

Mathieu Giraud's avatar
Mathieu Giraud committed
926
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
927
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
928 929 930
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
931
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
932 933 934 935
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
936
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
937
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
938
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
939 940 941
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
942
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
943 944 945
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
946
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
947 948
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
949

Mathieu Giraud's avatar
Mathieu Giraud committed
950
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
951
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
952 953 954 955 956 957 958 959 960
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
961
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
962 963 964
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
965

Mathieu Giraud's avatar
Mathieu Giraud committed
966 967
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
968

Mathieu Giraud's avatar
Mathieu Giraud committed
969 970
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
971
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
972

Mathieu Giraud's avatar
Mathieu Giraud committed
973 974 975 976 977
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
978
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
979 980 981 982 983
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
984

Mathieu Giraud's avatar
Mathieu Giraud committed
985 986 987
              out_clone << seg ;
              out_clone << endl ;
          }
Mikaël Salson's avatar
Mikaël Salson committed
988

Mathieu Giraud's avatar
Mathieu Giraud committed
989 990
        if (seg.isSegmented() 
            || it == --(sort_windows.end())) {
Mathieu Giraud's avatar
Mathieu Giraud committed
991
              // display window
Mikaël Salson's avatar
Mikaël Salson committed
992
              cout << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
993 994
		   << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-window"  << " " << windows_labels[it->first] << endl
		   << it->first << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
995

Mathieu Giraud's avatar
Mathieu Giraud committed
996 997
	      // display representative, possibly segmented
	      cout << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" << " " << seg.info << setfill(' ') << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
998

Mathieu Giraud's avatar
Mathieu Giraud committed
999
	      cout << representative.sequence << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
1000
              break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1001
        }
Mathieu Giraud's avatar
Mathieu Giraud committed
1002
        }
Mikaël Salson's avatar
Mikaël Salson committed
1003 1004
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
1005
      cout << endl ;