vidjil.cpp 37.1 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41 42 43 44 45 46 47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50 51 52 53 54 55 56 57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60 61 62 63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69 70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
Mikaël Salson's avatar
Mikaël Salson committed
71 72
#define RATIO_READS_CLONE 0.1

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74 75 76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78 79 80

#define OUT_DIR "./out/" 
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
81
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
82 83
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
84
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85 86 87
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
88
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
89 90 91

// "tests/data/leukemia.fa" 

Mikaël Salson's avatar
Mikaël Salson committed
92
#define DEFAULT_K      10
Mikaël Salson's avatar
Mikaël Salson committed
93
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
94 95
#define DEFAULT_W_D    60
#define DEFAULT_SEED   "#####-#####"
Mikaël Salson's avatar
Mikaël Salson committed
96 97

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
98
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
99 100

#define DEFAULT_DELTA_MIN_D  0
Mathieu Giraud's avatar
Mathieu Giraud committed
101
#define DEFAULT_DELTA_MAX_D  60
Mikaël Salson's avatar
Mikaël Salson committed
102

Mikaël Salson's avatar
Mikaël Salson committed
103
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
104 105 106
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_MIN_COVER_REPRESENTATIVE 5

Mikaël Salson's avatar
Mikaël Salson committed
107 108 109 110 111 112 113 114 115
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
116
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
117 118 119

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
120
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
121

Mathieu Giraud's avatar
Mathieu Giraud committed
122
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
123 124 125 126 127 128 129 130

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
131
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
132 133 134 135 136 137 138 139 140 141 142
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation" << endl
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
143
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
144 145 146 147
#ifndef NO_SPACED_SEEDS
       << "  -s <string>   spaced seed used for the V/J affectation (default: " << DEFAULT_SEED << ")" << endl
#endif
       << "  -k <int>      k-mer size used for the V/J affectation (default: " << DEFAULT_K << ")" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
148
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
149 150
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
151 152
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
153 154
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
155 156
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
157 158 159 160 161 162 163 164 165 166 167 168 169
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
170
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
171
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
       << endl

       << "Fine segmentation options" << endl
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
       << "  -x            no detailed analysis of each cluster" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
187
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
188 189 190 191 192
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
193 194 195
       << "  " << progname << "             -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 1 -R 1 -% 0  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c segment  -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
196 197 198 199 200 201 202
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
203 204
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
205 206
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
207 208
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
209
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
210 211 212 213 214 215 216 217 218 219 220 221
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
222 223 224
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
225 226 227 228 229 230 231 232 233 234 235
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
236
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
237

Mikaël Salson's avatar
Mikaël Salson committed
238
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
239
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
240 241 242 243
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
244 245
  size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE;
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
246
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
247

Mikaël Salson's avatar
Mikaël Salson committed
248 249 250 251 252 253 254
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine
  int delta_max_kmer = 50 ; // TODO 

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
255
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
256
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
257 258 259

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
260
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
261
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
262 263 264

  char c ;

Mathieu Giraud's avatar
Mathieu Giraud committed
265 266 267
  //$$ options: getopt

  while ((c = getopt(argc, argv, "haG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
268 269 270 271 272 273 274 275 276 277 278 279

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
280
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
281 282 283 284 285
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
286
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
287
	break;
Mikaël Salson's avatar
Mikaël Salson committed
288 289 290
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
291 292 293 294 295 296 297 298
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
299 300
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
326 327 328 329
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
361
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
362 363 364 365 366 367 368 369 370 371
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
372 373 374
      case 'z':
	max_clones = atoi(optarg);
        break;
Mikaël Salson's avatar
Mikaël Salson committed
375 376 377 378 379
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
380
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
409 410 411 412

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
413 414
      }

Mikaël Salson's avatar
Mikaël Salson committed
415 416 417 418 419 420 421
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
438 439

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
440 441 442 443 444 445 446 447 448 449 450


#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
451 452 453 454 455 456 457 458
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
459 460 461 462 463 464
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
465 466 467
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
468
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
469

Mikaël Salson's avatar
Mikaël Salson committed
470 471
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
472 473

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
474
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
475 476 477 478 479 480 481
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
482
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
483
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
484
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
485
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
486
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
487 488 489 490 491 492 493 494 495 496 497 498

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
499
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
500 501 502 503 504 505


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
506
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
507
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
508 509
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
510 511 512
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
513
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
514 515 516 517

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
518 519 520 521
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
522 523 524 525 526
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
527
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
528 529 530 531 532 533 534 535 536
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
537
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
538 539

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
540 541 542 543 544 545
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
546 547 548


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
549 550
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
551 552 553 554 555 556 557 558 559

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
560
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
561

Mathieu Giraud's avatar
Mathieu Giraud committed
562 563
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
564

Mathieu Giraud's avatar
Mathieu Giraud committed
565 566 567 568
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
    cout << "  ==> " << f_segmented << endl ;
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
569
 
Mathieu Giraud's avatar
Mathieu Giraud committed
570 571 572 573 574 575 576 577 578 579 580
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
                                                delta_max_kmer, windows_labels);
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
581 582


Mathieu Giraud's avatar
Mathieu Giraud committed
583
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
584

Mikaël Salson's avatar
Mikaël Salson committed
585

Mathieu Giraud's avatar
Mathieu Giraud committed
586 587
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
588

Mathieu Giraud's avatar
Mathieu Giraud committed
589
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
590
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
591 592
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
593
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
594
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
595

Mathieu Giraud's avatar
Mathieu Giraud committed
596
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
597 598
	<< " in " << nb_segmented << " segments"
	<< " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
599 600
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
Mathieu Giraud's avatar
Mathieu Giraud committed
601
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
602

Mikaël Salson's avatar
Mikaël Salson committed
603
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
604
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
605 606
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
607

Mathieu Giraud's avatar
Mathieu Giraud committed
608 609 610 611 612
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
613 614
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
615
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
616
    
Mikaël Salson's avatar
Mikaël Salson committed
617 618

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
619
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
620
	
Mathieu Giraud's avatar
Mathieu Giraud committed
621 622
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
623 624

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
625
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
626 627
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
628
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
629
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
630

Mathieu Giraud's avatar
Mathieu Giraud committed
631
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
632
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
633

Mathieu Giraud's avatar
Mathieu Giraud committed
634 635
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
636 637 638 639 640


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
641 642
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
643

Mathieu Giraud's avatar
Mathieu Giraud committed
644
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
645
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
646 647
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
648 649

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
650
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
651

Mathieu Giraud's avatar
Mathieu Giraud committed
652
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
653
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
654 655 656

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
657
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
658 659 660 661 662 663 664

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
665
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
666 667 668 669 670 671 672
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
673 674
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
675 676 677 678
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
679
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
680
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
681 682
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
683
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
684
 
Mathieu Giraud's avatar
Mathieu Giraud committed
685
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
686
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
687
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
688 689 690

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
691
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
692 693 694
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
695 696 697
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
698
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
699
	  
Mikaël Salson's avatar
Mikaël Salson committed
700
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
701
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
702
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
703
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
721
    //$$ Output clones
722 723 724 725 726 727
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
728 729

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
730
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
731 732 733 734 735 736 737 738 739

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
740
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
741 742
        << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
743
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
744

Mathieu Giraud's avatar
Mathieu Giraud committed
745
    cout << "Output clones in " << out_dir + prefix_filename << endl ; 
Mikaël Salson's avatar
Mikaël Salson committed
746 747 748 749 750 751 752 753

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
754

755
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
756 757
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
758
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
759 760 761 762

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
763 764

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
765 766 767 768 769 770
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
771
      
Mathieu Giraud's avatar
Mathieu Giraud committed
772 773 774
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
775

Mathieu Giraud's avatar
Mathieu Giraud committed
776
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
777
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
778
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
779 780 781

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
782
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
783
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
784 785

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
786
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
787
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
788
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
789
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
790

Mathieu Giraud's avatar
Mathieu Giraud committed
791
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
792 793

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
794

Mathieu Giraud's avatar
Mathieu Giraud committed
795 796
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
797 798
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
799 800
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
801 802 803

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
804 805 806
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
807 808 809 810 811
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
812
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
813 814 815 816
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
817
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
818 819 820 821

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
822
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
823
      
Mathieu Giraud's avatar
Mathieu Giraud committed
824 825
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
826

Mikaël Salson's avatar
Mikaël Salson committed
827 828
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
829

Mikaël Salson's avatar
Mikaël Salson committed
830 831 832 833 834

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
835 836
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
837 838
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
839
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
840 841
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
842 843 844 845 846 847 848 849
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

	//$$ There is one representative, FineSegmenter
        if (representative != NULL_SEQUENCE) {
Mathieu Giraud's avatar
Mathieu Giraud committed
850 851
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
852
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
853
	
Mikaël Salson's avatar
Mikaël Salson committed
854 855
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
856
	
Mathieu Giraud's avatar
Mathieu Giraud committed
857 858 859 860
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
861 862 863 864 865 866
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
867

Mathieu Giraud's avatar
Mathieu Giraud committed
868 869
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
870

Mathieu Giraud's avatar
Mathieu Giraud committed
871 872
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
873

Mathieu Giraud's avatar
Mathieu Giraud committed
874 875 876 877
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
878 879
            

Mathieu Giraud's avatar
Mathieu Giraud committed
880
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
881
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
882 883 884 885
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
886

Mathieu Giraud's avatar
Mathieu Giraud committed
887
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
888
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
889 890 891
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
892
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
893 894 895 896
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
897
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
898
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
899
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
900 901 902
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
903
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
904 905 906
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
907
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
908 909
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
910

Mathieu Giraud's avatar
Mathieu Giraud committed
911
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
912
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
913 914 915 916 917 918 919 920 921
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
922
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
923 924 925
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
926

Mathieu Giraud's avatar
Mathieu Giraud committed
927 928
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
929

Mathieu Giraud's avatar
Mathieu Giraud committed
930 931
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
932
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
933

Mathieu Giraud's avatar
Mathieu Giraud committed
934 935 936 937 938
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
939
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
940 941 942 943 944
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
945

Mathieu Giraud's avatar
Mathieu Giraud committed
946 947 948
              out_clone << seg ;
              out_clone << endl ;
          }
Mikaël Salson's avatar
Mikaël Salson committed
949

Mathieu Giraud's avatar
Mathieu Giraud committed
950 951
        if (seg.isSegmented() 
            || it == --(sort_windows.end())) {
Mathieu Giraud's avatar
Mathieu Giraud committed
952
              // display window
Mikaël Salson's avatar
Mikaël Salson committed
953
              cout << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
954 955
		   << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-window"  << " " << windows_labels[it->first] << endl
		   << it->first << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
956

Mathieu Giraud's avatar
Mathieu Giraud committed
957 958
	      // display representative, possibly segmented
	      cout << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" << " " << seg.info << setfill(' ') << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
959

Mathieu Giraud's avatar
Mathieu Giraud committed
960
	      cout << representative.sequence << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
961
              break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
962
        }
Mathieu Giraud's avatar
Mathieu Giraud committed
963
        }
Mikaël Salson's avatar
Mikaël Salson committed
964 965
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
966
      cout << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
967
      out_windows.close();
Mikaël Salson's avatar
Mikaël Salson committed
968

Mikaël Salson's avatar
Mikaël Salson committed
969 970 971 972 973 974
      if (!very_detailed_cluster_analysis)
	{
	  continue ;
	}


Mathieu Giraud's avatar
Mathieu Giraud committed
975
      //$$ Very detailed cluster analysis (with sequences)
Mikaël Salson's avatar
Mikaël Salson committed
976 977 978 979 980

      list<string> msa;
      bool good_msa = false ;

      // TODO: do something if no sequences have been segmented !
Mathieu Giraud's avatar
Mathieu Giraud committed
981
      if (!more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
982
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
983
	  cout << "!! No segmented sequence, deleting clone" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
984 985 986
	  // continue ;
	} else 
        {
Mathieu Giraud's avatar
Mathieu Giraud committed
987
          msa = multiple_seq_align(windows_file_name);
Mikaël Salson's avatar
Mikaël Salson committed
988
        
Mathieu Giraud's avatar
Mathieu Giraud committed
989
          // Alignment of windows
Mikaël Salson's avatar
Mikaël Salson committed
990 991 992
          
          if (!msa.empty())
            {
Mathieu Giraud's avatar
Mathieu Giraud committed
993
              if (msa.size() == sort_windows.size() + more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
994
                {