vidjil.cpp 37.4 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3
4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
40
41
42
43
44
45
46
47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50
51
52
53
54
55
56
57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60
61
62
63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69
70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
Mikaël Salson's avatar
Mikaël Salson committed
71
72
#define RATIO_READS_CLONE 0.1

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74
75
76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78
79
80

#define OUT_DIR "./out/" 
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
81
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
82
83
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
84
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85
86
87
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
88
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
89
90
91

// "tests/data/leukemia.fa" 

Mikaël Salson's avatar
Mikaël Salson committed
92
#define DEFAULT_K      10
Mikaël Salson's avatar
Mikaël Salson committed
93
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
94
95
#define DEFAULT_W_D    60
#define DEFAULT_SEED   "#####-#####"
Mikaël Salson's avatar
Mikaël Salson committed
96
97

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
98
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
99
100

#define DEFAULT_DELTA_MIN_D  0
Mathieu Giraud's avatar
Mathieu Giraud committed
101
#define DEFAULT_DELTA_MAX_D  60
Mikaël Salson's avatar
Mikaël Salson committed
102

Mikaël Salson's avatar
Mikaël Salson committed
103
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
104
105
106
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_MIN_COVER_REPRESENTATIVE 5

Mikaël Salson's avatar
Mikaël Salson committed
107
108
109
110
111
112
113
114
115
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
116
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
117
118
119

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
120
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
121

Mathieu Giraud's avatar
Mathieu Giraud committed
122
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
123
124
125
126
127
128
129
130

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
131
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
132
133
134
135
136
137
138
139
140
141
142
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation" << endl
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
143
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
144
145
146
147
#ifndef NO_SPACED_SEEDS
       << "  -s <string>   spaced seed used for the V/J affectation (default: " << DEFAULT_SEED << ")" << endl
#endif
       << "  -k <int>      k-mer size used for the V/J affectation (default: " << DEFAULT_K << ")" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
148
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
149
150
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
151
152
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
153
154
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
155
156
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
157
158
159
160
161
162
163
164
165
166
167
168
169
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
170
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
171
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
172
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
       << endl

       << "Fine segmentation options" << endl
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
       << "  -x            no detailed analysis of each cluster" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
188
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
189
190
191
192
193
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
194
195
196
       << "  " << progname << "             -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 1 -R 1 -% 0  -d  data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c segment  -G germline/IGH                  -d  data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
197
198
199
200
201
202
203
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
204
205
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
206
207
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
208
209
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
210
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
211
212
213
214
215
216
217
218
219
220
221
222
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
223
224
225
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
226
227
228
229
230
231
232
233
234
235
236
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
237
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
238

Mikaël Salson's avatar
Mikaël Salson committed
239
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
240
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
241
242
243
244
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
245
246
  size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE;
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
247
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
248

Mikaël Salson's avatar
Mikaël Salson committed
249
250
251
252
253
254
255
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine
  int delta_max_kmer = 50 ; // TODO 

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
256
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
257
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
258
259
260

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
261
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
262
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
263
264
265

  char c ;

Mathieu Giraud's avatar
Mathieu Giraud committed
266
267
  //$$ options: getopt

268
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
269
270
271
272
273
274
275
276
277
278
279
280

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
281
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
282
283
284
285
286
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
287
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
288
	break;
Mikaël Salson's avatar
Mikaël Salson committed
289
290
291
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
292
293
294
295
296
297
298
299
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
300
301
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
327
328
329
330
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

361
362
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
363
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
364
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
365
366
367
368
369
370
371
372
373
374
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
375
376
377
      case 'z':
	max_clones = atoi(optarg);
        break;
378
379
380
381
382
383
384
385
386
387

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
388
389
390
391
392
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
393
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
422
423
424
425

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
426
427
      }

Mikaël Salson's avatar
Mikaël Salson committed
428
429
430
431
432
433
434
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
451
452

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
453
454
455
456
457
458
459
460
461
462
463


#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
464
465
466
467
468
469
470
471
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
472
473
474
475
476
477
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
478
479
480
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
481
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
482

Mikaël Salson's avatar
Mikaël Salson committed
483
484
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
485
486

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
487
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
488
489
490
491
492
493
494
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
495
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
496
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
497
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
498
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
499
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
500
501
502
503
504
505
506
507
508
509
510
511

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
512
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
513
514
515
516
517
518


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
519
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
520
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
521
522
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
523
524
525
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
526
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
527
528
529
530

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
531
532
533
534
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
535
536
537
538
539
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
540
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
541
542
543
544
545
546
547
548
549
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
550
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
551
552

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
553
554
555
556
557
558
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
559
560
561


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
562
563
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
564
565
566
567
568
569
570
571
572

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
573
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
574

Mathieu Giraud's avatar
Mathieu Giraud committed
575
576
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
577

Mathieu Giraud's avatar
Mathieu Giraud committed
578
579
580
581
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
    cout << "  ==> " << f_segmented << endl ;
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
582
 
Mathieu Giraud's avatar
Mathieu Giraud committed
583
584
585
586
587
588
589
590
591
592
593
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
                                                delta_max_kmer, windows_labels);
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
594
595


Mathieu Giraud's avatar
Mathieu Giraud committed
596
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
597

Mikaël Salson's avatar
Mikaël Salson committed
598

Mathieu Giraud's avatar
Mathieu Giraud committed
599
600
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
601

Mathieu Giraud's avatar
Mathieu Giraud committed
602
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
603
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
604
605
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
606
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
607
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
608

Mathieu Giraud's avatar
Mathieu Giraud committed
609
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
610
611
	<< " in " << nb_segmented << " segments"
	<< " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
612
613
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
Mathieu Giraud's avatar
Mathieu Giraud committed
614
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
615

Mikaël Salson's avatar
Mikaël Salson committed
616
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
617
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
618
619
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
620

Mathieu Giraud's avatar
Mathieu Giraud committed
621
622
623
624
625
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
626
627
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
628
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
629
    
Mikaël Salson's avatar
Mikaël Salson committed
630
631

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
632
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
633
	
Mathieu Giraud's avatar
Mathieu Giraud committed
634
635
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
636
637

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
638
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
639
640
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
641
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
642
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
643

Mathieu Giraud's avatar
Mathieu Giraud committed
644
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
645
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
646

Mathieu Giraud's avatar
Mathieu Giraud committed
647
648
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
649
650
651
652
653


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
654
655
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
656

Mathieu Giraud's avatar
Mathieu Giraud committed
657
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
658
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
659
660
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
661
662

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
663
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
664

Mathieu Giraud's avatar
Mathieu Giraud committed
665
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
666
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
667
668
669

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
670
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
671
672
673
674
675
676
677

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
678
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
679
680
681
682
683
684
685
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
686
687
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
688
689
690
691
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
692
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
693
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
694
695
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
696
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
697
 
Mathieu Giraud's avatar
Mathieu Giraud committed
698
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
699
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
700
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
701
702
703

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
704
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
705
706
707
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
708
709
710
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
711
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
712
	  
Mikaël Salson's avatar
Mikaël Salson committed
713
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
714
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
715
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
716
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
734
    //$$ Output clones
735
736
737
738
739
740
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
741
742

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
743
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
744
745
746
747
748
749
750
751
752

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
753
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
754
755
        << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
756
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
757

Mathieu Giraud's avatar
Mathieu Giraud committed
758
    cout << "Output clones in " << out_dir + prefix_filename << endl ; 
Mikaël Salson's avatar
Mikaël Salson committed
759
760
761
762
763
764
765
766

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
767

768
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
769
770
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
771
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
772
773
774
775

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
776
777

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
778
779
780
781
782
783
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
784
      
Mathieu Giraud's avatar
Mathieu Giraud committed
785
786
787
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
788

Mathieu Giraud's avatar
Mathieu Giraud committed
789
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
790
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
791
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
792
793
794

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
795
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
796
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
797
798

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
799
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
800
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
801
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
802
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
803

Mathieu Giraud's avatar
Mathieu Giraud committed
804
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
805
806

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
807

Mathieu Giraud's avatar
Mathieu Giraud committed
808
809
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
810
811
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
812
813
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
814
815
816

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
817
818
819
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
820
821
822
823
824
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
825
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
826
827
828
829
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
830
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
831
832
833
834

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
835
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
836
      
Mathieu Giraud's avatar
Mathieu Giraud committed
837
838
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
839

Mikaël Salson's avatar
Mikaël Salson committed
840
841
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
842

Mikaël Salson's avatar
Mikaël Salson committed
843
844
845
846
847

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
848
849
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
850
851
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
852
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
853
854
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
855
856
857
858
859
860
861
862
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

	//$$ There is one representative, FineSegmenter
        if (representative != NULL_SEQUENCE) {
Mathieu Giraud's avatar
Mathieu Giraud committed
863
864
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
865
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
866
	
Mikaël Salson's avatar
Mikaël Salson committed
867
868
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
869
	
Mathieu Giraud's avatar
Mathieu Giraud committed
870
871
872
873
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
874
875
876
877
878
879
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
880

Mathieu Giraud's avatar
Mathieu Giraud committed
881
882
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
883

Mathieu Giraud's avatar
Mathieu Giraud committed
884
885
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
886

Mathieu Giraud's avatar
Mathieu Giraud committed
887
888
889
890
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
891
892
            

Mathieu Giraud's avatar
Mathieu Giraud committed
893
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
894
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
895
896
897
898
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
899

Mathieu Giraud's avatar
Mathieu Giraud committed
900
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
901
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
902
903
904
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
905
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
906
907
908
909
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
910
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
911
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
912
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
913
914
915
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
916
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
917
918
919
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
920
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
921
922
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
923

Mathieu Giraud's avatar
Mathieu Giraud committed
924
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
925
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
926
927
928
929
930
931
932
933
934
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
935
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
936
937
938
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
939

Mathieu Giraud's avatar
Mathieu Giraud committed
940
941
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
942

Mathieu Giraud's avatar
Mathieu Giraud committed
943
944
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
945
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
946

Mathieu Giraud's avatar
Mathieu Giraud committed
947
948
949
950
951
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
952
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
953
954
955
956
957
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
958

Mathieu Giraud's avatar
Mathieu Giraud committed
959
960
961
              out_clone << seg ;
              out_clone << endl ;
          }
Mikaël Salson's avatar
Mikaël Salson committed
962

Mathieu Giraud's avatar
Mathieu Giraud committed
963
964
        if (seg.isSegmented() 
            || it == --(sort_windows.end())) {
Mathieu Giraud's avatar
Mathieu Giraud committed
965
              // display window
Mikaël Salson's avatar
Mikaël Salson committed
966
              cout << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
967
968
		   << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-window"  << " " << windows_labels[it->first] << endl
		   << it->first << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
969

Mathieu Giraud's avatar
Mathieu Giraud committed
970
971
	      // display representative, possibly segmented
	      cout << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" << " " << seg.info << setfill(' ') << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
972

Mathieu Giraud's avatar
Mathieu Giraud committed
973
	      cout << representative.sequence << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
974
              break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
975
        }
Mathieu Giraud's avatar
Mathieu Giraud committed
976
        }
Mikaël Salson's avatar
Mikaël Salson committed
977
978
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
979
      cout << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
980
      out_windows.close();
Mikaël Salson's avatar
Mikaël Salson committed
981

Mikaël Salson's avatar
Mikaël Salson committed
982
983
984
985
986
987
      if (!very_detailed_cluster_analysis)
	{
	  continue ;
	}


Mathieu Giraud's avatar
Mathieu Giraud committed
988
      //$$ Very detailed cluster analysis (with sequences)
Mikaël Salson's avatar
Mikaël Salson committed
989
990
991
992
993

      list<string> msa;
      bool good_msa = false ;

      // TODO: do something if no sequences have been segmented !
Mathieu Giraud's avatar
Mathieu Giraud committed
994
      if (!more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
995
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
996
	  cout << "!! No segmented sequence, deleting clone" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
997
998
999
	  // continue ;
	} else 
        {
Mathieu Giraud's avatar
Mathieu Giraud committed
1000
          msa = multiple_seq_align(windows_file_name);
Mikaël Salson's avatar
Mikaël Salson committed
1001
        
Mathieu Giraud's avatar
Mathieu Giraud committed
1002
          // Alignment of windows
Mikaël Salson's avatar
Mikaël Salson committed
1003
1004
1005
          
          if (!msa.empty())
            {
Mathieu Giraud's avatar
Mathieu Giraud committed
1006
              if (msa.size() == sort_windows.size() + more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
1007
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1008
                  // cout << "clustalw parse: success" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1009
1010
1011
1012
                  good_msa = true ;
                }
              else
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1013
                  cout << "! clustalw parse: failed" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1014
1015
1016
1017
                }
            }
        }
      
Mathieu Giraud's avatar
Mathieu Giraud committed
1018
      //$$ Second pass: output clone, all representatives      
Mikaël Salson's avatar
Mikaël Salson committed
1019
1020
1021
1022
1023

      num_seq = 0 ;
      list <Sequence> representatives_this_clone ;
      string code_representative = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
1024
1025
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045

	num_seq++ ;

	string junc ;
	
	if (!good_msa)
	  {
	    junc = it->first ;
	  }
	else
	  {
	    junc = msa.back();
	    msa.pop_back();
	  }


	// Output all sequences

	if (output_sequences_by_cluster)
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
1046
	    out_sequences << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1047
1048
	    out_sequences << it->first << endl;

Mathieu Giraud's avatar
Mathieu Giraud committed
1049
	    list<Sequence> &sequences = windowsStorage->getReads(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
1050
1051
1052
1053
1054
1055
1056
	    
	    for (list<Sequence>::const_iterator itt = sequences.begin(); itt != sequences.end(); ++itt)
	      {
		out_sequences << *itt ;
	      }
	  }

Mathieu Giraud's avatar
Mathieu Giraud committed
1057
1058
1059
1060
1061
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);
Mikaël Salson's avatar
Mikaël Salson committed
1062

Mikaël Salson's avatar
Mikaël Salson committed
1063

Mathieu Giraud's avatar
Mathieu Giraud committed
1064
        if (representative != NULL_SEQUENCE) {
Mathieu Giraud's avatar
Mathieu Giraud committed
1065
1066
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
1067

Mathieu Giraud's avatar
Mathieu Giraud committed
1068
          FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
1069

Mathieu Giraud's avatar
Mathieu Giraud committed
1070
1071
          if (segment_D)
            seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
1072
		
Mathieu Giraud's avatar
Mathieu Giraud committed
1073
1074
1075
1076
          //cout << seg.toJson();
          json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);

          if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
1077
1078
1079
	  {
	    representatives_this_clone.push_back(seg.getSequence());
	  }
Mikaël Salson's avatar
Mikaël Salson committed
1080

Mathieu Giraud's avatar
Mathieu Giraud committed
1081
          /// TODO: et si pas isSegmented ?
Mikaël Salson's avatar
Mikaël Salson committed
1082

Mathieu Giraud's avatar
Mathieu Giraud committed
1083
          bool warning = false;
Mikaël Salson's avatar
Mikaël Salson committed
1084

Mathieu Giraud's avatar
Mathieu Giraud committed
1085
1086
          if (num_seq <= 20) /////
            {
Mathieu Giraud's avatar
Mathieu Giraud committed
1087
              cout << setw(20) << representative.label << " " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1088
              cout << "   " << junc ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1089
1090
1091
1092
              cout << " " << setw(WIDTH_NB_READS) << it->second << " " ;
              cout << (warning ? "§ " : "  ") ;
              cout << seg.info ;
              cout << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1093
1094
            }
        }
Mikaël Salson's avatar
Mikaël Salson committed
1095
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
1096
1097
      
      //$$ Display msa
Mikaël Salson's avatar
Mikaël Salson committed
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
      if (good_msa)
	{
	  cout << setw(20) << best_V << "    " << msa.back() << endl ;
	  msa.pop_back();

	  if (segment_D)
	    {
	      cout << setw(20) << best_D << "    " << msa.back() << endl ;
	      msa.pop_back();
	    }

	  cout << setw(20) << best_J << "    " << msa.back() << endl ;
	  msa.pop_back();
	}
 
      out_clone.close();
Mathieu Giraud's avatar
Mathieu Giraud committed
1114
      cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
1115
      
Mathieu Giraud's avatar
Mathieu Giraud committed
1116
1117
      //$$ Compare representatives of this clone
      cout << "Comparing representatives of this clone 2 by 2" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1118
1119
1120
1121
1122
1123
1124
1125
      // compare_all(representatives_this_clone);
      SimilarityMatrix matrix = compare_all(representatives_this_clone, true);
      cout << RawOutputSimilarityMatrix(matrix, 90);
    }

    out_edges.close() ;

    cout << "#### end of clones" << endl; 
Mathieu Giraud's avatar
Mathieu Giraud committed
1126
    cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
1127
  
Mathieu Giraud's avatar
Mathieu Giraud committed
1128
    //$$ Compare representatives of all clones
Mikaël Salson's avatar
Mikaël Salson committed
1129
1130
1131
1132
1133
1134

    if (detailed_cluster_analysis)
      {

    if (nb_edges)
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
1135
        cout << "Please review the " << nb_edges << " suggested edge(s) in " << out_dir+EDGES_FILENAME << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1136
1137
1138
      }

    cout << "Comparing clone representatives 2 by 2" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1139
1140
1141
    list<Sequence> first_representatives = keep_n_first<Sequence>(representatives,
                                                                  LIMIT_DISPLAY);
    SimilarityMatrix matrix = compare_all(first_representatives, true, 
Mikaël Salson's avatar
Mikaël Salson committed
1142
1143
1144
1145
1146
1147
1148
1149
1150
                                          representatives_labels);
    cout << RawOutputSimilarityMatrix(matrix, 90);

     }


    delete scorer;
    }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1151
1152
1153
    //$$ .json output: json_data_segment
    string f_json = out_dir + prefix_filename + "vidjil" + JSON_SUFFIX ; // TODO: retrieve basename from f_reads instead of "vidjil"
    cout << "  ==> " << f_json << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1154
    ofstream out_json(f_json.c_str()) ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1155
1156
1157
    
    JsonList *json;
    json=new JsonList();
Mikaël Salson's avatar
Mikaël Salson committed
1158
1159

    // Version/timestamp/git info
Mathieu Giraud's avatar
Mathieu Giraud committed
1160
1161
1162

    ostringstream stream_cmdline;
    for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
1163
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
    JsonArray json_nb_reads;
    json_nb_reads.add(nb_total_reads);

    JsonArray json_nb_segmented;
    json_nb_segmented.add(nb_segmented);
	
	JsonArray json_normalization_factor;
    json_normalization_factor.add( (float)  compute_normalization_one(norm_list, nb_segmented));
	
    //JsonArray normalization_names = json_normalization_names();
    //JsonArray normalization_res1 = json_normalization(norm_list, 1, nb_segmented);
    //JsonArray normalization_res5 = json_normalization(norm_list, 5, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
1176
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
    json->add("vidjil_json_version", VIDJIL_JSON_VERSION);
    json->add("timestamp", time_buffer);
    json->add("commandline", stream_cmdline.str());// TODO: escape "s in argv
    json->add("germline", germline_system);
    json->add("reads_total", json_nb_reads);
    json->add("reads_segmented", json_nb_segmented); 
	json->add("normalization_factor", json_normalization_factor ); 
	
    //json->add("normalizations", normalization_names);
    //json->add("resolution1", normalization_res1);
    //json->add("resolution5", normalization_res5);

    JsonArray jsonSortedWindows = windowsStorage->sortedWindowsToJsonArray(json_data_segment,
                                                                           norm_list,
                                                                           nb_segmented);
    json->add("windows", jsonSortedWindows);
    out_json << json->toString();
    
    delete index ;
    delete json;
    delete windowsStorage;
    if (output_unsegmented)
      delete out_unsegmented;
Mikaël Salson's avatar
Mikaël Salson committed
1200
  } else if (command == CMD_SEGMENT) {
Mathieu Giraud's avatar
Mathieu Giraud committed
1201
    //$$ CMD_SEGMENT
Mikaël Salson's avatar
Mikaël Salson committed
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
    ////////////////////////////////////////
    //       V(D)J SEGMENTATION           //
    ////////////////////////////////////////

    // déja déclaré ?
    //reads = OnlineFasta(f_reads, 1, " ");
    
    while (reads->hasNext()) 
      {
        reads->next();
        FineSegmenter s(reads->getSequence(), rep_V, rep_J, delta_min, delta_max, segment_cost);
	if (s.isSegmented()) {
	  if (segment_D)
	  s.FineSegmentD(rep_V, rep_D, rep_J);
          cout << s << endl;
        } else {
Mathieu Giraud's avatar
Mathieu Giraud committed
1218
1219
1220
          cout << "Unable to segment" << endl;
          cout << reads->getSequence();
          cout << endl << endl;
Mikaël Salson's avatar