vidjil.cpp 43.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3
4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
40
41
42
43
44
45
46
47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50
51
52
53
54
55
56
57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60
61
62
63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69
70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
71
#define RATIO_READS_CLONE 0.0
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74
75
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
76
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
77
 
78
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
79
80

#define OUT_DIR "./out/" 
81
#define CLONES_FILENAME "clones.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
82
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
83
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
84
85
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
86
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
87
88
89
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
90
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
91
92
93

// "tests/data/leukemia.fa" 

94
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
95
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
96
#define DEFAULT_W_D    60
97
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
98
99

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
100
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
101
102

#define DEFAULT_DELTA_MIN_D  0
103
#define DEFAULT_DELTA_MAX_D  80
Mikaël Salson's avatar
Mikaël Salson committed
104

Mikaël Salson's avatar
Mikaël Salson committed
105
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
106
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
107
#define MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE 1.0
Mathieu Giraud's avatar
Mathieu Giraud committed
108

Mikaël Salson's avatar
Mikaël Salson committed
109
110
111
112
113
114
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

115
// warn
116
#define WARN_PERCENT_SEGMENTED 40
117

Mikaël Salson's avatar
Mikaël Salson committed
118
119
120
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
121
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
122
123
124

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
125
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
126

Mathieu Giraud's avatar
Mathieu Giraud committed
127
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
128
129
130
131
132
133
134
135

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
136
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
137
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
138
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation (not recommended)" << endl
139
       << "  \t\t" << COMMAND_GERMLINES << "  \t discover all germlines" << endl
Mikaël Salson's avatar
Mikaël Salson committed
140
141
142
143
144
145
146
147
148
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
149
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
150
#ifndef NO_SPACED_SEEDS
151
       << "  (use either -s or -k option, but not both)" << endl
152
153
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
154
#endif
155
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
156
157
158
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
159
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
160
161
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
162
163
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
164
165
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
166
167
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
168
169
170
171
172
173
174
175
176
177
178
179
180
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
181
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
182
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
183
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
184
185
       << endl

186
       << "Fine segmentation options (second pass, see warning in doc/README)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
187
188
189
190
191
192
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

193
194
195
       << "Debug" << endl
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
       << "                and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
196
197
198
199
200
       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
201
       << "  -x            do not compute representative sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
202
203
204
205
206
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
207
208
       << "  " << progname << "             -G germline/IGH             -d data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 5 -R 5  -d data/Stanford_S22.fasta" << endl
209
       << "  " << progname << " -c segment  -G germline/IGH             -d data/Stanford_S22.fasta   # (only for testing)" << endl
210
       << "  " << progname << " -c germlines                               data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
211
212
213
214
215
216
217
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
218
219
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
220
221
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
222
223
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
224
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
225
226
227
228
229
230
231
232
233
234
235
236
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
237
238
239
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
240
241
242
243
244
245
246
247
248
249
250
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
251
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
252

Mikaël Salson's avatar
Mikaël Salson committed
253
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
254
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
255
256
257
258
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
259
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
260
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
261

Mikaël Salson's avatar
Mikaël Salson committed
262
263
264
265
266
267
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
268
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
269
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
270
271
272

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
273
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
274
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
275
276
277

  char c ;

278
279
  int options_s_k = 0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
280
281
  //$$ options: getopt

282
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
283
284
285
286
287
288
289
290
291
292
293
294

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
295
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
296
297
298
299
300
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
301
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
302
	break;
Mikaël Salson's avatar
Mikaël Salson committed
303
304
305
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
306
307
308
309
310
311
312
313
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
314
315
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
316
317
        else if (!strcmp(COMMAND_GERMLINES,optarg))
          command = CMD_GERMLINES;
Mikaël Salson's avatar
Mikaël Salson committed
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
343
344
345
346
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
347
348
349
350
351
352
353
354
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
355
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

378
379
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
380
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
381
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
382
383
384
385
386
387
388
389
390
391
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
392
393
394
      case 'z':
	max_clones = atoi(optarg);
        break;
395
396
397
398
399
400
401
402
403
404

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
405
406
407
408
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
409
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
410
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
411
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
440
441
442
443

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
444
445
      }

Mikaël Salson's avatar
Mikaël Salson committed
446
447
448
449
450
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
451
452
453
454
455
456
  if (options_s_k > 1)
    {
      cout << "use at most one -s or -k option." << endl ;
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
457
458
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
475
476

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
477

478
  size_t min_cover_representative = (size_t) (MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE * min_reads_clone) ;
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503

  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
    cout << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
504
505
506
507
508
509
510
511
512
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
513
514
515
516
517
518
519
520
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
521
522
523
524
525
526
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
527
528
529
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
530
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
531

Mikaël Salson's avatar
Mikaël Salson committed
532
533
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
534
535

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
536
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
537
538
539
540
541
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
542
543
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
544
545
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
546
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
547
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
548
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
549
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
550
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
551
552
553
554
555
556
557
558
559
560
561
562

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
563
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
564
565
566
567
568
569


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
570
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
571
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
572
573
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
574
575
#endif

576
577
578
579
580
581

  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
      list< char* > f_germlines ;
      f_germlines.push_back("germline/TRGV.fa");
      f_germlines.push_back("germline/TRGJ.fa");
      f_germlines.push_back("germline/IGHV.fa");
      f_germlines.push_back("germline/IGHD.fa");
      f_germlines.push_back("germline/IGHJ.fa");

      // Read germline and build one unique index

      bool rc = true ;   
      IKmerStore<KmerStringAffect>  *index = KmerStoreFactory::createIndex<KmerStringAffect>(seed, rc);
      map <string, int> stats_kmer;

      for (list< char* >::const_iterator it = f_germlines.begin(); it != f_germlines.end(); ++it)
	{
	  Fasta rep(*it, 2, "|", cout);
	  index->insert(rep, *it);
	  stats_kmer[string(*it)] = 0 ;
	}
601
602
603
604
605
606
607
608
609
610
611
612

      // Open read file (copied frow below)

      OnlineFasta *reads;

      try {
	reads = new OnlineFasta(f_reads, 1, " ");
      } catch (const std::ios_base::failure e) {
	cout << "Error while reading reads file " << f_reads << ": " << e.what()
	     << endl;
	exit(1);
      }
613
      
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
      // Loop through all reads

      int nb_reads = 0 ;
      while (reads->hasNext())
	{
	  reads->next();
	  nb_reads++;
	  string seq = reads->getSequence().sequence;

	  KmerAffectAnalyser<KmerStringAffect> *kaa = new KmerAffectAnalyser<KmerStringAffect>(*index, seq);

	  for (int i = 0; i < kaa->count(); i++) 
	    { 
	      KmerStringAffect ksa = kaa->getAffectation(i);
	      // if (! it.isAmbiguous() && ! it.isUnknown()) { }

	      stats_kmer[ksa.label]++ ;
	    }
	}

      // Display statistics

      cout << "  <== " << nb_reads << " reads" << endl ;
      for (list< char* >::const_iterator it = f_germlines.begin(); it != f_germlines.end(); ++it)
	{
	  cout << setw(12) << stats_kmer[*it] << "\t" << *it << endl ;
	}

642
643
644
645
      exit(0);
    }


Mikaël Salson's avatar
Mikaël Salson committed
646
  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
647
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
648
649
650
651

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
652
653
654
655
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
656
657
658
659
660
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
661
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
662
663
664
665
666
667
668
669
670
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
671
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
672
673

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
674
675
676
677
678
679
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
680
681
682


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
683
684
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
685
686
687
688
689
690
691
692
693

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
694
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
695

Mathieu Giraud's avatar
Mathieu Giraud committed
696
697
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
698

Mathieu Giraud's avatar
Mathieu Giraud committed
699
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
700
    cout << "  ==> " << f_segmented << "\t(result of window detection)" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
701
702
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
703
 
Mathieu Giraud's avatar
Mathieu Giraud committed
704
705
706
707
708
709
710
711
712
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
713
                                                delta_max, windows_labels);
Mathieu Giraud's avatar
Mathieu Giraud committed
714
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
715
716


Mathieu Giraud's avatar
Mathieu Giraud committed
717
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
718

Mikaël Salson's avatar
Mikaël Salson committed
719

Mathieu Giraud's avatar
Mathieu Giraud committed
720
721
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
722

Mathieu Giraud's avatar
Mathieu Giraud committed
723
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
724
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
725
726
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
727
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
728
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
729
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
730

Mathieu Giraud's avatar
Mathieu Giraud committed
731
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
732
	<< " in " << nb_segmented << " segments"
733
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
734
735
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
736
    // warn if there are too few segmented sequences
737
    if (ratio_segmented < WARN_PERCENT_SEGMENTED)
738
739
740
741
742
      {
        cout << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
        cout << "  ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
743
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
744

Mikaël Salson's avatar
Mikaël Salson committed
745
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
746
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
747
748
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
749

Mathieu Giraud's avatar
Mathieu Giraud committed
750
751
752
753
754
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
755
756
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
757
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
758
    
Mikaël Salson's avatar
Mikaël Salson committed
759
760

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
761
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
762
	
Mathieu Giraud's avatar
Mathieu Giraud committed
763
764
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
765
766

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
767
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
768
769
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
770
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
771
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
772

Mathieu Giraud's avatar
Mathieu Giraud committed
773
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
774
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
775

Mathieu Giraud's avatar
Mathieu Giraud committed
776
777
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
778
779
780
781
782


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
783
784
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
785

Mathieu Giraud's avatar
Mathieu Giraud committed
786
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
787
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
788
789
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
790

791
792
793
794
    if (windowsStorage->size() == 0)
      {
	cout << "  ! No windows with current parameters." << endl;
      }
Mikaël Salson's avatar
Mikaël Salson committed
795
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
796
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
797

Mathieu Giraud's avatar
Mathieu Giraud committed
798
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
799
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
800
801
802

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
803
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
804
805
806
807
808
809
810

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
811
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
812
813
814
815
816
817
818
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
819
820
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
821
822
823
824
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
825
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
826
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
827
828
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
829
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
830
 
831
832
833
834
835
836
837
838
    if (clones_windows.size() == 0)
      {
	cout << "  ! No clones with current parameters." << endl;
	cout << "  ! See the 'Limits to report a clone' options (-R, -%, -z, -A)." << endl;
      }
    else // clones_windows.size() > 0
      { 

Mathieu Giraud's avatar
Mathieu Giraud committed
839
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
840
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
841
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
842
843
844

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
845
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
846
847
848
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
849
850
851
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
852
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
853
	  
Mikaël Salson's avatar
Mikaël Salson committed
854
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
855
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
856
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
857
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

874
875
    cout << endl;

Mikaël Salson's avatar
Mikaël Salson committed
876
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
877
    //$$ Output clones
878
879
880
881
882
883
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
884
885

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
886
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
887
888
889
890
891
892

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;
893
    int clones_without_representative = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
894
895
896

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
897
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
898
899
        << endl ;

900
901
902
    string f_clones = out_dir + prefix_filename + CLONES_FILENAME ;
    cout << "  ==> " << f_clones << "   \t(main result file)" << endl ;
    ofstream out_clones(f_clones.c_str()) ;
Mikaël Salson's avatar
Mikaël Salson committed
903

904
905
    cout << "  ==> " << out_seqdir + prefix_filename + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ; 
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
906
907
908
909
910
911
912
913

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
914

915
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
916
917
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
918
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
919
920
921
922

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
923
924

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
925
926
927
928
929
930
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
931
      
Mathieu Giraud's avatar
Mathieu Giraud committed
932
933
934
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
935

Mathieu Giraud's avatar
Mathieu Giraud committed
936
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
937
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
938
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
939
940
941

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
942
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
943
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
944
945

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
946
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
947
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
948
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
949
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
950

Mathieu Giraud's avatar
Mathieu Giraud committed
951
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
952
953

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
954

Mathieu Giraud's avatar
Mathieu Giraud committed
955
956
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
957
958
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
959
960
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
961
962
963

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
964
965
966
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
967
968
969
970
971
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
972
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
973
974
975
976
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
977
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
978
979
980
981

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
982
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
983
      
Mathieu Giraud's avatar
Mathieu Giraud committed
984
985
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
986

Mikaël Salson's avatar
Mikaël Salson committed
987
988
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
989

Mikaël Salson's avatar
Mikaël Salson committed
990
991
992
993
994

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
995
996
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
997
998
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
999
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1000
1001
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
1002
1003
1004
1005
1006
1007
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

1008
1009
1010
1011
1012
1013
1014

        if (representative == NULL_SEQUENCE) {
	  clones_without_representative++ ;
	  if (verbose)
	    cout << "# (no representative sequence with current parameters)" ;

        } else {
Mathieu Giraud's avatar
Mathieu Giraud committed
1015
	//$$ There is one representative, FineSegmenter
1016
1017


Mathieu Giraud's avatar
Mathieu Giraud committed
1018
1019
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
1020
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
1021
	
Mikaël Salson's avatar
Mikaël Salson committed
1022
1023
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
1024
	
Mathieu Giraud's avatar
Mathieu Giraud committed
1025
1026
1027
1028
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
1029
1030
1031
	  {
	    
	    // As soon as one representative is segmented
Mikaël Salson's avatar
Mikaël Salson committed
1032

Mathieu Giraud's avatar
Mathieu Giraud committed
1033
1034
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
1035

Mathieu Giraud's avatar
Mathieu Giraud committed
1036
1037
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
1038

Mathieu Giraud's avatar
Mathieu Giraud committed
1039
1040
1041
1042
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
1043
1044
            

Mathieu Giraud's avatar
Mathieu Giraud committed
1045
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
1046
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1047
1048
1049
1050
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
1051

Mathieu Giraud's avatar
Mathieu Giraud committed
1052
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
1053
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1054
1055
1056
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
1057
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
1058
1059
1060
1061
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
1062
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1063
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
1064
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1065
1066
1067
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
1068
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
1069
1070
1071
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
1072
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1073
1074
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
1075

Mathieu Giraud's avatar
Mathieu Giraud committed
1076
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
1077
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1078
1079
1080
1081
1082
1083
1084
1085
1086
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
1087
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1088
1089
1090
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
1091

Mathieu Giraud's avatar
Mathieu Giraud committed
1092
1093
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
1094

Mathieu Giraud's avatar
Mathieu Giraud committed
1095
1096
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1097
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
1098

Mathieu Giraud's avatar
Mathieu Giraud committed
1099
1100
1101
1102
1103
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1104
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
1105
1106
1107
1108
1109
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
1110

Mathieu Giraud's avatar
Mathieu Giraud committed
1111
1112
1113
              out_clone << seg ;
              out_clone << endl ;
          }
Mikaël Salson's avatar
Mikaël Salson committed
1114

Mathieu Giraud's avatar
Mathieu Giraud committed
1115
1116
        if (seg.isSegmented() 
            || it == --(sort_windows.end())) {
1117
1118
1119
1120
	  // Store the representative and its label
          representatives.push_back(representative);
          representatives_labels.push_back("#" + string_of_int(num_clone));

Mathieu Giraud's avatar
Mathieu Giraud committed
1121
              // display window
Mikaël Salson's avatar
Mikaël Salson committed
1122
              cout << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
1123
1124
		   << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-window"  << " " << windows_labels[it->first] << endl
		   << it->first << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1125

1126
1127
1128
	      // display representative, possibly segmented...
	      // (TODO: factorize)
	      // ... on stdout
Mathieu Giraud's avatar
Mathieu Giraud committed
1129
1130
	      cout << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" << " " << seg.info << setfill(' ') << endl ;
	      cout << representative.sequence << endl;
1131
1132

	      // ... and in out_clones
1133
1134
1135
1136
	      out_clones << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" 
			 << "-" << setfill('0') << setw(WIDTH_NB_READS) << clone_nb_reads 
			 << "-" << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%"
			 << " " << seg.info << setfill(' ') << endl ;
1137
1138
	      out_clones << representative.sequence << endl;

Mathieu Giraud's avatar
Mathieu Giraud committed
1139
              break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1140
        }
Mathieu Giraud's avatar
Mathieu Giraud committed
1141
        }
Mikaël Salson's avatar
Mikaël Salson committed
1142
1143
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
1144
      cout << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1145
      out_windows.close();
Mikaël Salson's avatar
Mikaël Salson committed
1146

Mikaël Salson's avatar
Mikaël Salson committed
1147
1148
1149
1150
1151
1152
      if (!very_detailed_cluster_analysis)
	{
	  continue ;
	}


1153
      //$$ Very detailed cluster analysis (with sequences) // NOT USED NOW
Mikaël Salson's avatar
Mikaël Salson committed
1154
1155
1156
1157
1158

      list<string> msa;
      bool good_msa = false ;

      // TODO: do something if no sequences have been segmented !
Mathieu Giraud's avatar
Mathieu Giraud committed
1159
      if (!more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
1160
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
1161
	  cout << "!! No segmented sequence, deleting clone" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1162
1163
1164
	  // continue ;
	} else 
        {
Mathieu Giraud's avatar
Mathieu Giraud committed
1165
          msa = multiple_seq_align(windows_file_name);
Mikaël Salson's avatar
Mikaël Salson committed
1166
        
Mathieu Giraud's avatar
Mathieu Giraud committed
1167
          // Alignment of windows
Mikaël Salson's avatar
Mikaël Salson committed
1168
1169
1170
          
          if (!msa.empty())
            {
Mathieu Giraud's avatar
Mathieu Giraud committed
1171
              if (msa.size() == sort_windows.size() + more_windows)
Mikaël Salson's avatar