vidjil.cpp 41.2 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
2
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
Mathieu Giraud's avatar
Mathieu Giraud committed
3
4
  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, Marc Duez <marc.duez@lifl.fr>
Mikaël Salson's avatar
Mikaël Salson committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
20
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
38
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
40
41
42
43
44
45
46
47
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
48
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
49
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
50
51
52
53
54
55
56
57

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#define VIDJIL_JSON_VERSION "2014.02"
Mikaël Salson's avatar
Mikaël Salson committed
59

Mathieu Giraud's avatar
Mathieu Giraud committed
60
61
62
63
//$$ #define (mainly default options)

#define DEFAULT_GERMLINE_SYSTEM "IGH" 
#define DEFAULT_V_REP  "./germline/IGHV.fa"
Mikaël Salson's avatar
Mikaël Salson committed
64
#define DEFAULT_D_REP  "./germline/IGHD.fa" 
Mathieu Giraud's avatar
Mathieu Giraud committed
65
#define DEFAULT_J_REP  "./germline/IGHJ.fa"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
Mathieu Giraud's avatar
Mathieu Giraud committed
68
#define MIN_READS_WINDOW 10
Mikaël Salson's avatar
Mikaël Salson committed
69
70
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
71
#define RATIO_READS_CLONE 0.0
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
#define COMMAND_WINDOWS "windows"
Mikaël Salson's avatar
Mikaël Salson committed
74
75
76
#define COMMAND_ANALYSIS "clones"
#define COMMAND_SEGMENT "segment"
 
Mathieu Giraud's avatar
Mathieu Giraud committed
77
enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
Mikaël Salson's avatar
Mikaël Salson committed
78
79

#define OUT_DIR "./out/" 
80
#define CLONES_FILENAME "clones.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
81
#define CLONE_FILENAME "clone.fa-"
Mathieu Giraud's avatar
Mathieu Giraud committed
82
#define WINDOWS_FILENAME "windows.fa"
Mikaël Salson's avatar
Mikaël Salson committed
83
84
#define SEQUENCES_FILENAME "sequences.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
85
#define UNSEGMENTED_FILENAME "unsegmented.fa"
Mikaël Salson's avatar
Mikaël Salson committed
86
87
88
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
Mathieu Giraud's avatar
Mathieu Giraud committed
89
#define JSON_SUFFIX ".data"
Mikaël Salson's avatar
Mikaël Salson committed
90
91
92

// "tests/data/leukemia.fa" 

93
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
94
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
95
#define DEFAULT_W_D    60
96
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
97
98

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
99
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
100
101

#define DEFAULT_DELTA_MIN_D  0
102
#define DEFAULT_DELTA_MAX_D  80
Mikaël Salson's avatar
Mikaël Salson committed
103

Mikaël Salson's avatar
Mikaël Salson committed
104
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
105
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
106
#define MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE 1.0
Mathieu Giraud's avatar
Mathieu Giraud committed
107

Mikaël Salson's avatar
Mikaël Salson committed
108
109
110
111
112
113
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

114
// warn
115
#define WARN_PERCENT_SEGMENTED 40
116

Mikaël Salson's avatar
Mikaël Salson committed
117
118
119
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
Mikaël Salson's avatar
Mikaël Salson committed
120
#define WIDTH_WINDOW_POS 14+WIDTH_NB_CLONES
Mikaël Salson's avatar
Mikaël Salson committed
121
122
123

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
124
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
125

Mathieu Giraud's avatar
Mathieu Giraud committed
126
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
127
128
129
130
131
132
133
134

extern int optind, optopt, opterr;

void usage(char *progname)
{
  cerr << "Usage: " << progname << " [options] <reads.fa>" << endl << endl;

  cerr << "Command selection" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
135
       << "  -c <command> \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
136
       << "  \t\t" << COMMAND_ANALYSIS  << "  \t clone analysis" << endl 
137
       << "  \t\t" << COMMAND_SEGMENT   << "  \t V(D)J segmentation (not recommended)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
138
139
140
141
142
143
144
145
146
       << endl       

       << "Germline databases" << endl
       << "  -V <file>     V germline multi-fasta file" << endl
       << "  -D <file>     D germline multi-fasta file (automatically implies -d)" << endl
       << "  -J <file>     J germline multi-fasta file" << endl
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
147
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
148
#ifndef NO_SPACED_SEEDS
149
       << "  (use either -s or -k option, but not both)" << endl
150
151
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
152
#endif
153
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
154
155
156
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
157
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
158
159
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
160
161
       << "Window annotations" << endl
       << "  -l <file>     labels for some windows -- these windows will be kept even if some limits are not reached" << endl
Mikaël Salson's avatar
Mikaël Salson committed
162
163
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
164
165
       << "Limit to keep a window" << endl
       << "  -r <nb>       minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
166
167
168
169
170
171
172
173
174
175
176
177
178
       << endl

       << "Clusterisation" << endl
       << "  -e <file>     manual clusterisation -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
       << endl

       << "Limits to report a clone" << endl
       << "  -R <nb>       minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
179
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
180
       << "  -z <nb>       maximal number of clones reported (0: no limit) (default: " << MAX_CLONES << ")" << endl
181
       << "  -A            reports all clones (-r 0 -R 1 -% 0 -z 0), to be used only on very small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
182
183
       << endl

184
       << "Fine segmentation options (second pass, see warning in doc/README)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
185
186
187
188
189
190
       << "  -d            segment into V(D)J components instead of VJ " << endl
       << "  -m <int>      minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
       << endl

191
192
193
       << "Debug" << endl
       << "  -u            output unsegmented sequences (default: " << UNSEGMENTED_FILENAME << ")" << endl
       << "                and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
194
195
196
197
198
       << "Output" << endl
       << "  -o <dir>      output directory (default: " << OUT_DIR << ")" <<  endl
       << "  -p <string>   prefix output filenames by the specified string" << endl
    
       << "  -a            output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
199
       << "  -x            do not compute representative sequences" << endl
Mikaël Salson's avatar
Mikaël Salson committed
200
201
202
203
204
       << "  -v            verbose mode" << endl
       << endl        

       << endl 
       << "Examples (see doc/README)" << endl
205
206
       << "  " << progname << "             -G germline/IGH             -d data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -G germline/IGH  -r 5 -R 5  -d data/Stanford_S22.fasta" << endl
207
       << "  " << progname << " -c segment  -G germline/IGH             -d data/Stanford_S22.fasta   # (only for testing)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
208
209
210
211
212
213
214
    ;
  exit(1);
}

int main (int argc, char **argv)
{
  cout << "# Vidjil -- V(D)J recombinations analysis <http://bioinfo.lifl.fr/vidjil>" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
215
216
       << "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl 
Mikaël Salson's avatar
Mikaël Salson committed
217
218
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
219
220
  //$$ options: defaults

Mikaël Salson's avatar
Mikaël Salson committed
221
  string germline_system = DEFAULT_GERMLINE_SYSTEM ;
Mikaël Salson's avatar
Mikaël Salson committed
222
223
224
225
226
227
228
229
230
231
232
233
  string f_rep_V = DEFAULT_V_REP ;
  string f_rep_D = DEFAULT_D_REP ;
  string f_rep_J = DEFAULT_J_REP ;
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
  string prefix_filename = "";

  string out_dir = OUT_DIR;
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
234
235
236
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
237
238
239
240
241
242
243
244
245
246
247
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
  
  
  int save_comp = 0;
  int load_comp = 0;
  int segment_D = 0;
  
  int verbose = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
248
  int command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
249

Mikaël Salson's avatar
Mikaël Salson committed
250
  int max_clones = MAX_CLONES ;
Mathieu Giraud's avatar
Mathieu Giraud committed
251
  int min_reads_window = MIN_READS_WINDOW ;
Mikaël Salson's avatar
Mikaël Salson committed
252
253
254
255
  int min_reads_clone = MIN_READS_CLONE ;
  float ratio_reads_clone = RATIO_READS_CLONE;
  // int average_deletion = 4;     // Average number of deletion in V or J

Mathieu Giraud's avatar
Mathieu Giraud committed
256
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
257
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
258

Mikaël Salson's avatar
Mikaël Salson committed
259
260
261
262
263
264
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine

  bool output_sequences_by_cluster = false;
  bool detailed_cluster_analysis = true ;
Mikaël Salson's avatar
Mikaël Salson committed
265
  bool very_detailed_cluster_analysis = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
266
  bool output_unsegmented = false;
Mikaël Salson's avatar
Mikaël Salson committed
267
268
269

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
270
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
271
  string normalization_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
272
273
274

  char c ;

275
276
  int options_s_k = 0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
277
278
  //$$ options: getopt

279
  while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:R:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:u")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
280
281
282
283
284
285
286
287
288
289
290
291

    switch (c)
      {
      case 'h':
        usage(argv[0]);
      case 'a':
	output_sequences_by_cluster = true;
	break;
      case 'd':
	segment_D = 1 ;
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
Mikaël Salson's avatar
Mikaël Salson committed
292
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
293
294
295
296
297
	break;
      case 'e':
	forced_edges = optarg;
	break;
      case 'l':
Mathieu Giraud's avatar
Mathieu Giraud committed
298
	windows_labels_file = optarg; 
Mikaël Salson's avatar
Mikaël Salson committed
299
	break;
Mikaël Salson's avatar
Mikaël Salson committed
300
301
302
      case 'Z':
	normalization_file = optarg; 
	break;
Mikaël Salson's avatar
Mikaël Salson committed
303
304
305
306
307
308
309
310
      case 'x':
	detailed_cluster_analysis = false;
	break;
      case 'c':
        if (!strcmp(COMMAND_ANALYSIS,optarg))
          command = CMD_ANALYSIS;
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
311
312
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
Mikaël Salson's avatar
Mikaël Salson committed
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
        else {
          cerr << "Unknwown command " << optarg << endl;
	  usage(argv[0]);
        }
        break;
      case 'v':
	verbose += 1 ;
	break;

      // Germline

      case 'V':
	f_rep_V = optarg;
	break;

      case 'D':
	f_rep_D = optarg;
        segment_D = 1;
	break;
        
      case 'J':
	f_rep_J = optarg;
	break;

      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
338
339
340
341
	germline_system = string(optarg);
	f_rep_V = (germline_system + "V.fa").c_str() ;
	f_rep_D = (germline_system + "D.fa").c_str() ;
	f_rep_J = (germline_system + "J.fa").c_str() ;
Mikaël Salson's avatar
Mikaël Salson committed
342
343
344
345
346
347
348
349
	// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
	break;

      // Algorithm

      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
350
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

      case 'o':
        out_dir = optarg ;
        break;

      case 'p':
        prefix_filename = optarg;
        break;

373
374
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
375
      case 'r':
Mathieu Giraud's avatar
Mathieu Giraud committed
376
	min_reads_window = atoi(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
377
378
379
380
381
382
383
384
385
386
        break;

      case '%':
	ratio_reads_clone = atof(optarg);
	break;

      case 'R':
	min_reads_clone = atoi(optarg);
        break;

Mikaël Salson's avatar
Mikaël Salson committed
387
388
389
      case 'z':
	max_clones = atoi(optarg);
        break;
390
391
392
393
394
395
396
397
398
399

      case 'A': // --all
	min_reads_window = 1 ;
	ratio_reads_clone = 0 ;
	min_reads_clone = 0 ;
	max_clones = 0 ;
	break ;

      // Seeds

Mikaël Salson's avatar
Mikaël Salson committed
400
401
402
403
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
404
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
405
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
406
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#endif
        break;
	
      // Clusterisation
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
      case 't':
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
435
436
437
438

      case 'u':
        output_unsegmented = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
439
440
      }

Mikaël Salson's avatar
Mikaël Salson committed
441
442
443
444
445
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
446
447
448
449
450
451
  if (options_s_k > 1)
    {
      cout << "use at most one -s or -k option." << endl ;
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
452
453
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
      cout << "wrong number of arguments." << endl ;
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
470
471

  //$$ options: post-processing+display
Mathieu Giraud's avatar
Mathieu Giraud committed
472

473
  size_t min_cover_representative = (size_t) (MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE * min_reads_clone) ;
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498

  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
	seed = "#######-######" ;

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
	seed = "######-######" ; 
      else // TRD, TRG, IGK, IGL
	seed = "#####-#####" ; 

      k = seed_weight(seed);
    }
#else
  {
    cout << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
499
500
501
502
503
504
505
506
507
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
      cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ;
      exit(1);
    }
#endif

Mikaël Salson's avatar
Mikaël Salson committed
508
509
510
511
512
513
514
515
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
516
517
518
519
520
521
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
    perror("Directory creation");
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
522
523
524
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
525
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
526

Mikaël Salson's avatar
Mikaël Salson committed
527
528
  map <string, pair <string, float> > normalization = load_map_norm(normalization_file);

Mikaël Salson's avatar
Mikaël Salson committed
529
530

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
531
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
532
533
534
535
536
537
538
    break;
  case CMD_ANALYSIS: cout << "Analysing clones" << endl; 
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
539
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
540
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
541
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
542
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
543
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
544
545
546
547
548
549
550
551
552
553
554
555

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
556
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
557
558
559
560
561
562


  //////////////////////////////////
  // Display version information or git log

#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
563
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
564
#else
Mathieu Giraud's avatar
Mathieu Giraud committed
565
566
  if (system("git log -1 --pretty=format:'# git: %h (%ci)' --abbrev-commit 2> /dev/null") == 0){}
  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
567
568
569
#endif

  //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
570
  //$$ Read sequence files
Mikaël Salson's avatar
Mikaël Salson committed
571
572
573
574

  if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
    f_rep_D = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
575
576
577
578
  Fasta rep_V(f_rep_V, 2, "|", cout);
  Fasta rep_D(f_rep_D, 2, "|", cout);
  Fasta rep_J(f_rep_J, 2, "|", cout);

Mikaël Salson's avatar
Mikaël Salson committed
579
580
581
582
583
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
  } catch (const std::ios_base::failure e) {
Mathieu Giraud's avatar
Mathieu Giraud committed
584
    cout << "Error while reading reads file " << f_reads << ": " << e.what()
Mikaël Salson's avatar
Mikaël Salson committed
585
586
587
588
589
590
591
592
593
        << endl;
    exit(1);
  }

  out_dir += "/";

  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
594
  if (command == CMD_ANALYSIS || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
595
596

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
597
598
599
600
601
602
    cout << "# seed = " << seed << "," ;
    cout << " weight = " << seed_weight(seed) << "," ;
    cout << " span = " << seed.size() << endl ;
    cout << "# k = " << k << "," ;
    cout << " w = " << w << "," ;
    cout << " delta = [" << delta_min << "," << delta_max << "]" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
603
604
605


    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
606
607
    //$$ Build Kmer indexes
    cout << "Build Kmer indexes" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
608
609
610
611
612
613
614
615
616

    bool rc = true ;
    
    IKmerStore<KmerAffect>  *index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
    index->insert(rep_V, "V");
    index->insert(rep_J, "J");

  
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
617
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
618

Mathieu Giraud's avatar
Mathieu Giraud committed
619
620
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
621

Mathieu Giraud's avatar
Mathieu Giraud committed
622
    string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
623
    cout << "  ==> " << f_segmented << "\t(result of window detection)" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
624
625
    ofstream out_segmented(f_segmented.c_str()) ;
    ofstream *out_unsegmented = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
626
 
Mathieu Giraud's avatar
Mathieu Giraud committed
627
628
629
630
631
632
633
634
635
    WindowExtractor we;
    we.setSegmentedOutput(&out_segmented);
    if (output_unsegmented) {
      string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
    WindowsStorage *windowsStorage = we.extract(reads, index, w, delta_min, 
636
                                                delta_max, windows_labels);
Mathieu Giraud's avatar
Mathieu Giraud committed
637
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
638
639


Mathieu Giraud's avatar
Mathieu Giraud committed
640
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
641

Mikaël Salson's avatar
Mikaël Salson committed
642

Mathieu Giraud's avatar
Mathieu Giraud committed
643
644
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
645

Mathieu Giraud's avatar
Mathieu Giraud committed
646
    cout << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
647
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
648
649
	<< endl ;

Mikaël Salson's avatar
Mikaël Salson committed
650
    // nb_segmented is the main denominator for the following (but will be normalized)
Mathieu Giraud's avatar
Mathieu Giraud committed
651
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
652
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
653

Mathieu Giraud's avatar
Mathieu Giraud committed
654
    cout << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
655
	<< " in " << nb_segmented << " segments"
656
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
657
658
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
659
    // warn if there are too few segmented sequences
660
    if (ratio_segmented < WARN_PERCENT_SEGMENTED)
661
662
663
664
665
      {
        cout << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
        cout << "  ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
666
    cout << "                                  #      av. length" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
667

Mikaël Salson's avatar
Mikaël Salson committed
668
    for (int i=0; i<STATS_SIZE; i++)
Mikaël Salson's avatar
Mikaël Salson committed
669
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
670
671
	cout << "   " << left << setw(20) << segmented_mesg[i] 
             << " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
Mikaël Salson's avatar
Mikaël Salson committed
672

Mathieu Giraud's avatar
Mathieu Giraud committed
673
674
675
676
677
	if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
	  cout << "      " << setw(5) << fixed << setprecision(1) 
               << we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
	
	cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
678
679
      }
    
Mathieu Giraud's avatar
Mathieu Giraud committed
680
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
681
    
Mikaël Salson's avatar
Mikaël Salson committed
682
683

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
684
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
685
	
Mathieu Giraud's avatar
Mathieu Giraud committed
686
687
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
688
689

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
690
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
691
692
	//////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
693
	string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
694
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
695

Mathieu Giraud's avatar
Mathieu Giraud committed
696
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
697
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
698

Mathieu Giraud's avatar
Mathieu Giraud committed
699
700
	//$$ Normalization
	list< pair <float, int> > norm_list = compute_normalization_list(windowsStorage->getMap(), normalization, nb_segmented);
Mikaël Salson's avatar
Mikaël Salson committed
701
702
703
704
705


    if (command == CMD_ANALYSIS) {

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
706
707
    //$$ min_reads_window (ou label)
    cout << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
708

Mathieu Giraud's avatar
Mathieu Giraud committed
709
    pair<int, int> info_remove = windowsStorage->keepInterestingWindows((size_t) min_reads_window);
Mikaël Salson's avatar
Mikaël Salson committed
710
	 
Mathieu Giraud's avatar
Mathieu Giraud committed
711
712
    cout << "  ==> keep " <<  windowsStorage->size() << " windows in " << info_remove.second << " reads" ;
    cout << " (" << setprecision(3) << 100 * (float) info_remove.second / nb_total_reads << "%)  " << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
713

714
715
716
717
    if (windowsStorage->size() == 0)
      {
	cout << "  ! No windows with current parameters." << endl;
      }
Mikaël Salson's avatar
Mikaël Salson committed
718
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
719
    //$$ Clustering
Mikaël Salson's avatar
Mikaël Salson committed
720

Mathieu Giraud's avatar
Mathieu Giraud committed
721
    list <list <junction> > clones_windows;
Mathieu Giraud's avatar
Mathieu Giraud committed
722
    comp_matrix comp=comp_matrix(*windowsStorage);
Mikaël Salson's avatar
Mikaël Salson committed
723
724
725

    if (epsilon || forced_edges.size())
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
726
	cout << "Cluster similar windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
727
728
729
730
731
732
733

	if (load_comp==1) 
	  {
	    comp.load((out_dir+prefix_filename + comp_filename).c_str());
	  }
	else
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
734
	    comp.compare( cout, cluster_cost);
Mikaël Salson's avatar
Mikaël Salson committed
735
736
737
738
739
740
741
	  }
	
	if (save_comp==1)
	  {
	    comp.save(( out_dir+prefix_filename + comp_filename).c_str());
	  }
       
Mathieu Giraud's avatar
Mathieu Giraud committed
742
743
	clones_windows  = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
	comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
Mikaël Salson's avatar
Mikaël Salson committed
744
745
746
747
	comp.del();
      } 
    else
      {
Mathieu Giraud's avatar
Mathieu Giraud committed
748
	cout << "No clustering" << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
749
	clones_windows  = comp.nocluster() ;
Mikaël Salson's avatar
Mikaël Salson committed
750
751
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
752
    cout << "  ==> " << clones_windows.size() << " clones" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
753
 
754
755
756
757
758
759
760
761
    if (clones_windows.size() == 0)
      {
	cout << "  ! No clones with current parameters." << endl;
	cout << "  ! See the 'Limits to report a clone' options (-R, -%, -z, -A)." << endl;
      }
    else // clones_windows.size() > 0
      { 

Mathieu Giraud's avatar
Mathieu Giraud committed
762
    //$$ Sort clones, number of occurrences
Mikaël Salson's avatar
Mikaël Salson committed
763
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
764
    cout << "Sort clones by number of occurrences" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
765
766
767

    list<pair<list <junction>, int> >sort_clones;

Mathieu Giraud's avatar
Mathieu Giraud committed
768
    for (list <list <junction> >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it)
Mikaël Salson's avatar
Mikaël Salson committed
769
770
771
      {
        list <junction>clone = *it ;

Mikaël Salson's avatar
Mikaël Salson committed
772
773
774
	int clone_nb_reads=0;
	
        for (list <junction>::const_iterator it2 = clone.begin(); it2 != clone.end(); ++it2)
Mathieu Giraud's avatar
Mathieu Giraud committed
775
	  clone_nb_reads += windowsStorage->getNbReads(*it2);
Mikaël Salson's avatar
Mikaël Salson committed
776
	  
Mikaël Salson's avatar
Mikaël Salson committed
777
	bool labeled = false ;
Mathieu Giraud's avatar
Mathieu Giraud committed
778
	// Is there a labeled window ?
Mikaël Salson's avatar
Mikaël Salson committed
779
	for (list <junction>::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) {
Mathieu Giraud's avatar
Mathieu Giraud committed
780
	  if (windows_labels.find(*iit) != windows_labels.end())
Mikaël Salson's avatar
Mikaël Salson committed
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
	    {
	      labeled = true ;
	      break ;
	    }
	}

	  if (labeled 
	      || ((clone_nb_reads >= min_reads_clone) 
		  && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
          // Record the clone and its number of occurrence
          sort_clones.push_back(make_pair(clone, clone_nb_reads));
      }

    // Sort clones
    sort_clones.sort(pair_occurrence_sort<list<junction> >);

797
798
    cout << endl;

Mikaël Salson's avatar
Mikaël Salson committed
799
    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
800
    //$$ Output clones
801
802
803
804
805
806
    if (max_clones > 0)
      cout << "Output at most " << max_clones<< " clones" ;
    else
      cout << "Output all clones" ;

    cout << " with >= " << min_reads_clone << " reads and with a ratio >= " << ratio_reads_clone << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
807
808

    map <string, int> clones_codes ;
Mathieu Giraud's avatar
Mathieu Giraud committed
809
    map <string, string> clones_map_windows ;
Mikaël Salson's avatar
Mikaël Salson committed
810
811
812
813
814
815

    list <Sequence> representatives ;
    list <string> representatives_labels ;

    VirtualReadScore *scorer = new KmerAffectReadScore(*index);
    int num_clone = 0 ;
816
    int clones_without_representative = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
817
818
819

    ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
    int nb_edges = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
820
    cout << "  ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
Mikaël Salson's avatar
Mikaël Salson committed
821
822
        << endl ;

823
824
825
    string f_clones = out_dir + prefix_filename + CLONES_FILENAME ;
    cout << "  ==> " << f_clones << "   \t(main result file)" << endl ;
    ofstream out_clones(f_clones.c_str()) ;
Mikaël Salson's avatar
Mikaël Salson committed
826

827
828
    cout << "  ==> " << out_seqdir + prefix_filename + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ; 
    cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
829
830
831
832
833
834
835
836

    for (list <pair<list <junction>,int> >::const_iterator it = sort_clones.begin();
         it != sort_clones.end(); ++it) {
      list<junction> clone = it->first;
      int clone_nb_reads = it->second;

    
      ++num_clone ;
Mikaël Salson's avatar
Mikaël Salson committed
837

838
      if (max_clones && (num_clone == max_clones + 1))
Mikaël Salson's avatar
Mikaël Salson committed
839
840
	  break ;

Mikaël Salson's avatar
Mikaël Salson committed
841
      cout << "#### " ;
Mikaël Salson's avatar
Mikaël Salson committed
842
843
844
845

      string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
      string windows_file_name = out_seqdir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ;
      string sequences_file_name = out_seqdir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ;
Mikaël Salson's avatar
Mikaël Salson committed
846
847

      ofstream out_clone(clone_file_name.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
848
849
850
851
852
853
      ofstream out_windows(windows_file_name.c_str());
      ofstream out_sequences;

      if (output_sequences_by_cluster) {
        out_sequences.open(sequences_file_name.c_str());
      }
Mikaël Salson's avatar
Mikaël Salson committed
854
      
Mathieu Giraud's avatar
Mathieu Giraud committed
855
856
857
      cout << "Clone #" << right << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone ;
      cout << " – " << setfill(' ') << setw(WIDTH_NB_READS) << clone_nb_reads << " reads" ;
      cout << " – " << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%  "  ;
Mikaël Salson's avatar
Mikaël Salson committed
858

Mathieu Giraud's avatar
Mathieu Giraud committed
859
      cout << " – " << 100 * (float) clone_nb_reads * compute_normalization_one(norm_list, clone_nb_reads) / nb_segmented << "% " 
Mikaël Salson's avatar
Mikaël Salson committed
860
	  << compute_normalization_one(norm_list, clone_nb_reads) << " ";
Mathieu Giraud's avatar
Mathieu Giraud committed
861
      cout.flush();
Mikaël Salson's avatar
Mikaël Salson committed
862
863
864

      //////////////////////////////////

Mathieu Giraud's avatar
Mathieu Giraud committed
865
      //$$ Sort sequences by nb_reads
Mathieu Giraud's avatar
Mathieu Giraud committed
866
      list<pair<junction, int> >sort_windows;
Mikaël Salson's avatar
Mikaël Salson committed
867
868

      for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it) {
Mathieu Giraud's avatar
Mathieu Giraud committed
869
	int nb_reads = windowsStorage->getNbReads(*it);
Mathieu Giraud's avatar
Mathieu Giraud committed
870
        sort_windows.push_back(make_pair(*it, nb_reads));
Mikaël Salson's avatar
Mikaël Salson committed
871
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
872
      sort_windows.sort(pair_occurrence_sort<junction>);
Mikaël Salson's avatar
Mikaël Salson committed
873

Mathieu Giraud's avatar
Mathieu Giraud committed
874
      //$$ Output windows 
Mikaël Salson's avatar
Mikaël Salson committed
875
876

      int num_seq = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
877

Mathieu Giraud's avatar
Mathieu Giraud committed
878
879
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
880
881
	num_seq++ ;

Mathieu Giraud's avatar
Mathieu Giraud committed
882
883
        out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_windows << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
884
885
886

	if ((!detailed_cluster_analysis) && (num_seq == 1))
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
887
888
889
	    cout << "\t" << setw(WIDTH_NB_READS) << it->second << "\t";
	    cout << it->first ;
	    cout << "\t" << windows_labels[it->first] ;
Mikaël Salson's avatar
Mikaël Salson committed
890
891
892
893
894
	  }
      }

      if (!detailed_cluster_analysis)
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
895
	  cout << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
896
897
898
899
	  continue ;
	}

	
Mathieu Giraud's avatar
Mathieu Giraud committed
900
      //$$ First pass, choose one representative per cluster
Mikaël Salson's avatar
Mikaël Salson committed
901
902
903
904

      string best_V ;
      string best_D ;
      string best_J ;
Mathieu Giraud's avatar
Mathieu Giraud committed
905
      int more_windows = 0 ;
Mikaël Salson's avatar
Mikaël Salson committed
906
      
Mathieu Giraud's avatar
Mathieu Giraud committed
907
908
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
909

Mikaël Salson's avatar
Mikaël Salson committed
910
911
        out_clone << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
	out_clone << it->first << endl;
Mikaël Salson's avatar
Mikaël Salson committed
912

Mikaël Salson's avatar
Mikaël Salson committed
913
914
915
916
917

	// Display statistics on auditionned sequences
	if (verbose)
	{
	  int total_length = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
918
919
          list<Sequence> auditioned = windowsStorage->getSample(it->first, max_auditionned);
	  for (list<Sequence>::const_iterator it = auditioned.begin(); it != auditioned.end(); ++it) 
Mikaël Salson's avatar
Mikaël Salson committed
920
921
	    total_length += (*it).sequence.size() ;
	  
Mathieu Giraud's avatar
Mathieu Giraud committed
922
	  cout << auditioned.size() << " auditioned sequences, avg length " << total_length / auditioned.size() << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
923
924
	}

Mathieu Giraud's avatar
Mathieu Giraud committed
925
926
927
928
929
930
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);

931
932
933
934
935
936
937

        if (representative == NULL_SEQUENCE) {
	  clones_without_representative++ ;
	  if (verbose)
	    cout << "# (no representative sequence with current parameters)" ;

        } else {
Mathieu Giraud's avatar
Mathieu Giraud committed
938
	//$$ There is one representative, FineSegmenter
939
940


Mathieu Giraud's avatar
Mathieu Giraud committed
941
942
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
943
	  FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
944
	
Mikaël Salson's avatar
Mikaël Salson committed
945
946
	if (segment_D)
	  seg.FineSegmentD(rep_V, rep_D, rep_J);
Mikaël Salson's avatar
Mikaël Salson committed
947
	
Mathieu Giraud's avatar
Mathieu Giraud committed
948
949
950
951
        //cout << seg.toJson();
        json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
        
        if (seg.isSegmented())
Mikaël Salson's avatar
Mikaël Salson committed
952
953
954
955
956
957
	  {
	    
	    // As soon as one representative is segmented
	    
	    representatives.push_back(seg.getSequence());
            representatives_labels.push_back("#" + string_of_int(num_clone));
Mikaël Salson's avatar
Mikaël Salson committed
958

Mathieu Giraud's avatar
Mathieu Giraud committed
959
960
              // We need to find the window in the representative
              size_t window_pos = seg.getSequence().sequence.find(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
961

Mathieu Giraud's avatar
Mathieu Giraud committed
962
963
              // Default
              int ww = 2*w/3 ; // /2 ;
Mikaël Salson's avatar
Mikaël Salson committed
964

Mathieu Giraud's avatar
Mathieu Giraud committed
965
966
967
968
              if (window_pos != string::npos) {
                // for V.
                ww = seg.getLeft() - window_pos + seg.del_V;
              } 
Mikaël Salson's avatar
Mikaël Salson committed
969
970
            

Mathieu Giraud's avatar
Mathieu Giraud committed
971
              string end_V ="";
Mikaël Salson's avatar
Mikaël Salson committed
972
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
973
974
975
976
              // avoid case when V is not in the window
              if (seg.getLeft() > (int) window_pos)
                end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, 
                                                          ww - seg.del_V);
Mikaël Salson's avatar
Mikaël Salson committed
977

Mathieu Giraud's avatar
Mathieu Giraud committed
978
              string mid_D = "";
Mikaël Salson's avatar
Mikaël Salson committed
979
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
980
981
982
              if (segment_D)
                mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, 
                                                          rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right );
Mikaël Salson's avatar
Mikaël Salson committed
983
	   
Mathieu Giraud's avatar
Mathieu Giraud committed
984
985
986
987
              if (window_pos != string::npos) {
                // for J.
                ww = (window_pos + w - 1) - seg.getRight() + seg.del_J;
              }
Mikaël Salson's avatar
Mikaël Salson committed
988
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
989
              string start_J = "";
Mikaël Salson's avatar
Mikaël Salson committed
990
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
991
992
993
              // avoid case when J is not in the window
              if (seg.getRight() > (int) (window_pos + w - 1))
                start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww);
Mikaël Salson's avatar
Mikaël Salson committed
994
	      
Mathieu Giraud's avatar
Mathieu Giraud committed
995
996
997
              best_V = rep_V.label(seg.best_V) ;
              if (segment_D) best_D = rep_D.label(seg.best_D) ;
              best_J = rep_J.label(seg.best_J) ;
Mikaël Salson's avatar
Mikaël Salson committed
998
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
999
1000
              // TODO: pad aux dimensions exactes
              string pad_N = "NNNNNNNNNNNNNNNN" ;
Mikaël Salson's avatar
Mikaël Salson committed
1001

Mathieu Giraud's avatar
Mathieu Giraud committed
1002
              // Add V, (D) and J to windows to be aligned
Mikaël Salson's avatar
Mikaël Salson committed
1003
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1004
1005
1006
1007
1008
1009
1010
1011
1012
              out_windows << ">" << best_V << "-window" << endl ;
              out_windows << end_V << pad_N << endl ;
              more_windows++;

              if (segment_D) {
                out_windows << ">" << best_D << "-window" << endl ;
                out_windows << mid_D << endl ;   
                more_windows++ ;
              }
Mikaël Salson's avatar
Mikaël Salson committed
1013
	    
Mathieu Giraud's avatar
Mathieu Giraud committed
1014
1015
1016
              out_windows << ">" << best_J << "-window" << endl ;
              out_windows << pad_N << start_J <<  endl ;
              more_windows++;
Mikaël Salson's avatar
Mikaël Salson committed
1017

Mathieu Giraud's avatar
Mathieu Giraud committed
1018
1019
              string code = seg.code ;
              int cc = clones_codes[code];
Mikaël Salson's avatar
Mikaël Salson committed
1020

Mathieu Giraud's avatar
Mathieu Giraud committed
1021
1022
              if (cc)
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1023
                  cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
Mikaël Salson's avatar
Mikaël Salson committed
1024

Mathieu Giraud's avatar
Mathieu Giraud committed
1025
1026
1027
1028
1029
                  nb_edges++ ;
                  out_edges << clones_map_windows[code] + " " + it->first + " "  ;
                  out_edges << code << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc        << setfill(' ') << "  " ;
                  out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << "  " ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1030
                  out_edges << endl ;                }
Mathieu Giraud's avatar
Mathieu Giraud committed
1031
1032
1033
1034
1035
              else
                {
                  clones_codes[code] = num_clone ;
                  clones_map_windows[code] = it->first ;
                }
Mikaël Salson's avatar
Mikaël Salson committed
1036

Mathieu Giraud's avatar
Mathieu Giraud committed
1037
1038
1039
              out_clone << seg ;
              out_clone << endl ;
          }
Mikaël Salson's avatar
Mikaël Salson committed
1040

Mathieu Giraud's avatar
Mathieu Giraud committed
1041
1042
        if (seg.isSegmented() 
            || it == --(sort_windows.end())) {
Mathieu Giraud's avatar
Mathieu Giraud committed
1043
              // display window
Mikaël Salson's avatar
Mikaël Salson committed
1044
              cout << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
1045
1046
		   << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-window"  << " " << windows_labels[it->first] << endl
		   << it->first << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1047

1048
1049
1050
	      // display representative, possibly segmented...
	      // (TODO: factorize)
	      // ... on stdout
Mathieu Giraud's avatar
Mathieu Giraud committed
1051
1052
	      cout << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" << " " << seg.info << setfill(' ') << endl ;
	      cout << representative.sequence << endl;
1053
1054

	      // ... and in out_clones
1055
1056
1057
1058
	      out_clones << ">clone-"  << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << "-representative" 
			 << "-" << setfill('0') << setw(WIDTH_NB_READS) << clone_nb_reads 
			 << "-" << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%"
			 << " " << seg.info << setfill(' ') << endl ;
1059
1060
	      out_clones << representative.sequence << endl;

Mathieu Giraud's avatar
Mathieu Giraud committed
1061
              break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1062
        }
Mathieu Giraud's avatar
Mathieu Giraud committed
1063
        }
Mikaël Salson's avatar
Mikaël Salson committed
1064
1065
      }

Mathieu Giraud's avatar
Mathieu Giraud committed
1066
      cout << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
1067
      out_windows.close();
Mikaël Salson's avatar
Mikaël Salson committed
1068

Mikaël Salson's avatar
Mikaël Salson committed
1069
1070
1071
1072
1073
1074
      if (!very_detailed_cluster_analysis)
	{
	  continue ;
	}


1075
      //$$ Very detailed cluster analysis (with sequences) // NOT USED NOW
Mikaël Salson's avatar
Mikaël Salson committed
1076
1077
1078
1079
1080

      list<string> msa;
      bool good_msa = false ;

      // TODO: do something if no sequences have been segmented !
Mathieu Giraud's avatar
Mathieu Giraud committed
1081
      if (!more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
1082
	{
Mathieu Giraud's avatar
Mathieu Giraud committed
1083
	  cout << "!! No segmented sequence, deleting clone" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1084
1085
1086
	  // continue ;
	} else 
        {
Mathieu Giraud's avatar
Mathieu Giraud committed
1087
          msa = multiple_seq_align(windows_file_name);
Mikaël Salson's avatar
Mikaël Salson committed
1088
        
Mathieu Giraud's avatar
Mathieu Giraud committed
1089
          // Alignment of windows
Mikaël Salson's avatar
Mikaël Salson committed
1090
1091
1092
          
          if (!msa.empty())
            {
Mathieu Giraud's avatar
Mathieu Giraud committed
1093
              if (msa.size() == sort_windows.size() + more_windows)
Mikaël Salson's avatar
Mikaël Salson committed
1094
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1095
                  // cout << "clustalw parse: success" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1096
1097
1098
1099
                  good_msa = true ;
                }
              else
                {
Mathieu Giraud's avatar
Mathieu Giraud committed
1100
                  cout << "! clustalw parse: failed" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1101
1102
1103
1104
                }
            }
        }
      
Mathieu Giraud's avatar
Mathieu Giraud committed
1105
      //$$ Second pass: output clone, all representatives      
Mikaël Salson's avatar
Mikaël Salson committed
1106
1107
1108
1109
1110

      num_seq = 0 ;
      list <Sequence> representatives_this_clone ;
      string code_representative = "";

Mathieu Giraud's avatar
Mathieu Giraud committed
1111
1112
      for (list <pair<junction, int> >::const_iterator it = sort_windows.begin(); 
           it != sort_windows.end(); ++it) {
Mikaël Salson's avatar
Mikaël Salson committed
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132

	num_seq++ ;

	string junc ;
	
	if (!good_msa)
	  {
	    junc = it->first ;
	  }
	else
	  {
	    junc = msa.back();
	    msa.pop_back();
	  }


	// Output all sequences

	if (output_sequences_by_cluster)
	  {
Mathieu Giraud's avatar
Mathieu Giraud committed
1133
	    out_sequences << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
1134
1135
	    out_sequences << it->first << endl;

Mathieu Giraud's avatar
Mathieu Giraud committed
1136
	    list<Sequence> &sequences = windowsStorage->getReads(it->first);
Mikaël Salson's avatar
Mikaël Salson committed
1137
1138
1139
1140
1141
1142
1143
	    
	    for (list<Sequence>::const_iterator itt = sequences.begin(); itt != sequences.end(); ++itt)
	      {
		out_sequences << *itt ;
	      }
	  }

Mathieu Giraud's avatar
Mathieu Giraud committed
1144
1145
1146
1147
1148
        Sequence representative 
          = windowsStorage->getRepresentative(it->first, seed, 
                                             min_cover_representative,
                                             ratio_representative,
                                             max_auditionned);
Mikaël Salson's avatar
Mikaël Salson committed
1149

Mikaël Salson's avatar
Mikaël Salson committed
1150

Mathieu Giraud's avatar
Mathieu Giraud committed
1151
        if (representative != NULL_SEQUENCE) {
Mathieu Giraud's avatar
Mathieu Giraud committed
1152
1153
          representative.label = string_of_int(it->second) + "-" 
            + representative.label;
Mikaël Salson's avatar
Mikaël Salson committed
1154

Mathieu Giraud's avatar
Mathieu Giraud committed
1155
          FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
1156

Mathieu Giraud's avatar
Mathieu Giraud committed
1157