vidjil.cpp 45 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
4
5
6
7
8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
WebTogz's avatar
WebTogz committed
39
#include "core/json.h"
40
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
41
42
43
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
44
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
45
46
47
48
49
50
51
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
52
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
53
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
55
56
57
58
59
60
61

#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

62
63
64
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

65
#define VIDJIL_JSON_VERSION "2014.10"
Mikaël Salson's avatar
Mikaël Salson committed
66

Mathieu Giraud's avatar
Mathieu Giraud committed
67
68
//$$ #define (mainly default options)

69
#define DEFAULT_MULTIGERMLINE "germline"
Mikaël Salson's avatar
Mikaël Salson committed
70

Mathieu Giraud's avatar
Mathieu Giraud committed
71
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
72
#define DEFAULT_MIN_READS_CLONE 5
73
#define DEFAULT_MAX_REPRESENTATIVES 100
74
75
#define DEFAULT_MAX_CLONES 20
#define DEFAULT_RATIO_READS_CLONE 0.0
76
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
77

Mathieu Giraud's avatar
Mathieu Giraud committed
78
#define COMMAND_WINDOWS "windows"
79
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
80
#define COMMAND_SEGMENT "segment"
81
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
82
 
83
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
84

85
86
87
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
88
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
89
#define CLONE_FILENAME "clone.fa-"
90
91
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
92
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
93
#define AFFECTS_FILENAME ".affects"
94
95
96
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
97
98
99

// "tests/data/leukemia.fa" 

100
#define DEFAULT_K      0
Mikaël Salson's avatar
Mikaël Salson committed
101
#define DEFAULT_W      40
Mikaël Salson's avatar
Mikaël Salson committed
102
#define DEFAULT_W_D    60
103
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
104
105

#define DEFAULT_DELTA_MIN  -10
Mathieu Giraud's avatar
Mathieu Giraud committed
106
#define DEFAULT_DELTA_MAX   20
Mikaël Salson's avatar
Mikaël Salson committed
107
108

#define DEFAULT_DELTA_MIN_D  0
109
#define DEFAULT_DELTA_MAX_D  80
Mikaël Salson's avatar
Mikaël Salson committed
110

Mikaël Salson's avatar
Mikaël Salson committed
111
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mikaël Salson's avatar
Mikaël Salson committed
114
115
116
117
118
119
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

120
121
122
// error
#define ERROR_STRING "[error] "

123
// warn
124
#define WARN_MAX_CLONES 100
125
#define WARN_PERCENT_SEGMENTED 40
126

Mikaël Salson's avatar
Mikaël Salson committed
127
128
129
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
130

Mikaël Salson's avatar
Mikaël Salson committed
131
132
133

using namespace std ;

Mathieu Giraud's avatar
Mathieu Giraud committed
134
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
135

Mathieu Giraud's avatar
Mathieu Giraud committed
136
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
137
138
139

extern int optind, optopt, opterr;

140
void usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
141
{
142
  cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
143
144

  cerr << "Command selection" << endl
145
146
147
148
149
       << "  -c <command>"
       << "\t"     << COMMAND_CLONES    << "  \t locus detection, window extraction, clone gathering (default command, most efficient, all outputs)" << endl
       << "  \t\t" << COMMAND_WINDOWS   << "  \t locus detection, window extraction" << endl
       << "  \t\t" << COMMAND_SEGMENT   << "  \t detailed V(D)J segmentation (not recommended)" << endl
       << "  \t\t" << COMMAND_GERMLINES << "  \t statistics on k-mers in different germlines" << endl
Mikaël Salson's avatar
Mikaël Salson committed
150
151
       << endl       

152
       << "Germline databases (one -V/(-D)/-J, or -G, or -g option must be given for all commands except -c " << COMMAND_GERMLINES << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
153
       << "  -V <file>     V germline multi-fasta file" << endl
154
       << "  -D <file>     D germline multi-fasta file (and resets -m, -M and -w options), will segment into V(D)J components" << endl
Mikaël Salson's avatar
Mikaël Salson committed
155
       << "  -J <file>     J germline multi-fasta file" << endl
156
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
157
       << "  -g <path>     multiple germlines (in the path <path>, takes TRA, TRB, TRG, TRD, IGH, IGK and IGL and sets window prediction parameters)" << endl
158
       << "  -i            multiple germlines, also incomplete rearrangements (must be used with -g)" << endl
159
       << endl ;
160

161
162
  if (advanced)
  cerr << "Experimental options (do not use)" << endl
163
       << "  -I            ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)" << endl
164
       << "  -1            use a unique index for all germline systems (experimental, must be used with -g, do not use)" << endl
165
       << "  -!            keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example -!AX 20)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
166
167
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
168
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
169
#ifndef NO_SPACED_SEEDS
170
       << "  (use either -s or -k option, but not both)" << endl
171
       << "  (all these options, except -w, are overriden when using -g)" << endl
172
173
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
174
#endif
175
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
176
177
178
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
179
180
181
       << "  -m <int>      minimal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MIN << ") (default with -D: " << DEFAULT_DELTA_MIN_D << ")" << endl
       << "  -M <int>      maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ") (default with -D: " << DEFAULT_W_D << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
182
183
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
184
       << "Window annotations" << endl
185
       << "  -l <file>     labels for some windows -- these windows will be kept even if -r/-% thresholds are not reached" << endl
186
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
187

188
  cerr << "Limits to report a clone (or a window)" << endl
189
190
       << "  -r <nb>       minimal number of reads supporting a clone (default: " << DEFAULT_MIN_READS_CLONE << ")" << endl
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << DEFAULT_RATIO_READS_CLONE << ")" << endl
191
192
       << endl

193
       << "Limits to further analyze some clones" << endl
194
       << "  -y <nb>       maximal number of clones computed with a representative ('" << NO_LIMIT << "': no limit) (default: " << DEFAULT_MAX_REPRESENTATIVES << ")" << endl
195
       << "  -z <nb>       maximal number of clones to be segmented ('" << NO_LIMIT << "': no limit, do not use) (default: " << DEFAULT_MAX_CLONES << ")" << endl
196
197
       << "  -A            reports and segments all clones (-r 0 -% 0 -y " << NO_LIMIT << " -z " << NO_LIMIT << "), to be used only on very small datasets (for example -AX 20)" << endl
       << "  -X <nb>       maximal number of reads to process ('" << NO_LIMIT << "': no limit, default)" << endl
198
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
199

200
201
  if (advanced)
  cerr << "Fine segmentation options (second pass, see warning in doc/algo.org)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
202
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
203
       << "  -3            CDR3 detection (experimental)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
204
205
       << endl

206
       << "Additional clustering (experimental)" << endl
207
208
209
210
211
212
       << "  -e <file>     manual clustering -- a file used to force some specific edges" << endl
       << "  -n <int>      maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
213
       << endl ;
214

215
  cerr << "Detailed output per read (not recommended, large files)" << endl
216
217
       << "  -U            output segmented reads (in " << SEGMENTED_FILENAME << " file)" << endl
       << "  -u            output unsegmented reads (in " << UNSEGMENTED_FILENAME << " file)" << endl
218
       << "  -K            output detailed k-mer affectation on all reads (in " << AFFECTS_FILENAME << " file) (use only for debug, for example -KX 100)" << endl
219
220
       << endl
 
Mikaël Salson's avatar
Mikaël Salson committed
221
       << "Output" << endl
222
       << "  -o <dir>      output directory (default: " << DEFAULT_OUT_DIR << ")" <<  endl
223
       << "  -b <string>   output basename (by default basename of the input file)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
224
    
225
       << "  -a            output all sequences by cluster (" << CLONE_FILENAME << "*), to be used only on small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
226
227
228
       << "  -v            verbose mode" << endl
       << endl        

229
230
       << "  -h            help" << endl
       << "  -H            help, including experimental and advanced options" << endl
231
232
233
       << "The full help is available in the doc/algo.org file."
       << endl

Mikaël Salson's avatar
Mikaël Salson committed
234
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
235
       << "Examples (see doc/algo.org)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
236
       << "  " << progname << " -c clones   -G germline/IGH  -r 5          data/Stanford_S22.fasta" << endl
237
238
       << "  " << progname << " -c clones   -g germline      -r 5          data/Stanford_S22.fasta   # (detect the locus for each read)" << endl
       << "  " << progname << " -c windows  -g germline      -u -U         data/Stanford_S22.fasta   # (detect the locus, splits the reads into two (large) files)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
239
       << "  " << progname << " -c segment  -G germline/IGH                data/Stanford_S22.fasta   # (only for testing)" << endl
240
       << "  " << progname << " -c germlines                               data/Stanford_S22.fasta" << endl
Mikaël Salson's avatar
Mikaël Salson committed
241
242
243
244
    ;
  exit(1);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
245
246
247
248
249
250

int atoi_NO_LIMIT(char *optarg)
{
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : -1 ;
}

Mikaël Salson's avatar
Mikaël Salson committed
251
252
int main (int argc, char **argv)
{
253
  cout << "# Vidjil -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
254
255
       << "# Copyright (C) 2011, 2012, 2013, 2014, 2015 by the Vidjil team" << endl
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
256
257
258
259
260
       << endl
       << "# Vidjil is free software, and you are welcome to redistribute it" << endl
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
261
262
       << endl
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use Vidjil." 
Mikaël Salson's avatar
Mikaël Salson committed
263
264
       << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
265
266
  //$$ options: defaults

267
  string germline_system = "" ;
268
269
270
271
272
  
  list <string> f_reps_V ;
  list <string> f_reps_D ;
  list <string> f_reps_J ;

Mikaël Salson's avatar
Mikaël Salson committed
273
274
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
275
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
276

277
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
278
279
280
281
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
Mikaël Salson's avatar
Mikaël Salson committed
282
283
284
  int w = 0 ;
  int default_w = DEFAULT_W ;

Mikaël Salson's avatar
Mikaël Salson committed
285
286
287
288
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
289
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
290
291
292
293
294
  
  int save_comp = 0;
  int load_comp = 0;
  
  int verbose = 0 ;
295
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
296

297
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
298
299
300
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
301
302
  // int average_deletion = 4;     // Average number of deletion in V or J

303
  int max_reads_processed = -1;
304

Mathieu Giraud's avatar
Mathieu Giraud committed
305
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
306
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
307

Mikaël Salson's avatar
Mikaël Salson committed
308
309
310
311
312
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
  int delta_max = DEFAULT_DELTA_MAX ; // Fine

  bool output_sequences_by_cluster = false;
313
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
314
  bool output_unsegmented = false;
315
  bool output_affects = false;
316
317
  bool keep_unsegmented_as_clone = false;

318
  bool multi_germline = false;
319
  bool multi_germline_incomplete = false;
320
  bool multi_germline_mark = false;
321
  bool multi_germline_one_index_per_germline = true;
322
  string multi_germline_file = DEFAULT_MULTIGERMLINE;
Mikaël Salson's avatar
Mikaël Salson committed
323
324
325

  string forced_edges = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
326
  string windows_labels_file = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
327
328
329

  char c ;

330
331
  int options_s_k = 0 ;

WebTogz's avatar
WebTogz committed
332
333
334
  //JsonArray which contains the Levenshtein distances
  JsonArray jsonLevenshtein;

Mathieu Giraud's avatar
Mathieu Giraud committed
335
336
  //$$ options: getopt

337

338
  while ((c = getopt(argc, argv, "A!X:hHaiI1g:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uUK3")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
339
340
341
342

    switch (c)
      {
      case 'h':
343
344
345
346
        usage(argv[0], false);

      case 'H':
        usage(argv[0], true);
347

Mikaël Salson's avatar
Mikaël Salson committed
348
      case 'c':
349
350
        if (!strcmp(COMMAND_CLONES,optarg))
          command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
351
352
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
353
354
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
355
356
        else if (!strcmp(COMMAND_GERMLINES,optarg))
          command = CMD_GERMLINES;
Mikaël Salson's avatar
Mikaël Salson committed
357
358
        else {
          cerr << "Unknwown command " << optarg << endl;
359
	  usage(argv[0], false);
Mikaël Salson's avatar
Mikaël Salson committed
360
361
        }
        break;
362

Mikaël Salson's avatar
Mikaël Salson committed
363
364
365
366

      // Germline

      case 'V':
367
	f_reps_V.push_back(optarg);
368
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
369
370
371
	break;

      case 'D':
372
	f_reps_D.push_back(optarg);
373
374
375
	delta_min = DEFAULT_DELTA_MIN_D ;
	delta_max = DEFAULT_DELTA_MAX_D ;
	default_w = DEFAULT_W_D ;
Mikaël Salson's avatar
Mikaël Salson committed
376
377
378
	break;
        
      case 'J':
379
	f_reps_J.push_back(optarg);
380
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
381
382
	break;

383
      case 'g':
384
385
	multi_germline = true;
	multi_germline_file = string(optarg);
386
	germline_system = "multi" ;
387
	break;
388
389
390
391

      case 'i':
	multi_germline_incomplete = true;
	break;
392
393
394
395

      case 'I':
        multi_germline_mark = true;
	break;
396
397
398
399

      case '1':
        multi_germline_one_index_per_germline = false ;
        break;
400
        
Mikaël Salson's avatar
Mikaël Salson committed
401
      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
402
	germline_system = string(optarg);
403
	f_reps_V.push_back((germline_system + "V.fa").c_str()) ;
404
405
406
407
408
        // Takes D only if it exists
        {
          struct stat buffer; 
          string putative_f_rep_D = germline_system + "D.fa" ;
          if (stat(putative_f_rep_D.c_str(), &buffer) == 0)
409
410
411
412
413
414
            {
              f_reps_D.push_back(putative_f_rep_D.c_str()) ;
              delta_min = DEFAULT_DELTA_MIN_D ;
              delta_max = DEFAULT_DELTA_MAX_D ;
              default_w = DEFAULT_W_D ;
            }
415
        }
416
	f_reps_J.push_back((germline_system + "J.fa").c_str()) ;
417
	germline_system = extract_basename(germline_system);
418

Mikaël Salson's avatar
Mikaël Salson committed
419
420
421
422
	break;

      // Algorithm

423
424
425
426
427
428
429
430
431
432
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
	options_s_k++ ;
#else
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
        break;

Mikaël Salson's avatar
Mikaël Salson committed
433
434
435
      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
436
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
437
438
439
440
441
442
443
444
445
446
447
448
449
450
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

      case 'M':
	delta_max = atoi(optarg);
        break;

451
452
453
454
      case '!':
        keep_unsegmented_as_clone = true;
        break;

455
456
      // Output 

Mikaël Salson's avatar
Mikaël Salson committed
457
458
459
460
      case 'o':
        out_dir = optarg ;
        break;

461
462
      case 'b':
        f_basename = optarg;
Mikaël Salson's avatar
Mikaël Salson committed
463
464
        break;

465
466
467
468
469
470
471
472
      case 'a':
	output_sequences_by_cluster = true;
	break;

      case 'v':
	verbose += 1 ;
	break;

473
474
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
475
476
477
478
      case '%':
	ratio_reads_clone = atof(optarg);
	break;

479
      case 'r':
Mikaël Salson's avatar
Mikaël Salson committed
480
481
482
	min_reads_clone = atoi(optarg);
        break;

483
      case 'y':
Mathieu Giraud's avatar
Mathieu Giraud committed
484
	max_representatives = atoi_NO_LIMIT(optarg);
485
486
        break;

Mikaël Salson's avatar
Mikaël Salson committed
487
      case 'z':
Mathieu Giraud's avatar
Mathieu Giraud committed
488
	max_clones = atoi_NO_LIMIT(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
489
        break;
490
491
492

      case 'A': // --all
	ratio_reads_clone = 0 ;
493
494
495
	min_reads_clone = 1 ;
	max_representatives = -1 ;
	max_clones = -1 ;
496
497
	break ;

498
      case 'X':
Mathieu Giraud's avatar
Mathieu Giraud committed
499
        max_reads_processed = atoi_NO_LIMIT(optarg);
500
        break;
501

502
503
504
505
      case 'l':
	windows_labels_file = optarg; 
	break;

506
      // Clustering
507
508
509
510

      case 'e':
	forced_edges = optarg;
	break;
Mikaël Salson's avatar
Mikaël Salson committed
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
532
      // Fine segmentation
533
534
535
536
      case '3':
        detect_CDR3 = true;
        break;
        
537
      case 'f':
Mikaël Salson's avatar
Mikaël Salson committed
538
539
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
540
541
542
543

      case 'u':
        output_unsegmented = true;
        break;
544
545
546
      case 'U':
        output_segmented = true;
        break;
547
548
549
      case 'K':
        output_affects = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
550
551
      }

552
553
554

  //$$ options: post-processing+display

555
556
557

  if (!germline_system.size() && (command != CMD_GERMLINES))
    {
558
      cerr << ERROR_STRING << "At least one germline must be given with -V/(-D)/-J, or -G, or -g." << endl ;
559
560
561
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
562
563
564
565
566
  // If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
  if (w == 0)
    w = default_w ;

  
567
568
  if (options_s_k > 1)
    {
569
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
570
571
572
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
573
574
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
575
576
577
578
579
580
581
582
583
584
585
586
587
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
588
      cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
589
590
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
591

592
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
593

594
595
596
597
598
599
  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
600
	seed = SEED_S13 ;
601
602
603

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
604
	seed = SEED_S12 ;
605
      else // TRD, TRG, IGK, IGL, custom, multi
606
	seed = SEED_S10 ;
607
608
609
610
611

      k = seed_weight(seed);
    }
#else
  {
612
    cerr << ERROR_STRING << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
613
614
615
616
617
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
618
619
620
621
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
622
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
623
624
625
626
      exit(1);
    }
#endif

627
628
629

  if (w < seed_weight(seed))
    {
630
      cerr << ERROR_STRING << "Too small -w. The window size should be at least equal to the seed size (" << seed_weight(seed) << ")." << endl;
631
632
633
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
634
635
636
637
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
638
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
639
640
641
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
642
643
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
644
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
645
646
647
    exit(2);
  }

648
649
650
651
652
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
653
654
655
  out_dir += "/" ;

  /// Load labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
656
  map <string, string> windows_labels = load_map(windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
657
658

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
659
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
660
    break;
661
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
662
663
664
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
665
666
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
667
668
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
669
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
670
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
671
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
672
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
673
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
674
675
676
677
678
679
680
681
682
683
684
685

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
686
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
687
688
689
690
691


  //////////////////////////////////
  // Display version information or git log

692
  string soft_version = "vidjil ";
Mikaël Salson's avatar
Mikaël Salson committed
693
#ifdef RELEASE_TAG
Mathieu Giraud's avatar
Mathieu Giraud committed
694
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
695
  soft_version.append(RELEASE_TAG);
Mikaël Salson's avatar
Mikaël Salson committed
696
#else
697
698
699
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
700
  soft_version.append("dev ");
701
  soft_version.append(GIT_VERSION);
702
#endif
703
#endif
704

705
706
  //////////////////////////////////
  // Warning for non-optimal use
707

708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-A' option or with a large '-z' option" << endl ;
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-c segment' option" << endl ;
    }
  
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
    {
      cout << "* Vidjil purpose is to extract very quickly windows overlapping the CDR3" << endl
	   << "* and to gather reads into clones (-c clones), and not to provide an accurate V(D)J segmentation." << endl
	   << "* The following segmentations are slow to compute and are provided only for convenience." << endl
	   << "* They should be checked with other softwares such as IgBlast, iHHMune-align or IMGT/V-QUEST." << endl 
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
729

730
731
732
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
733

734
735
736
737
738
739
740
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
      multi_germline_one_index_per_germline = false ;
    }

  MultiGermline *multigermline = new MultiGermline(multi_germline_one_index_per_germline);
741

742
    {
743
      cout << "Load germlines and build Kmer indexes" << endl ;
744
745
746
747
    
      if (multi_germline)
	{
	  multigermline->build_default_set(multi_germline_file);
748
749
          if (multi_germline_incomplete)
            multigermline->build_incomplete_set(multi_germline_file);
750
751
752
753
754
755
	}
      else
	{
	  // Custom germline
	  Germline *germline;
	  germline = new Germline(germline_system, 'X',
756
757
758
                                  f_reps_V, f_reps_D, f_reps_J, 
                                  delta_min, delta_max);

759
          germline->new_index(seed);
760

761
762
763
	  multigermline->insert(germline);
	}
    }
764

765
    cout << endl ;
766

767
768
769
    if (!multi_germline_one_index_per_germline)
      multigermline->build_with_one_index(seed);

770
771
772
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
    
773
774
775
776
    cout << "Germlines loaded" << endl ;
    cout << *multigermline ;
    cout << endl ;
    
777
778
779
780
781
782
783
  //////////////////////////////////
  //$$ Read sequence files
 
  OnlineFasta *reads;

  try {
    reads = new OnlineFasta(f_reads, 1, " ");
784
  } catch (const invalid_argument e) {
785
    cerr << ERROR_STRING << "Vidjil cannot open reads file " << f_reads << ": " << e.what() << endl;
786
787
788
789
790
791
    exit(1);
  }

  out_dir += "/";


792
793
794
795
796
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
797
      map <char, int> stats_kmer, stats_max;
798
      IKmerStore<KmerAffect> *index = multigermline->index ;
799
800

      // Initialize statistics, with two additional categories
801
802
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), AFFECT_AMBIGUOUS_SYMBOL));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), AFFECT_UNKNOWN_SYMBOL));
803
804
      
      for (list< pair <KmerAffect, string> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
805
	{
806
807
808
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
809
810
	}
      
811
      // init forbidden for .max()
812
813
814
      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());
815
      
816
817
818
      // Loop through all reads

      int nb_reads = 0 ;
819
820
821
      int total_length = 0 ;
      int s = index->getS();

822
823
824
825
826
      while (reads->hasNext())
	{
	  reads->next();
	  nb_reads++;
	  string seq = reads->getSequence().sequence;
827
	  total_length += seq.length() - s + 1;
828

829
	  KmerAffectAnalyser *kaa = new KmerAffectAnalyser(*index, seq);
830
831
832

	  for (int i = 0; i < kaa->count(); i++) 
	    { 
833
834
	      KmerAffect ksa = kaa->getAffectation(i);
	      stats_kmer[affect_char(ksa.affect)]++ ;
835
	    }
836

837
          delete kaa;
838

839
	  CountKmerAffectAnalyser ckaa(*index, seq);
840
841
	  ckaa.setAllowedOverlap(k-1);

842
	  stats_max[affect_char(ckaa.max(forbidden).affect)]++ ;
843

844
845
	}

846
847
      delete reads;

848
849
      // Display statistics

850
851
852
853
      cout << "  <== "
	   << nb_reads << " reads, "
	   << total_length << " kmers"
	   << endl ;
854
      cout << "\t" << " max" << "\t\t" << "        kmers" << "\n" ;
855
856

      for (list< pair <KmerAffect, string> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
857
	{
858
859
860
861
	  char key = affect_char(it->first.affect) ;
	  
	  cout << setw(12) << stats_max[key] << " " ;
	  cout << setw(6) << fixed << setprecision(2) <<  (float) stats_max[key] / nb_reads * 100 << "%" ;
862

863
	  cout << "     " ;
864

865
866
	  cout << setw(12) << stats_kmer[key] << " " ;
	  cout << setw(6) << fixed << setprecision(2) <<  (float) stats_kmer[key] / total_length * 100 << "%" ;
867

868
	  cout << "     " << key << " " << it->second << endl ;
869
	}
870
871
      
      delete index;
872
873
874
875
      exit(0);
    }


Mikaël Salson's avatar
Mikaël Salson committed
876
877
878
  ////////////////////////////////////////
  //           CLONE ANALYSIS           //
  ////////////////////////////////////////
879
  if (command == CMD_CLONES || command == CMD_WINDOWS) {
Mikaël Salson's avatar
Mikaël Salson committed
880
881

    //////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
882
    //$$ Kmer Segmentation
Mikaël Salson's avatar
Mikaël Salson committed
883

Mathieu Giraud's avatar
Mathieu Giraud committed
884
885
    cout << endl;
    cout << "Loop through reads, looking for windows" << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
886

887
    ofstream *out_segmented = NULL;
Mathieu Giraud's avatar
Mathieu Giraud committed
888
    ofstream *out_unsegmented = NULL;
889
    ofstream *out_affects = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
890
 
Mathieu Giraud's avatar
Mathieu Giraud committed
891
    WindowExtractor we;
892
893
 
    if (output_segmented) {
894
      string f_segmented = out_dir + f_basename + SEGMENTED_FILENAME ;
895
896
897
898
899
      cout << "  ==> " << f_segmented << endl ;
      out_segmented = new ofstream(f_segmented.c_str());
      we.setSegmentedOutput(out_segmented);
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
900
    if (output_unsegmented) {
901
      string f_unsegmented = out_dir + f_basename + UNSEGMENTED_FILENAME ;
Mathieu Giraud's avatar
Mathieu Giraud committed
902
903
904
905
      cout << "  ==> " << f_unsegmented << endl ;
      out_unsegmented = new ofstream(f_unsegmented.c_str());
      we.setUnsegmentedOutput(out_unsegmented);
    }
906

907
908
909
910
911
912
913
    if (output_affects) {
      string f_affects = out_dir + f_basename + AFFECTS_FILENAME ;
      cout << "  ==> " << f_affects << endl ;
      out_affects = new ofstream(f_affects.c_str());
      we.setAffectsOutput(out_affects);
    }

914
    WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels, max_reads_processed, keep_unsegmented_as_clone);
WebTogz's avatar
WebTogz committed
915
    windowsStorage->setIdToAll();
Mathieu Giraud's avatar
Mathieu Giraud committed
916
    size_t nb_total_reads = we.getNbReads();
Mikaël Salson's avatar
Mikaël Salson committed
917
918


Mathieu Giraud's avatar
Mathieu Giraud committed
919
    //$$ Display statistics on segmentation causes
Mikaël Salson's avatar
Mikaël Salson committed
920

921
922
        
    ostringstream stream_segmentation_info;
Mikaël Salson's avatar
Mikaël Salson committed
923

Mathieu Giraud's avatar
Mathieu Giraud committed
924
925
    int nb_segmented_including_too_short = we.getNbSegmented(TOTAL_SEG_AND_WINDOW) 
      + we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW);
Mikaël Salson's avatar
Mikaël Salson committed
926

927
    stream_segmentation_info << "  ==> segmented " << nb_segmented_including_too_short << " reads"
Mikaël Salson's avatar
Mikaël Salson committed
928
	<< " (" << setprecision(3) << 100 * (float) nb_segmented_including_too_short / nb_total_reads << "%)" 
Mathieu Giraud's avatar
Mathieu Giraud committed
929
930
	<< endl ;

931
    // nb_segmented is the main denominator for the following
Mathieu Giraud's avatar
Mathieu Giraud committed
932
    int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
933
    float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
Mikaël Salson's avatar
Mikaël Salson committed
934

935
    stream_segmentation_info << "  ==> found " << windowsStorage->size() << " " << w << "-windows"
Mikaël Salson's avatar
Mikaël Salson committed
936
	<< " in " << nb_segmented << " segments"
937
	<< " (" << setprecision(3) << ratio_segmented << "%)"
Mikaël Salson's avatar
Mikaël Salson committed
938
939
	<< " inside " << nb_total_reads << " sequences" << endl ;
  
940
    // warn if there are too few segmented sequences
941
    if (ratio_segmented < WARN_PERCENT_SEGMENTED)
942
      {
943
        stream_segmentation_info << "  ! There are not so many CDR3 windows found in this set of reads." << endl ;
944
        stream_segmentation_info << "  ! Please check the unsegmentation causes below and refer to the documentation." << endl ;
945
946
      }

947
948
949
    cout << "Build clone stats" << endl;
    windowsStorage->fillStatsClones();
    
950
    multigermline->out_stats(stream_segmentation_info);
951
952
    stream_segmentation_info << endl;
    we.out_stats(stream_segmentation_info);
Mikaël Salson's avatar
Mikaël Salson committed
953
    
954
    cout << stream_segmentation_info.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
955
      map <junction, JsonList> json_data_segment ;
Mikaël Salson's avatar
Mikaël Salson committed
956
    
Mikaël Salson's avatar
Mikaël Salson committed
957
958

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
959
	//$$ Sort windows
Mikaël Salson's avatar
Mikaël Salson committed
960
	
Mathieu Giraud's avatar
Mathieu Giraud committed
961
962
        cout << "Sort windows by number of occurrences" << endl;
        windowsStorage->sort();
Mikaël Salson's avatar
Mikaël Salson committed
963
964

	//////////////////////////////////
Mathieu Giraud's avatar
Mathieu Giraud committed
965
	//$$ Output windows
Mikaël Salson's avatar
Mikaël Salson committed
966
967
	//////////////////////////////////

968
	string f_all_windows = out_dir + f_basename + WINDOWS_FILENAME;
Mathieu Giraud's avatar
Mathieu Giraud committed
969
	cout << "  ==> " << f_all_windows << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
970

Mathieu Giraud's avatar
Mathieu Giraud committed
971
	ofstream out_all_windows(f_all_windows.c_str());
Mathieu Giraud's avatar
Mathieu Giraud committed
972
        windowsStorage->printSortedWindows(out_all_windows);
Mikaël Salson's avatar
Mikaël Salson committed
973
974
975


    //////////////////////////////////
976
977
978
979
980
981
982
    //$$ min_reads_clone (ou label)

    int min_reads_clone_ratio = (int) (ratio_reads_clone * nb_segmented / 100.0);
    cout << "Considering only labeled windows and windows with >= " << min_reads_clone << " reads"
	 << " and w