vidjil.cpp 55 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
3
  Copyright (C) 2011-2017 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
WebTogz's avatar
WebTogz committed
39
#include "core/json.h"
40
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
41
42
#include "core/kmerstore.h"
#include "core/fasta.h"
43
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
44
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
45
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
46
47
48
49
50
51
52
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
53
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
54
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
55

56
57
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
58
59
60
61
62
63
64
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

65
66
67
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

68
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
69

Mathieu Giraud's avatar
Mathieu Giraud committed
70
71
//$$ #define (mainly default options)

72
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
73
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
74

75
#define DEFAULT_READ_HEADER_SEPARATOR " "
Mathieu Giraud's avatar
Mathieu Giraud committed
76
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
77
#define DEFAULT_MIN_READS_CLONE 5
78
#define DEFAULT_MAX_REPRESENTATIVES 100
79
#define DEFAULT_MAX_CLONES 100
80
#define DEFAULT_RATIO_READS_CLONE 0.0
81
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
82

Mathieu Giraud's avatar
Mathieu Giraud committed
83
#define COMMAND_WINDOWS "windows"
84
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
85
#define COMMAND_SEGMENT "segment"
86
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
87
 
88
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
89

90
91
92
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
93
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
94
#define CLONE_FILENAME "clone.fa-"
95
96
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
97
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
98
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
99
#define AFFECTS_FILENAME ".affects"
100
101
102
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
103
104
105

// "tests/data/leukemia.fa" 

106
#define DEFAULT_K      0
107
#define DEFAULT_W      50
108
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
109
110
111
112
113

#define DEFAULT_DELTA_MIN  -10

#define DEFAULT_DELTA_MIN_D  0

Mikaël Salson's avatar
Mikaël Salson committed
114
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
115
116
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mikaël Salson's avatar
Mikaël Salson committed
117
118
119
120
121
122
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

123
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
124

125
#define MAX_CLONES_FOR_SIMILARITY 20
126

127
// warn
128
#define WARN_MAX_CLONES 100
129
#define WARN_PERCENT_SEGMENTED 40
130

Mikaël Salson's avatar
Mikaël Salson committed
131
132
133
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
134

Mikaël Salson's avatar
Mikaël Salson committed
135
136

using namespace std ;
137
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
138

Mathieu Giraud's avatar
Mathieu Giraud committed
139
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
140

Mathieu Giraud's avatar
Mathieu Giraud committed
141
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
142
143
144

extern int optind, optopt, opterr;

145
void usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
146
{
147
  cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
148
149

  cerr << "Command selection" << endl
150
       << "  -c <command>"
151
       << "\t"     << COMMAND_CLONES    << "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)" << endl
152
       << "  \t\t" << COMMAND_WINDOWS   << "  \t locus detection, window extraction" << endl
153
       << "  \t\t" << COMMAND_SEGMENT   << "  \t detailed V(D)J designation (not recommended)" << endl
154
       << "  \t\t" << COMMAND_GERMLINES << "  \t statistics on k-mers in different germlines" << endl
155
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
156

157
158
159
160
161
  if (advanced)
  cerr << "Input" << endl
       << "  -# <string>   separator for headers in the reads file (default: '" << DEFAULT_READ_HEADER_SEPARATOR << "')" << endl
       << endl ;

162
  cerr << "Germline presets (at least one -g or -V/(-D)/-J option must be given for all commands except -c " << COMMAND_GERMLINES << ")" << endl
163
164
       << "  -g <.g file>(:filter)" << endl
       << "                multiple locus/germlines, with tuned parameters." << endl
165
       << "                Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'" << endl
166
167
168
169
170
171
       << "                The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'" << endl
       << "  -g <path>     multiple locus/germlines, shortcut for '-g <path>/" << DEFAULT_MULTI_GERMLINE_FILE << "'" << endl
       << "                processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations" << endl
       << "  -V <file>     custom V germline multi-fasta file" << endl
       << "  -D <file>     custom D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components" << endl
       << "  -J <file>     custom J germline multi-fasta file" << endl
172
173
174
       << endl

       << "Locus/recombinations" << endl
175
       << "  -d            try to detect several D (experimental)" << endl
176
       << "  -2            try to detect unexpected recombinations (must be used with -g)" << endl
177
       << endl ;
178

179
180
  if (advanced)
  cerr << "Experimental options (do not use)" << endl
181
       << "  -I            ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)" << endl
182
       << "  -1            use a unique index for all germline systems (experimental, must be used with -g, do not use)" << endl
183
       << "  -4            try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)" << endl
184
       << "  -!            keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example -!AX 20)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
185
186
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
187
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
188
#ifndef NO_SPACED_SEEDS
189
       << "  (use either -s or -k option, but not both)" << endl
190
       << "  (all these options, except -w, are overriden when using -g)" << endl
191
192
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
193
#endif
194
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
195
196
197
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
198
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ") ('" << NO_LIMIT << "': use all the read, no window clustering)" << endl
199
       << "  -e <float>    maximal e-value for determining if a V-J segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl
200
       << "  -t <int>      trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: " << DEFAULT_TRIM << ") (0: no trim)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
201
202
       << endl

203
204
205
206
       << "Labeled sequences (windows related to these sequences will be kept even if -r/-% thresholds are not reached)" << endl
       << "  -W <sequence> label the given sequence" << endl
       << "  -l <file>     label a set of sequences given in <file>" << endl
       << "  -F            filter -- keep only the windows related to the labeled sequences" << endl
207
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
208

209
  cerr << "Limits to report a clone (or a window)" << endl
210
211
       << "  -r <nb>       minimal number of reads supporting a clone (default: " << DEFAULT_MIN_READS_CLONE << ")" << endl
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << DEFAULT_RATIO_READS_CLONE << ")" << endl
212
213
       << endl

214
       << "Limits to further analyze some clones" << endl
215
       << "  -y <nb>       maximal number of clones computed with a consensus sequence ('" << NO_LIMIT << "': no limit) (default: " << DEFAULT_MAX_REPRESENTATIVES << ")" << endl
216
       << "  -z <nb>       maximal number of clones to be analyzed with a full V(D)J designation ('" << NO_LIMIT << "': no limit, do not use) (default: " << DEFAULT_MAX_CLONES << ")" << endl
217
       << "  -A            reports and segments all clones (-r 0 -% 0 -y " << NO_LIMIT << " -z " << NO_LIMIT << "), to be used only on very small datasets (for example -AX 20)" << endl
218
219
       << "  -x <nb>       maximal number of reads to process ('" << NO_LIMIT << "': no limit, default), only first reads" << endl
       << "  -X <nb>       maximal number of reads to process ('" << NO_LIMIT << "': no limit, default), sampled reads" << endl
220
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
221

222
  if (advanced)
223
  cerr << "Fine segmentation options (second pass)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
224
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
225
       << "  -m <int>      minimal admissible delta between the end of the V and the start of the J (default: " << DEFAULT_DELTA_MIN << ") (default with -D: " << DEFAULT_DELTA_MIN_D << ")" << endl
226
       << "  -E <float>    maximal e-value for determining if a D segment can be trusted (default: " << THRESHOLD_NB_EXPECTED_D << ")" << endl
227
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
228

229
230
231
232
233
234
  cerr << "Clone analysis (second pass)" << endl
       << "  -3            CDR3/JUNCTION detection (requires gapped V/J germlines)" << endl
       << endl ;

  if (advanced)
  cerr << "Additional clustering (experimental)" << endl
235
       << "  -= <file>     manual clustering -- a file used to force some specific edges" << endl
236
237
238
239
240
       << "  -n <int>      maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
241
       << endl ;
242

Mathieu Giraud's avatar
Mathieu Giraud committed
243
  cerr << "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)" << endl
244
245
       << "  -U            output segmented reads (in " << SEGMENTED_FILENAME << " file)" << endl
       << "  -u            output unsegmented reads (in " << UNSEGMENTED_FILENAME << " file)" << endl
246
247
       << "  -uu           output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files)" << endl
       << "  -uuu          output unsegmented reads, gathered by unsegmentation cause, all reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files) (use only for debug)" << endl
248
       << "  -K            output detailed k-mer affectation on all reads (in " << AFFECTS_FILENAME << " file) (use only for debug, for example -KX 100)" << endl
249
250
       << endl
 
Mikaël Salson's avatar
Mikaël Salson committed
251
       << "Output" << endl
252
       << "  -o <dir>      output directory (default: " << DEFAULT_OUT_DIR << ")" <<  endl
253
       << "  -b <string>   output basename (by default basename of the input file)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
254
    
255
       << "  -a            output all sequences by cluster (" << CLONE_FILENAME << "*), to be used only on small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
256
257
258
       << "  -v            verbose mode" << endl
       << endl        

259
260
       << "  -h            help" << endl
       << "  -H            help, including experimental and advanced options" << endl
261
262
263
       << "The full help is available in the doc/algo.org file."
       << endl

Mikaël Salson's avatar
Mikaël Salson committed
264
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
265
       << "Examples (see doc/algo.org)" << endl
266
267
268
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g     -2 -3     data/Stanford_S22.fasta   # (basic usage, detect the locus for each read," << endl
       << "                                                                                          #  including unexpected recombinations, analyzes CDR3)" << endl
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g:IGH    -3     data/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
269
270
       << "  " << progname << " -c windows  -g germline/homo-sapiens.g     -2 -uu -U data/Stanford_S22.fasta   # (detect the locus, splits all the reads into large files)" << endl
       << "  " << progname << " -c segment  -g germline/homo-sapiens.g     -2 -3     data/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing)" << endl
271
       << "  " << progname << " -c germlines -g germline/homo-sapiens.g              data/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
272
273
274
275
    ;
  exit(1);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
276
277
278

int atoi_NO_LIMIT(char *optarg)
{
279
280
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
281
double atof_NO_LIMIT(char *optarg)
282
283
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
284
285
}

Mikaël Salson's avatar
Mikaël Salson committed
286
287
int main (int argc, char **argv)
{
288
  cout << "# Vidjil -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
289
       << "# Copyright (C) 2011-2017 by the Vidjil team" << endl
290
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
291
292
293
294
295
       << endl
       << "# Vidjil is free software, and you are welcome to redistribute it" << endl
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
296
       << endl
297
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use Vidjil." << endl
Mikaël Salson's avatar
Mikaël Salson committed
298
299
       << endl ;

300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
  //////////////////////////////////
  // Display version information or git log

  string soft_version = "vidjil ";
#ifdef RELEASE_TAG
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
316
317
  //$$ options: defaults

318
  string germline_system = "" ;
319
320
321
322
  
  list <string> f_reps_V ;
  list <string> f_reps_D ;
  list <string> f_reps_J ;
323
  list <pair <string, string>> multi_germline_paths_and_files ;
324

325
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
326
327
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
328
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
329

330
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
331
332
333
  
  string comp_filename = COMP_FILENAME;

334
335
  int kmer_size = DEFAULT_K ;
  int wmer_size = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
336

337
  IndexTypes indexType = KMER_INDEX;
338

Mikaël Salson's avatar
Mikaël Salson committed
339
340
341
342
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
343
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
344
345
346
347
348
  
  int save_comp = 0;
  int load_comp = 0;
  
  int verbose = 0 ;
349
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
350

351
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
352
353
354
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
355
356
  // int average_deletion = 4;     // Average number of deletion in V or J

357
358
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
359

Mathieu Giraud's avatar
Mathieu Giraud committed
360
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
361
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
362

Mikaël Salson's avatar
Mikaël Salson committed
363
364
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
Mikaël Salson's avatar
Mikaël Salson committed
365
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
366
367

  bool output_sequences_by_cluster = false;
368
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
369
  bool output_unsegmented = false;
370
  bool output_unsegmented_detail = false;
371
  bool output_unsegmented_detail_full = false;
372
  bool output_affects = false;
373
374
  bool keep_unsegmented_as_clone = false;

375
376
  bool several_D = false;

377
  bool multi_germline = false;
378
  bool multi_germline_mark = false;
379
  bool multi_germline_one_index_per_germline = true;
380
381
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
382

Mikaël Salson's avatar
Mikaël Salson committed
383
384
  string forced_edges = "" ;

385
  map <string, string> windows_labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
386
  string windows_labels_file = "" ;
387
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
388
389
390

  char c ;

391
392
  int options_s_k = 0 ;

393
  double expected_value = THRESHOLD_NB_EXPECTED;
394
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
395

396
397
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
398
  bool jsonLevenshteinComputed = false ;
WebTogz's avatar
WebTogz committed
399

Mathieu Giraud's avatar
Mathieu Giraud committed
400
401
  //$$ options: getopt

402

403
  while ((c = getopt(argc, argv, "A!x:X:hHadI124g:V:D:J:k:r:vw:e:E:C:f:W:l:Fc:m:N:s:b:Sn:o:L%:y:z:uUK3E:t:#:q")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
404
405
406
407

    switch (c)
      {
      case 'h':
408
409
410
411
        usage(argv[0], false);

      case 'H':
        usage(argv[0], true);
412

Mikaël Salson's avatar
Mikaël Salson committed
413
      case 'c':
414
415
        if (!strcmp(COMMAND_CLONES,optarg))
          command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
416
417
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
418
419
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
420
421
        else if (!strcmp(COMMAND_GERMLINES,optarg))
          command = CMD_GERMLINES;
Mikaël Salson's avatar
Mikaël Salson committed
422
423
        else {
          cerr << "Unknwown command " << optarg << endl;
424
	  usage(argv[0], false);
Mikaël Salson's avatar
Mikaël Salson committed
425
426
        }
        break;
427

428
429
430
431
      case 'q':
        indexType = AC_AUTOMATON;
        break;

432
433
434
435
      // Input
      case '#':
        read_header_separator = string(optarg);
        break;
Mikaël Salson's avatar
Mikaël Salson committed
436
437
438
439

      // Germline

      case 'V':
440
	f_reps_V.push_back(optarg);
441
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
442
443
444
	break;

      case 'D':
445
	f_reps_D.push_back(optarg);
446
	delta_min = DEFAULT_DELTA_MIN_D ;
Mikaël Salson's avatar
Mikaël Salson committed
447
448
449
	break;
        
      case 'J':
450
	f_reps_J.push_back(optarg);
451
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
452
453
	break;

454
      case 'g':
455
	multi_germline = true;
456
        germline_system = "multi" ;
457
458
459
460
461
462
463
464
        {
          string arg = string(optarg);
          struct stat buffer;
          if (stat(arg.c_str(), &buffer) == 0)
            {
              if( buffer.st_mode & S_IFDIR )
                {
                  // argument is a directory
465
                  multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
466
                  break ;
467
468
                }
            }
469
470
471
472

          // argument is not a directory (and basename can include ':' with a filter)
          multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
          break ;
473
        }
474

475
476
477
478
      case 'd':
        several_D = true;
        break;

479
480
481
      case 'I':
        multi_germline_mark = true;
	break;
482
483
484
485

      case '1':
        multi_germline_one_index_per_germline = false ;
        break;
486
487

      case '2':
488
489
490
491
492
        multi_germline_unexpected_recombinations_12 = true ;
        break;

      case '4':
        multi_germline_unexpected_recombinations_1U = true ;
493
494
        break;

Mikaël Salson's avatar
Mikaël Salson committed
495
496
497
498
	break;

      // Algorithm

499
500
501
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
502
	kmer_size = seed_weight(seed);
503
504
505
506
507
508
	options_s_k++ ;
#else
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
        break;

Mikaël Salson's avatar
Mikaël Salson committed
509
      case 'k':
510
511
	kmer_size = atoi(optarg);
	seed = seed_contiguous(kmer_size);
512
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
513
514
515
        break;

      case 'w':
Mathieu Giraud's avatar
Mathieu Giraud committed
516
	wmer_size = atoi_NO_LIMIT(optarg);
Mikaël Salson's avatar
Mikaël Salson committed
517
518
519
520
521
522
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

523
524
525
526
      case '!':
        keep_unsegmented_as_clone = true;
        break;

527
      case 'e':
528
        expected_value = atof_NO_LIMIT(optarg);
529
530
        break;

531
532
533
534
      case 'E':
        expected_value_D = atof_NO_LIMIT(optarg);
        break;

535
536
      // Output 

Mikaël Salson's avatar
Mikaël Salson committed
537
538
539
540
      case 'o':
        out_dir = optarg ;
        break;

541
542
      case 'b':
        f_basename = optarg;
Mikaël Salson's avatar
Mikaël Salson committed
543
544
        break;

545
546
547
548
      case 'a':
	output_sequences_by_cluster = true;
	break;

Mikaël Salson's avatar
Mikaël Salson committed
549
550
551
552
      case 't':
        trim_sequences = atoi(optarg);
        break;

553
554
555
556
      case 'v':
	verbose += 1 ;
	break;

557
558
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
559
560
561
562
      case '%':
	ratio_reads_clone = atof(optarg);
	break;

563
      case 'r':
Mikaël Salson's avatar
Mikaël Salson committed
564
565
566
	min_reads_clone = atoi(optarg);
        break;

567
      case 'y':
Mathieu Giraud's avatar
Mathieu Giraud committed
568
	max_representatives = atoi_NO_LIMIT(optarg);
569
570
        break;

Mikaël Salson's avatar
Mikaël Salson committed
571
      case 'z':
Mathieu Giraud's avatar
Mathieu Giraud committed
572
	max_clones = atoi_NO_LIMIT(optarg);
573
        if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
574
          max_representatives = max_clones ;
Mikaël Salson's avatar
Mikaël Salson committed
575
        break;
576
577
578

      case 'A': // --all
	ratio_reads_clone = 0 ;
579
	min_reads_clone = 1 ;
580
581
	max_representatives = NO_LIMIT_VALUE ;
	max_clones = NO_LIMIT_VALUE ;
582
583
	break ;

584
      case 'X':
585
586
587
588
        max_reads_processed_sample = atoi_NO_LIMIT(optarg);
        break;

      case 'x':
Mathieu Giraud's avatar
Mathieu Giraud committed
589
        max_reads_processed = atoi_NO_LIMIT(optarg);
590
        break;
591

592
593
594
      // Labels

      case 'W':
Mathieu Giraud's avatar
Mathieu Giraud committed
595
        windows_labels[string(optarg)] = string("-W");
596
597
        break;

598
599
600
601
      case 'l':
	windows_labels_file = optarg; 
	break;

602
603
604
605
      case 'F':
        only_labeled_windows = true;
        break;

606
      // Clustering
607

608
      case '=':
609
610
	forced_edges = optarg;
	break;
Mikaël Salson's avatar
Mikaël Salson committed
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
632
      // Fine segmentation
633
634
635
636
      case '3':
        detect_CDR3 = true;
        break;
        
637
      case 'f':
Mikaël Salson's avatar
Mikaël Salson committed
638
639
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
640
641

      case 'u':
642
643
644
        output_unsegmented_detail_full = output_unsegmented_detail; // -uuu
        output_unsegmented_detail = output_unsegmented;             // -uu
        output_unsegmented = true ;                                 // -u
Mathieu Giraud's avatar
Mathieu Giraud committed
645
        break;
646
647
648
      case 'U':
        output_segmented = true;
        break;
649
650
651
      case 'K':
        output_affects = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
652
653
      }

654
655
656

  //$$ options: post-processing+display

657

658
  if (!germline_system.size())
659
    {
660
      cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
661
662
663
      exit(1);
    }

664
665
  if (options_s_k > 1)
    {
666
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
667
668
669
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
670
671
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
672
673
674
675
676
677
678
679
680
681
682
683
684
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
685
      cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
686
687
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
688

689
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
690

691
692
693
  // Default seeds

#ifndef NO_SPACED_SEEDS
694
  if (kmer_size == DEFAULT_K)
695
696
    {
      if (germline_system.find("TRA") != string::npos)
697
	seed = SEED_S13 ;
698
699
700

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
701
	seed = SEED_S12 ;
702
      else // TRD, TRG, IGK, IGL, custom, multi
703
	seed = SEED_S10 ;
704

705
      kmer_size = seed_weight(seed);
706
707
708
    }
#else
  {
709
    cerr << ERROR_STRING << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
710
711
712
713
714
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
715
716
717
718
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
719
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
720
721
722
723
      exit(1);
    }
#endif

724

Mathieu Giraud's avatar
Mathieu Giraud committed
725
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
726
    {
727
      cerr << ERROR_STRING << "Too small -w. The window size should be positive" << endl;
728
729
730
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
731
732
733
734
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
735
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
736
737
738
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
739
740
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
741
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
742
743
744
    exit(2);
  }

745
746
747
748
749
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
750
751
752
  out_dir += "/" ;

  /// Load labels ;
753
  load_into_map(windows_labels, windows_labels_file, "-l");
Mikaël Salson's avatar
Mikaël Salson committed
754
755

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
756
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
757
    break;
758
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
759
760
761
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
762
763
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
764
765
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
766
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
767
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
768
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
769
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
770
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
771
772
773
774
775
776
777
778
779
780
781
782

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
783
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
784
785


786
787
  //////////////////////////////////
  // Warning for non-optimal use
788

789
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
790
791
792
793
794
795
796
797
798
799
800
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-A' option or with a large '-z' option" << endl ;
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-c segment' option" << endl ;
    }
  
801
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
802
    {
803
      cout << "* Vidjil's purpose is to efficiently extract windows overlapping the CDR3" << endl
804
           << "* to cluster reads into clones ('-c clones')." << endl
805
806
           << "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
           << "* is slow and should be done only on small datasets or for testing purposes." << endl
807
808
809
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
810

811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832

  /////////////////////////////////////////
  //            JSON OUTPUT              //
  /////////////////////////////////////////

  string f_json = out_dir + f_basename + JSON_SUFFIX ;

  ostringstream stream_cmdline;
  for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";

  json j = {
    {"vidjil_json_version", VIDJIL_JSON_VERSION},
    {"samples", {
        {"number", 1},
        {"original_names", {f_reads}},
        {"run_timestamp", {time_buffer}},
        {"producer", {soft_version}},
        {"commandline", {stream_cmdline.str()}}
      }}
  };


833
834
835
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
836

837
838
839
840
841
842
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
      multi_germline_one_index_per_germline = false ;
    }

843
  MultiGermline *multigermline = new MultiGermline(indexType, multi_germline_one_index_per_germline);
844

845
    {
846
      cout << "Load germlines and build Kmer indexes" << endl ;
847
848
849
    
      if (multi_germline)
	{
850
          for (pair <string, string> path_file: multi_germline_paths_and_files)
851
852
853
854
855
856
857
858
            {
              try {
                multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR, trim_sequences);
              } catch (std::exception& e) {
                cerr << ERROR_STRING << "Vidjil cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
                exit(1);
              }
            }
859
860
861
862
863
864
	}
      else
	{
	  // Custom germline
	  Germline *germline;
	  germline = new Germline(germline_system, 'X',
865
                                  f_reps_V, f_reps_D, f_reps_J, 
866
                                  delta_min, seed, trim_sequences);
867

868
          germline->new_index(indexType);
869

870
871
872
	  multigermline->insert(germline);
	}
    }
873

874
    cout << endl ;
875

876
    if (!multi_germline_one_index_per_germline) {
877
      multigermline->build_with_one_index(seed, true);
878
    }
879

880
      if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
881
882
883
        if (!multigermline->index) {
          multigermline->build_with_one_index(seed, false);
        }
884
      }
885

886
      if (multi_germline_unexpected_recombinations_12) {
887
        Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, -10, "", trim_sequences);
888
        pseudo->seg_method = SEG_METHOD_MAX12 ;
889
890
        pseudo->index = multigermline->index ;
        multigermline->germlines.push_back(pseudo);
891
892
893
      }

      if (multi_germline_unexpected_recombinations_1U) {
894
        Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, -10, "", trim_sequences);
895
        pseudo_u->seg_method = SEG_METHOD_MAX1U ;
896
        // TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
897
898
        pseudo_u->index = multigermline->index ;
        multigermline->germlines.push_back(pseudo_u);
899
900
    }

901
      // Should come after the initialization of regular (and possibly pseudo) germlines
902
    {
903
904
      for (pair <string, string> path_file: multi_germline_paths_and_files)
        multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE, trim_sequences);
905
      if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
906
907
        multigermline->insert_in_one_index(multigermline->index, true);
      }
908
909
    }

910
911
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
912
913

    multigermline->finish();
914
    cout << "Germlines loaded: " ;
915
916
    cout << *multigermline ;
    cout << endl ;
917
918

    // Number of reads for e-value computation
919
    unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
920

921
    
922
923
  //////////////////////////////////
  //$$ Read sequence files
924
925
926
927

    int only_nth_read = 1 ;
    if (max_reads_processed_sample != NO_LIMIT_VALUE)
      {
928
        only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
929
930
931
        if (only_nth_read == 0)
          only_nth_read = 1 ;

932
        max_reads_processed = max_reads_processed_sample ;
933
934
935
936
937

        if (only_nth_read > 1)
          cout << "Processing every " << only_nth_read
               << (only_nth_read == 2 ? "nd" : (only_nth_read == 3 ? "rd" : "th"))
               << " read" << endl ;
938
939
      }

940
  OnlineBioReader *reads;
941
942

  try {
943
    reads = OnlineBioReaderFactory::create(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
944
  } catch (const invalid_argument e) {
945
    cerr << ERROR_STRING << "Vidjil cannot open reads file " << f_reads << ": " << e.what() << endl;
946
947
948
949
950