vidjil.cpp 56.8 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
  This file is part of Vidjil-algo <http://www.vidjil.org>
  Copyright (C) 2011-2018 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9

10
  "Vidjil-algo" is free software: you can redistribute it and/or modify
Mikaël Salson's avatar
Mikaël Salson committed
11
12
13
14
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

15
  "Vidjil-algo" is distributed in the hope that it will be useful,
Mikaël Salson's avatar
Mikaël Salson committed
16
17
18
19
20
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
21
  along with "Vidjil-algo". If not, see <http://www.gnu.org/licenses/>
Mikaël Salson's avatar
Mikaël Salson committed
22
23
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

38
#include "core/check-compiler.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
#include "core/tools.h"
WebTogz's avatar
WebTogz committed
40
#include "core/json.h"
41
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
42
43
#include "core/kmerstore.h"
#include "core/fasta.h"
44
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
45
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
46
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
47
48
49
50
51
52
53
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
55
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
56

Mathieu Giraud's avatar
Mathieu Giraud committed
57
#include "lib/CLI11.hpp"
58
59
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
60
61
62
63
64
65
66
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

67
68
69
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

70
#define PROGNAME "vidjil-algo"
71
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
74
//$$ #define (mainly default options)

75
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
76
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
77

78
#define DEFAULT_READ_HEADER_SEPARATOR " "
79
#define DEFAULT_READS  "./demo/Stanford_S22.fasta"
80
#define DEFAULT_MIN_READS_CLONE 5
81
#define DEFAULT_MAX_REPRESENTATIVES 100
82
#define DEFAULT_MAX_CLONES 100
83
#define DEFAULT_RATIO_READS_CLONE 0.0
84
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
85

Mathieu Giraud's avatar
Mathieu Giraud committed
86
#define COMMAND_WINDOWS "windows"
87
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
88
#define COMMAND_SEGMENT "segment"
89
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
90
 
91
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
92

93
94
95
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
96
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
97
#define CLONE_FILENAME "clone.fa-"
98
99
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
100
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
101
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
102
#define AFFECTS_FILENAME ".affects"
103
104
105
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
106

107
#define DEFAULT_K      0
108
#define DEFAULT_W      50
109
#define DEFAULT_SEED   DEFAULT_GERMLINE_SEED
Mikaël Salson's avatar
Mikaël Salson committed
110

Mikaël Salson's avatar
Mikaël Salson committed
111
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mathieu Giraud's avatar
Mathieu Giraud committed
114
115
#define DEFAULT_KMER_THRESHOLD NO_LIMIT_VALUE

Mikaël Salson's avatar
Mikaël Salson committed
116
117
118
119
120
121
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

122
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
123

124
#define MAX_CLONES_FOR_SIMILARITY 20
125

126
// warn
127
#define WARN_MAX_CLONES 100
128
#define WARN_PERCENT_SEGMENTED 40
129
#define WARN_COVERAGE 0.6
130
#define WARN_NUM_CLONES_SIMILAR 10
131

Mikaël Salson's avatar
Mikaël Salson committed
132
133
134
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
135

Mikaël Salson's avatar
Mikaël Salson committed
136
137

using namespace std ;
138
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
139

Mathieu Giraud's avatar
Mathieu Giraud committed
140
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
141

Mathieu Giraud's avatar
Mathieu Giraud committed
142
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
143
144
145

extern int optind, optopt, opterr;

146
string usage_examples(char *progname)
Mikaël Salson's avatar
Mikaël Salson committed
147
{
148
149
  stringstream ss;
  ss
Mikaël Salson's avatar
Mikaël Salson committed
150
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
151
       << "Examples (see doc/algo.org)" << endl
152
153
154
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -r 1  demo/Demo-X5.fa           # (basic usage, detect the locus for each read," << endl
       << "                                                                                               #  cluster reads and report clones starting from the first read (-r 1)," << endl
       << "                                                                                               #  including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
155
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g:IGH    -3     demo/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
156
157
158
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -z 20 demo/LIL-L4.fastq.gz      # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
       << "  " << progname << " -c windows  -g germline/homo-sapiens.g   -y 0 -uu -U demo/LIL-L4.fastq.gz      # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
       << "  " << progname << " -c segment  -g germline/homo-sapiens.g   -2 -3 -X 50 demo/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing, here on 50 sampled reads)" << endl
159
       << "  " << progname << " -c germlines -g germline/homo-sapiens.g              demo/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
160
    ;
161
162

  return ss.str();
Mikaël Salson's avatar
Mikaël Salson committed
163
164
}

Mathieu Giraud's avatar
Mathieu Giraud committed
165

166
int atoi_NO_LIMIT(const char *optarg)
Mathieu Giraud's avatar
Mathieu Giraud committed
167
{
168
169
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
170
double atof_NO_LIMIT(const char *optarg)
171
172
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
173
174
}

175
176
177
178
179
180
181
182
183
184
string string_NO_LIMIT(string s)
{
  if (!strcmp(NO_LIMIT, s.c_str()))
    return NO_LIMIT_VALUE_STRING ;

  return s;
}



Mikaël Salson's avatar
Mikaël Salson committed
185
186
int main (int argc, char **argv)
{
187
  cout << "# " << PROGNAME << " -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
188
       << "# Copyright (C) 2011-2018 by the Vidjil team" << endl
189
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
190
       << endl
191
       << "# " << PROGNAME << " is free software, and you are welcome to redistribute it" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
192
193
194
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
195
       << endl
196
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use " << PROGNAME << "." << endl
Mikaël Salson's avatar
Mikaël Salson committed
197
198
       << endl ;

199
200
201
  //////////////////////////////////
  // Display version information or git log

202
203
  string soft_version = PROGNAME ;
  soft_version += " " ;
204
#ifdef RELEASE_TAG
205
  cout << "# version: " PROGNAME << " " << RELEASE_TAG << endl ;
206
207
208
209
210
211
212
213
214
215
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
216
217
  CLI::App app{"Vidjil"};

Mathieu Giraud's avatar
Mathieu Giraud committed
218
219
  //$$ options: defaults

Mathieu Giraud's avatar
Mathieu Giraud committed
220
221
222
223
  vector <string> v_reps_V ;
  vector <string> v_reps_D ;
  vector <string> v_reps_J ;

224
  list <pair <string, string>> multi_germline_paths_and_files ;
225

226
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
227
228
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
229
  bool seed_changed = false;
230
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
231

232
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
233
234
235
  
  string comp_filename = COMP_FILENAME;

236
  int wmer_size = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
237

238
  IndexTypes indexType = KMER_INDEX;
239

Mikaël Salson's avatar
Mikaël Salson committed
240
241
242
243
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
244
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
245
  
Mathieu Giraud's avatar
Mathieu Giraud committed
246
247
  bool save_comp = false;
  bool load_comp = false;
Mikaël Salson's avatar
Mikaël Salson committed
248
249
  
  int verbose = 0 ;
250
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
251

252
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
253
254
255
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
256
257
  // int average_deletion = 4;     // Average number of deletion in V or J

258
259
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
260

Mathieu Giraud's avatar
Mathieu Giraud committed
261
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
262
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
263

Mikaël Salson's avatar
Mikaël Salson committed
264
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
265

266
267
  bool trim_sequences_changed = false;
  
Mikaël Salson's avatar
Mikaël Salson committed
268
  bool output_sequences_by_cluster = false;
269
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
270
  bool output_unsegmented = false;
271
  bool output_unsegmented_detail = false;
272
  bool output_unsegmented_detail_full = false;
273
  bool output_affects = false;
274
275
  bool keep_unsegmented_as_clone = false;

276
277
  bool several_D = false;

Mathieu Giraud's avatar
Mathieu Giraud committed
278
  vector <string> multi_germlines ;
279
  bool multi_germline = false;
280
  bool multi_germline_mark = false;
281
  bool multi_germline_one_unique_index = false;
282
283
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
284

Mikaël Salson's avatar
Mikaël Salson committed
285
286
  string forced_edges = "" ;

287
  map <string, string> windows_labels ;
288
  vector <string> windows_labels_explicit ;
Mathieu Giraud's avatar
Mathieu Giraud committed
289
  string windows_labels_file = "" ;
290
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
291
292
293

  char c ;

294
295
  int options_s_k = 0 ;

296
  double expected_value = THRESHOLD_NB_EXPECTED;
297
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
298

299
300
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
301
  bool jsonLevenshteinComputed = false ;
WebTogz's avatar
WebTogz committed
302

Mathieu Giraud's avatar
Mathieu Giraud committed
303
304
  int kmer_threshold = DEFAULT_KMER_THRESHOLD;

Mathieu Giraud's avatar
Mathieu Giraud committed
305
306
  //$$ options: definition wiht CLI11
  string group = "";
Mathieu Giraud's avatar
Mathieu Giraud committed
307

Mathieu Giraud's avatar
Mathieu Giraud committed
308
  // cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
309

Mathieu Giraud's avatar
Mathieu Giraud committed
310
  group = "Files";
311
  app.add_option("reads", f_reads, "reads.fa/.fq/.gz") -> group(group) -> set_type_name("FILE");
312

313

Mathieu Giraud's avatar
Mathieu Giraud committed
314
  group = "Command selection";
Mathieu Giraud's avatar
Mathieu Giraud committed
315
316
317
318
319
320
321
  string cmd = COMMAND_CLONES;
  app.add_option("-c", cmd, "command"
                 "\n  \t\t" COMMAND_CLONES    "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
                 "\n  \t\t" COMMAND_WINDOWS   "  \t locus detection, window extraction"
                 "\n  \t\t" COMMAND_SEGMENT   "  \t detailed V(D)J designation (not recommended)"
                 "\n  \t\t" COMMAND_GERMLINES "  \t statistics on k-mers in different germlines")
    -> group(group) -> set_type_name("COMMAND");
Mikaël Salson's avatar
Mikaël Salson committed
322

Mathieu Giraud's avatar
Mathieu Giraud committed
323
  group = "Input" ;
324
  app.add_option("--header-sep", read_header_separator, "separator for headers in the reads file", true) -> group(group) -> level() -> set_type_name("CHAR")  ;
Mikaël Salson's avatar
Mikaël Salson committed
325

Mathieu Giraud's avatar
Mathieu Giraud committed
326
327
  
  group = "Germline presets (at least one -g or -V/(-D)/-J option must be given for all commands except -c " COMMAND_GERMLINES ")";
Mathieu Giraud's avatar
Mathieu Giraud committed
328
329
330
331
332
333
334
335
  app.add_option("-g", multi_germlines, R"Z(
         -g <.g file>(:filter)
                    multiple locus/germlines, with tuned parameters.
                    Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'
                    The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'
         -g <path>
                    multiple locus/germlines, shortcut for '-g <path>/)Z" DEFAULT_MULTI_GERMLINE_FILE R"Z(',
                    processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombination
336
)Z") -> group(group) -> set_type_name("GERMLINES");
337

338
339
340
  app.add_option("-V", v_reps_V, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
  app.add_option("-D", v_reps_D, "custom D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components") -> group(group) -> set_type_name("FILE");
  app.add_option("-J", v_reps_J, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
341

Mathieu Giraud's avatar
Mathieu Giraud committed
342
343
  group = "Locus/recombinations";
  app.add_flag("-d", several_D, "try to detect several D (experimental)") -> group(group);
344
  app.add_flag("-2",  multi_germline_unexpected_recombinations_12, "try to detect unexpected recombinations (must be used with -g)") -> group(group);
345

346

Mathieu Giraud's avatar
Mathieu Giraud committed
347
  group = "Experimental options (do not use)";
348
  app.add_flag("-I", multi_germline_mark, "ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
349
  app.add_flag("-1", multi_germline_one_unique_index,
350
351
352
               "use a unique index for all germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("-4", multi_germline_unexpected_recombinations_1U, "try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("--keep", keep_unsegmented_as_clone, "keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --keep -AX 20)") -> group(group) -> level();
353

Mathieu Giraud's avatar
Mathieu Giraud committed
354
355
356
357
358
359
360
  group = "Window prediction";
#ifndef NO_SPACED_SEEDS
  /*
       << "  (use either -s or -k option, but not both)" << endl
           << "  (all these options, except -w, are overriden when using -g)" << endl
  */
#endif
Mikaël Salson's avatar
Mikaël Salson committed
361

362
363
364
365
366
367
368
369
370
371
372
373
374
  app.add_option("-k",
                 [&](CLI::results_t res) {
                   int kmer_size ;
                   bool worked = CLI::detail::lexical_cast(res[0], kmer_size);
                   if (worked) {
                     seed = seed_contiguous(kmer_size);
                     seed_changed = true;
                     options_s_k++ ;
                   }
                   return worked;
                 },
                 "k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
375
376
377
378
    
#ifndef NO_SPACED_SEEDS
  //      << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
379
380
381
382
  app.add_option("-w", wmer_size,
                 "w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-e", expected_value,
                 "maximal e-value for determining if a V-J segmentation can be trusted", true) -> group(group) -> transform(string_NO_LIMIT);
383
384
385
386
387
388
389
  app.add_option("-t",
                 [&](CLI::results_t res) {
                   bool worked = CLI::detail::lexical_cast(res[0], trim_sequences);
                   trim_sequences_changed = true;
                   return true;
                 },
                 // trim_sequences,
Mathieu Giraud's avatar
Mathieu Giraud committed
390
                 "trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt  (0: no trim)") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
391

392
393
394
395
396
397
398
399
400
401
  app.add_option("-s",
                 [&](CLI::results_t res) {
                   seed = res[0] ;
                   options_s_k++ ;
                   seed_changed = true;
                   return true;
                 },
                 "spaced seeds used for the V/J affectation (default: depends on germline)"
                 ) -> group(group) -> level();

Mathieu Giraud's avatar
Mathieu Giraud committed
402
/*
403
#ifdef NO_SPACED_SEEDS
404
405
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
406
407
*/
  
Mikaël Salson's avatar
Mikaël Salson committed
408

409

410

411
  group = "Labeled sequences (windows related to these sequences will be kept even if -r/--ratio thresholds are not reached)";
412
413
414

  app.add_option("-W", windows_labels_explicit, "label the given sequence") -> group(group);

415
  app.add_option("-l", windows_labels_file, "label a set of sequences given in <file>") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
416
  app.add_flag("-F", only_labeled_windows, "filter -- keep only the windows related to the labeled sequences") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
417
418


Mathieu Giraud's avatar
Mathieu Giraud committed
419
420
  //  if (advanced)
  group = "Fine segmentation options (second pass)";
421

422
423
424
425
426
  app.add_option("-f",
                 [&segment_cost](CLI::results_t res) {
                   segment_cost = strToCost(res[0].c_str(), VDJ); 
                   return true;
                 },
427
                 "use custom Cost for fine segmenter : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_SEGMENT_COST) + ")"
428
                 ) -> group(group);
429

430
  app.add_option("-E", expected_value_D, "maximal e-value for determining if a D segment can be trusted", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
431

432
433
  app.add_option("-Z", kmer_threshold, "typical number of V genes, selected by k-mer comparison, to compare to the read ('" NO_LIMIT "': all genes, default)", true) -> group(group)
    -> transform(string_NO_LIMIT) -> level() -> set_type_name("NB") ;
Mathieu Giraud's avatar
Mathieu Giraud committed
434
435
  
  group = "Clone analysis (second pass)";
436
  app.add_flag("-3,--cdr3", detect_CDR3, "CDR3/JUNCTION detection (requires gapped V/J germlines)") -> group(group);
437

Mathieu Giraud's avatar
Mathieu Giraud committed
438
439
440
441
442
  
  // if (advanced)
  group = "Additional clustering (experimental)" ;

  app.add_flag_function("-q", [&](size_t n) { indexType = AC_AUTOMATON; });
443
  app.add_option("-n", epsilon, "minimum required neighbors for automatic clustering. No automatic clusterisation if =0.", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
444

445
446
447
448
  app.add_option("-N", minPts, "", true) -> group(group) -> level();
  app.add_option("-S", save_comp, "generate and save comparative matrix for clustering") -> group(group) -> level();
  app.add_option("-L", load_comp, "load comparative matrix for clustering") -> group(group) -> level();
  app.add_option("--forced-edges", forced_edges, "manual clustering -- a file used to force some specific edges") -> group(group) -> level();
449
450
451
452
453
454

  app.add_option("-C",
                 [&cluster_cost](CLI::results_t res) {
                   cluster_cost = strToCost(res[0].c_str(), Cluster);
                   return true;
                 },
455
                 "use custom Cost for automatic clustering : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_CLUSTER_COST) + ")"
456
                 ) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
457
458
459
460
461
462

  group = "Limits to report a clone (or a window)";
  app.add_option("-r", min_reads_clone, "minimal number of reads supporting a clone", true) -> group(group);
  app.add_option("--ratio", ratio_reads_clone, "minimal percentage of reads supporting a clone", true) -> group(group);

  group = "Limits to further analyze some clones";
463
464
  app.add_option("-y", max_representatives,
                 "maximal number of clones computed with a consensus sequence ('" NO_LIMIT "': no limit)", true) -> group(group) -> transform(string_NO_LIMIT);
465
466
467
468
469
470
471
472
473

  app.add_option("-z",
                 [&max_clones, &max_representatives](CLI::results_t res) {
                   max_clones = atoi_NO_LIMIT(res[0].c_str());
                   if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
                     max_representatives = max_clones ;
                   return true;
                   // TODO: return false on bad input
                 },
474
475
                 "maximal number of clones to be analyzed with a full V(D)J designation ('" NO_LIMIT "': no limit, do not use)"
                 ) -> group(group) -> set_type_name("INT=" + string_of_int(max_clones));
Mathieu Giraud's avatar
Mathieu Giraud committed
476
477
478
479
480
481
482

  app.add_flag_function("-A", [&](size_t n) {
      ratio_reads_clone = 0 ;
      min_reads_clone = 1 ;
      max_representatives = NO_LIMIT_VALUE ;
      max_clones = NO_LIMIT_VALUE ;
    },
483
    "reports and segments all clones (-r 0 --ratio 0 -y " NO_LIMIT " -z " NO_LIMIT "), to be used only on very small datasets (for example -AX 20)") -> group(group);
484

Mathieu Giraud's avatar
Mathieu Giraud committed
485
  app.add_option("-x", max_reads_processed,
486
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), only first reads") -> group(group) -> transform(string_NO_LIMIT);
Mathieu Giraud's avatar
Mathieu Giraud committed
487
  app.add_option("-X", max_reads_processed_sample,
488
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads") -> group(group) -> transform(string_NO_LIMIT);
489

Mathieu Giraud's avatar
Mathieu Giraud committed
490
491
492
  
  group = "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)";
  app.add_flag("-U", output_segmented, "output segmented reads (in " SEGMENTED_FILENAME " file)") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
493
494
495
496
497
498
499
500
501
502
  app.add_flag_function("-u", [&](size_t n) {
      output_unsegmented = output_unsegmented_detail_full ;       // -uuu
      output_unsegmented_detail_full = output_unsegmented_detail; // -uu
      output_unsegmented_detail = true;                           // -u
    }, R"Z(
        -u          output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files)
        -uu         output unsegmented reads, gathered by unsegmentation cause, all reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files) (use only for debug)
        -uuu        output unsegmented reads, all reads, including a )Z" UNSEGMENTED_FILENAME R"Z( file (use only for debug)
)Z") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
503
  app.add_flag("-K", output_affects, "output detailed k-mer affectation on all reads (in " AFFECTS_FILENAME " file) (use only for debug, for example -KX 100)") -> group(group);
504

505

Mathieu Giraud's avatar
Mathieu Giraud committed
506
  group = "Output";
507
508
  app.add_option("-o", out_dir, "output directory", true) -> group(group) -> set_type_name("PATH");
  app.add_option("-b", f_basename, "output basename (by default basename of the input file)") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
509
  app.add_flag("-a", output_sequences_by_cluster, "output all sequences by cluster (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
510
  app.add_flag_function("-v", [&](size_t n) { verbose += n ; }, "verbose mode") -> group(group);
511

512

513
514
515
516
517
  app.add_flag_function("-H", [&](size_t n) { throw CLI::CallForAdvancedHelp() ; },
                        "help, including advanced and experimental options"
                        "\n                              "
                        "The full help is available in the doc/algo.org file."
                        );
518

Mikaël Salson's avatar
Mikaël Salson committed
519
520


521
522
  app.set_footer(usage_examples(argv[0]));

Mathieu Giraud's avatar
Mathieu Giraud committed
523
524
  //$$ options: parsing
  CLI11_PARSE(app, argc, argv);
Mathieu Giraud's avatar
Mathieu Giraud committed
525

Mathieu Giraud's avatar
Mathieu Giraud committed
526
  //$$ options: post-processing+display
Mikaël Salson's avatar
Mikaël Salson committed
527

Mathieu Giraud's avatar
Mathieu Giraud committed
528
529
530
531
532
533
534
535
536
537
  if (cmd == COMMAND_CLONES)
    command = CMD_CLONES;
  else if (cmd == COMMAND_SEGMENT)
    command = CMD_SEGMENT;
  else if (cmd == COMMAND_WINDOWS)
    command = CMD_WINDOWS;
  else if (cmd == COMMAND_GERMLINES)
    command = CMD_GERMLINES;
  else {
    cerr << "Unknwown command " << optarg << endl;
538
    throw CLI::CallForHelp();
Mathieu Giraud's avatar
Mathieu Giraud committed
539
  }
540

Mathieu Giraud's avatar
Mathieu Giraud committed
541
542
543
  list <string> f_reps_V(v_reps_V.begin(), v_reps_V.end());
  list <string> f_reps_D(v_reps_D.begin(), v_reps_D.end());
  list <string> f_reps_J(v_reps_J.begin(), v_reps_J.end());
544

545

Mathieu Giraud's avatar
Mathieu Giraud committed
546
547
548
549
550
551
552
553
554
555
556
  for (string arg: multi_germlines)
    {
      multi_germline = true;

      struct stat buffer;
      if (stat(arg.c_str(), &buffer) == 0)
        {
          if( buffer.st_mode & S_IFDIR )
            {
              // argument is a directory
              multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
557
558
              continue ;
            }
Mathieu Giraud's avatar
Mathieu Giraud committed
559
560
561
562
563
564
565
        }

      // argument is not a directory (and basename can include ':' with a filter)
      multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
    }


566
  if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
567
    {
568
      cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
569
      return 1;
570
571
    }

572
573
  if (options_s_k > 1)
    {
574
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
575
      return 1;
576
577
    }

578
579
580
  for(string lab : windows_labels_explicit)
    windows_labels[lab] = string("-W");
  
Mikaël Salson's avatar
Mikaël Salson committed
581
582
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
583
584
585
  if (verbose)
    cout << "# verbose " << verbose << endl ;

586
  if (f_reads == DEFAULT_READS)
Mikaël Salson's avatar
Mikaël Salson committed
587
588
589
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
590

591
592
593
594
595
  //  else
  //  {
  //    cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
  //    return 1;
  //  }
Mathieu Giraud's avatar
Mathieu Giraud committed
596

597
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
598

599
600
  // Default seeds

601
#ifdef NO_SPACED_SEEDS
602
  if (! seed_changed)
603
    {
604
      cerr << ERROR_STRING << PROGNAME << " was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
605
      return 1;
606
607
608
609
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
610
611
612
613
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
614
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
615
      return 1;
Mathieu Giraud's avatar
Mathieu Giraud committed
616
617
618
    }
#endif

619

Mathieu Giraud's avatar
Mathieu Giraud committed
620
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
621
    {
622
      cerr << ERROR_STRING << "Too small -w. The window size should be positive" << endl;
623
      return 1;
624
625
    }

Mikaël Salson's avatar
Mikaël Salson committed
626
627
628
629
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
630
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
631
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
632
633
  }

Mikaël Salson's avatar
Mikaël Salson committed
634
635
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
636
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
637
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
638
639
  }

640
641
642
643
644
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
645
646
647
  out_dir += "/" ;

  /// Load labels ;
648
  load_into_map(windows_labels, windows_labels_file, "-l");
Mikaël Salson's avatar
Mikaël Salson committed
649
650

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
651
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
652
    break;
653
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
654
655
656
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
657
658
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
659
660
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
661
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
662
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
663
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
664
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
665
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
666
667
668
669
670
671
672
673
674
675
676
677

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
678
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
679
680


681
682
  //////////////////////////////////
  // Warning for non-optimal use
683

684
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
685
686
    {
      cout << endl
687
	   << "* WARNING: " << PROGNAME << " was run with '-A' option or with a large '-z' option" << endl ;
688
689
690
691
692
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
693
	   << "* WARNING: " << PROGNAME << " was run with '-c segment' option" << endl ;
694
695
    }
  
696
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
697
    {
698
      cout << "* " << PROGNAME << " efficientl extracts windows overlapping the CDR3" << endl
699
           << "* to cluster reads into clones ('-c clones')." << endl
700
701
           << "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
           << "* is slow and should be done only on small datasets or for testing purposes." << endl
702
703
704
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
705

706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727

  /////////////////////////////////////////
  //            JSON OUTPUT              //
  /////////////////////////////////////////

  string f_json = out_dir + f_basename + JSON_SUFFIX ;

  ostringstream stream_cmdline;
  for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";

  json j = {
    {"vidjil_json_version", VIDJIL_JSON_VERSION},
    {"samples", {
        {"number", 1},
        {"original_names", {f_reads}},
        {"run_timestamp", {time_buffer}},
        {"producer", {soft_version}},
        {"commandline", {stream_cmdline.str()}}
      }}
  };


728
729
730
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
731

732
733
734
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
735
      multi_germline_one_unique_index = true ;
736
737
    }

738
  MultiGermline *multigermline = new MultiGermline(indexType, !multi_germline_one_unique_index);
739

740
    {
741
      cout << "Load germlines and build Kmer indexes" << endl ;
742
743
744
    
      if (multi_germline)
	{
745
          for (pair <string, string> path_file: multi_germline_paths_and_files)
746
747
            {
              try {
748
                multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR,
749
                                               FIRST_IF_UNCHANGED("", seed, seed_changed),
750
                                               FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
751
              } catch (std::exception& e) {
752
                cerr << ERROR_STRING << PROGNAME << " cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
753
                delete multigermline;
754
                return 1;
755
756
              }
            }
757
758
759
760
761
	}
      else
	{
	  // Custom germline
	  Germline *germline;
762
	  germline = new Germline("custom", 'X',
763
764
                                  f_reps_V, f_reps_D, f_reps_J,
                                  seed, trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
765

766
          germline->new_index(indexType);
767

768
769
770
	  multigermline->insert(germline);
	}
    }
771

772
    cout << endl ;
773

774
    if (multi_germline_one_unique_index) {
775
      multigermline->build_with_one_index(seed, true);
776
    }
777

778
      if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
779
780
781
        if (!multigermline->index) {
          multigermline->build_with_one_index(seed, false);
        }
782
      }
783

784
      if (multi_germline_unexpected_recombinations_12) {
785
        Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
786
        pseudo->seg_method = SEG_METHOD_MAX12 ;
787
        pseudo->set_index(multigermline->index);
788
        multigermline->germlines.push_back(pseudo);
789
790
791
      }

      if (multi_germline_unexpected_recombinations_1U) {
792
        Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
793
        pseudo_u->seg_method = SEG_METHOD_MAX1U ;
794
        // TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
795
        pseudo_u->set_index(multigermline->index);
796
        multigermline->germlines.push_back(pseudo_u);
797
798
    }

799
      // Should come after the initialization of regular (and possibly pseudo) germlines
800
    {
801
      for (pair <string, string> path_file: multi_germline_paths_and_files)
802
        multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE,
803
                                       FIRST_IF_UNCHANGED("", seed, seed_changed),
804
                                       FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
805
      if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
806
807
        multigermline->insert_in_one_index(multigermline->index, true);
      }
808
809
    }

810
811
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
812
813

    multigermline->finish();
814
    cout << "Germlines loaded: " ;
815
816
    cout << *multigermline ;
    cout << endl ;
817
818

    // Number of reads for e-value computation
819
    unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
820

821
    
822
823
  //////////////////////////////////
  //$$ Read sequence files
824
825
826
827

    int only_nth_read = 1 ;
    if (max_reads_processed_sample != NO_LIMIT_VALUE)
      {
828
        only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
829
830
831
        if (only_nth_read == 0)
          only_nth_read = 1 ;

832
        max_reads_processed = max_reads_processed_sample ;
833
834
835
836
837

        if (only_nth_read > 1)
          cout << "Processing every " << only_nth_read
               << (only_nth_read == 2 ? "nd" : (only_nth_read == 3 ? "rd" : "th"))
               << " read" << endl ;
838
839
      }

840
  OnlineBioReader *reads;
841
842

  try {
843
    reads = OnlineBioReaderFactory::create(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
844
  } catch (const invalid_argument e) {
845
    cerr << ERROR_STRING << PROGNAME << " cannot open reads file " << f_reads << ": " << e.what() << endl;
846
    return 1;
847
848
849
850
851
  }

  out_dir += "/";


852
853
854
855
856
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
857
      map <char, int> stats_kmer, stats_max;
858
      IKmerStore<KmerAffect> *index = multigermline->index ;
859
860

      // Initialize statistics, with two additional categories
861
862
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), BIOREADER_AMBIGUOUS));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), BIOREADER_UNKNOWN));
863
      
864
      for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
865
	{
866
867
868
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
869
870
	}