vidjil.cpp 56.7 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
  This file is part of Vidjil-algo <http://www.vidjil.org>
  Copyright (C) 2011-2018 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9

10
  "Vidjil-algo" is free software: you can redistribute it and/or modify
Mikaël Salson's avatar
Mikaël Salson committed
11
12
13
14
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

15
  "Vidjil-algo" is distributed in the hope that it will be useful,
Mikaël Salson's avatar
Mikaël Salson committed
16
17
18
19
20
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
21
  along with "Vidjil-algo". If not, see <http://www.gnu.org/licenses/>
Mikaël Salson's avatar
Mikaël Salson committed
22
23
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

38
#include "core/check-compiler.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
#include "core/tools.h"
WebTogz's avatar
WebTogz committed
40
#include "core/json.h"
41
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
42
43
#include "core/kmerstore.h"
#include "core/fasta.h"
44
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
45
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
46
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
47
48
49
50
51
52
53
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
55
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
56

Mathieu Giraud's avatar
Mathieu Giraud committed
57
#include "lib/CLI11.hpp"
58
59
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
60
61
62
63
64
65
66
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

67
68
69
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

70
#define PROGNAME "vidjil-algo"
71
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
74
//$$ #define (mainly default options)

75
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
76
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
77

78
#define DEFAULT_READ_HEADER_SEPARATOR " "
79
#define DEFAULT_READS  "./demo/Stanford_S22.fasta"
80
#define DEFAULT_MIN_READS_CLONE 5
81
#define DEFAULT_MAX_REPRESENTATIVES 100
82
#define DEFAULT_MAX_CLONES 100
83
#define DEFAULT_RATIO_READS_CLONE 0.0
84
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
85

Mathieu Giraud's avatar
Mathieu Giraud committed
86
#define COMMAND_WINDOWS "windows"
87
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
88
#define COMMAND_SEGMENT "segment"
89
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
90
 
91
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
92

93
94
95
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
96
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
97
#define CLONE_FILENAME "clone.fa-"
98
99
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
100
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
101
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
102
#define AFFECTS_FILENAME ".affects"
103
104
105
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
106

107
#define DEFAULT_K      0
108
#define DEFAULT_W      50
109
#define DEFAULT_SEED   DEFAULT_GERMLINE_SEED
Mikaël Salson's avatar
Mikaël Salson committed
110

Mikaël Salson's avatar
Mikaël Salson committed
111
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mathieu Giraud's avatar
Mathieu Giraud committed
114
115
#define DEFAULT_KMER_THRESHOLD NO_LIMIT_VALUE

Mikaël Salson's avatar
Mikaël Salson committed
116
117
118
119
120
121
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

122
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
123

124
#define MAX_CLONES_FOR_SIMILARITY 20
125

126
// warn
127
#define WARN_MAX_CLONES 100
128
#define WARN_PERCENT_SEGMENTED 40
129
#define WARN_COVERAGE 0.6
130
#define WARN_NUM_CLONES_SIMILAR 10
131

Mikaël Salson's avatar
Mikaël Salson committed
132
133
134
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
135

Mikaël Salson's avatar
Mikaël Salson committed
136
137

using namespace std ;
138
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
139

Mathieu Giraud's avatar
Mathieu Giraud committed
140
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
141

Mathieu Giraud's avatar
Mathieu Giraud committed
142
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
143
144
145

extern int optind, optopt, opterr;

146
int usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
147
{
Mathieu Giraud's avatar
Mathieu Giraud committed
148
  cout
Mikaël Salson's avatar
Mikaël Salson committed
149
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
150
       << "Examples (see doc/algo.org)" << endl
151
152
153
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -r 1  demo/Demo-X5.fa           # (basic usage, detect the locus for each read," << endl
       << "                                                                                               #  cluster reads and report clones starting from the first read (-r 1)," << endl
       << "                                                                                               #  including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
154
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g:IGH    -3     demo/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
155
156
157
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -z 20 demo/LIL-L4.fastq.gz      # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
       << "  " << progname << " -c windows  -g germline/homo-sapiens.g   -y 0 -uu -U demo/LIL-L4.fastq.gz      # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
       << "  " << progname << " -c segment  -g germline/homo-sapiens.g   -2 -3 -X 50 demo/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing, here on 50 sampled reads)" << endl
158
       << "  " << progname << " -c germlines -g germline/homo-sapiens.g              demo/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
159
    ;
160
  return 1;
Mikaël Salson's avatar
Mikaël Salson committed
161
162
}

Mathieu Giraud's avatar
Mathieu Giraud committed
163

164
int atoi_NO_LIMIT(const char *optarg)
Mathieu Giraud's avatar
Mathieu Giraud committed
165
{
166
167
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
168
double atof_NO_LIMIT(const char *optarg)
169
170
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
171
172
}

173
174
175
176
177
178
179
180
181
182
string string_NO_LIMIT(string s)
{
  if (!strcmp(NO_LIMIT, s.c_str()))
    return NO_LIMIT_VALUE_STRING ;

  return s;
}



Mikaël Salson's avatar
Mikaël Salson committed
183
184
int main (int argc, char **argv)
{
185
  cout << "# " << PROGNAME << " -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
186
       << "# Copyright (C) 2011-2018 by the Vidjil team" << endl
187
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
188
       << endl
189
       << "# " << PROGNAME << " is free software, and you are welcome to redistribute it" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
190
191
192
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
193
       << endl
194
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use " << PROGNAME << "." << endl
Mikaël Salson's avatar
Mikaël Salson committed
195
196
       << endl ;

197
198
199
  //////////////////////////////////
  // Display version information or git log

200
201
  string soft_version = PROGNAME ;
  soft_version += " " ;
202
#ifdef RELEASE_TAG
203
  cout << "# version: " PROGNAME << " " << RELEASE_TAG << endl ;
204
205
206
207
208
209
210
211
212
213
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
214
215
  CLI::App app{"Vidjil"};

Mathieu Giraud's avatar
Mathieu Giraud committed
216
217
  //$$ options: defaults

Mathieu Giraud's avatar
Mathieu Giraud committed
218
219
220
221
  vector <string> v_reps_V ;
  vector <string> v_reps_D ;
  vector <string> v_reps_J ;

222
  list <pair <string, string>> multi_germline_paths_and_files ;
223

224
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
225
226
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
227
  bool seed_changed = false;
228
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
229

230
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
231
232
233
  
  string comp_filename = COMP_FILENAME;

234
  int wmer_size = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
235

236
  IndexTypes indexType = KMER_INDEX;
237

Mikaël Salson's avatar
Mikaël Salson committed
238
239
240
241
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
242
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
243
  
Mathieu Giraud's avatar
Mathieu Giraud committed
244
245
  bool save_comp = false;
  bool load_comp = false;
Mikaël Salson's avatar
Mikaël Salson committed
246
247
  
  int verbose = 0 ;
248
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
249

250
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
251
252
253
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
254
255
  // int average_deletion = 4;     // Average number of deletion in V or J

256
257
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
258

Mathieu Giraud's avatar
Mathieu Giraud committed
259
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
260
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
261

Mikaël Salson's avatar
Mikaël Salson committed
262
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
263

264
265
  bool trim_sequences_changed = false;
  
Mikaël Salson's avatar
Mikaël Salson committed
266
  bool output_sequences_by_cluster = false;
267
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
268
  bool output_unsegmented = false;
269
  bool output_unsegmented_detail = false;
270
  bool output_unsegmented_detail_full = false;
271
  bool output_affects = false;
272
273
  bool keep_unsegmented_as_clone = false;

274
275
  bool several_D = false;

Mathieu Giraud's avatar
Mathieu Giraud committed
276
  vector <string> multi_germlines ;
277
  bool multi_germline = false;
278
  bool multi_germline_mark = false;
279
  bool multi_germline_one_unique_index = false;
280
281
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
282

Mikaël Salson's avatar
Mikaël Salson committed
283
284
  string forced_edges = "" ;

285
  map <string, string> windows_labels ;
286
  vector <string> windows_labels_explicit ;
Mathieu Giraud's avatar
Mathieu Giraud committed
287
  string windows_labels_file = "" ;
288
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
289
290
291

  char c ;

292
293
  int options_s_k = 0 ;

294
  double expected_value = THRESHOLD_NB_EXPECTED;
295
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
296

297
298
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
299
  bool jsonLevenshteinComputed = false ;
WebTogz's avatar
WebTogz committed
300

Mathieu Giraud's avatar
Mathieu Giraud committed
301
302
  int kmer_threshold = DEFAULT_KMER_THRESHOLD;

Mathieu Giraud's avatar
Mathieu Giraud committed
303
304
  //$$ options: definition wiht CLI11
  string group = "";
Mathieu Giraud's avatar
Mathieu Giraud committed
305

Mathieu Giraud's avatar
Mathieu Giraud committed
306
  // cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
307

Mathieu Giraud's avatar
Mathieu Giraud committed
308
  group = "Files";
309
  app.add_option("reads", f_reads, "reads.fa/.fq/.gz") -> group(group) -> set_type_name("FILE");
310

311

Mathieu Giraud's avatar
Mathieu Giraud committed
312
  group = "Command selection";
Mathieu Giraud's avatar
Mathieu Giraud committed
313
314
315
316
317
318
319
  string cmd = COMMAND_CLONES;
  app.add_option("-c", cmd, "command"
                 "\n  \t\t" COMMAND_CLONES    "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
                 "\n  \t\t" COMMAND_WINDOWS   "  \t locus detection, window extraction"
                 "\n  \t\t" COMMAND_SEGMENT   "  \t detailed V(D)J designation (not recommended)"
                 "\n  \t\t" COMMAND_GERMLINES "  \t statistics on k-mers in different germlines")
    -> group(group) -> set_type_name("COMMAND");
Mikaël Salson's avatar
Mikaël Salson committed
320

Mathieu Giraud's avatar
Mathieu Giraud committed
321
  group = "Input" ;
322
  app.add_option("--header-sep", read_header_separator, "separator for headers in the reads file", true) -> group(group) -> level() -> set_type_name("CHAR")  ;
Mikaël Salson's avatar
Mikaël Salson committed
323

Mathieu Giraud's avatar
Mathieu Giraud committed
324
325
  
  group = "Germline presets (at least one -g or -V/(-D)/-J option must be given for all commands except -c " COMMAND_GERMLINES ")";
Mathieu Giraud's avatar
Mathieu Giraud committed
326
327
328
329
330
331
332
333
  app.add_option("-g", multi_germlines, R"Z(
         -g <.g file>(:filter)
                    multiple locus/germlines, with tuned parameters.
                    Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'
                    The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'
         -g <path>
                    multiple locus/germlines, shortcut for '-g <path>/)Z" DEFAULT_MULTI_GERMLINE_FILE R"Z(',
                    processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombination
334
)Z") -> group(group) -> set_type_name("GERMLINES");
335

336
337
338
  app.add_option("-V", v_reps_V, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
  app.add_option("-D", v_reps_D, "custom D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components") -> group(group) -> set_type_name("FILE");
  app.add_option("-J", v_reps_J, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
339

Mathieu Giraud's avatar
Mathieu Giraud committed
340
341
  group = "Locus/recombinations";
  app.add_flag("-d", several_D, "try to detect several D (experimental)") -> group(group);
342
  app.add_flag("-2",  multi_germline_unexpected_recombinations_12, "try to detect unexpected recombinations (must be used with -g)") -> group(group);
343

344

Mathieu Giraud's avatar
Mathieu Giraud committed
345
  group = "Experimental options (do not use)";
346
  app.add_flag("-I", multi_germline_mark, "ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
347
  app.add_flag("-1", multi_germline_one_unique_index,
348
349
350
               "use a unique index for all germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("-4", multi_germline_unexpected_recombinations_1U, "try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("--keep", keep_unsegmented_as_clone, "keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --keep -AX 20)") -> group(group) -> level();
351

Mathieu Giraud's avatar
Mathieu Giraud committed
352
353
354
355
356
357
358
  group = "Window prediction";
#ifndef NO_SPACED_SEEDS
  /*
       << "  (use either -s or -k option, but not both)" << endl
           << "  (all these options, except -w, are overriden when using -g)" << endl
  */
#endif
Mikaël Salson's avatar
Mikaël Salson committed
359

360
361
362
363
364
365
366
367
368
369
370
371
372
  app.add_option("-k",
                 [&](CLI::results_t res) {
                   int kmer_size ;
                   bool worked = CLI::detail::lexical_cast(res[0], kmer_size);
                   if (worked) {
                     seed = seed_contiguous(kmer_size);
                     seed_changed = true;
                     options_s_k++ ;
                   }
                   return worked;
                 },
                 "k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
373
374
375
376
    
#ifndef NO_SPACED_SEEDS
  //      << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
377
378
379
380
  app.add_option("-w", wmer_size,
                 "w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-e", expected_value,
                 "maximal e-value for determining if a V-J segmentation can be trusted", true) -> group(group) -> transform(string_NO_LIMIT);
381
382
383
384
385
386
387
  app.add_option("-t",
                 [&](CLI::results_t res) {
                   bool worked = CLI::detail::lexical_cast(res[0], trim_sequences);
                   trim_sequences_changed = true;
                   return true;
                 },
                 // trim_sequences,
Mathieu Giraud's avatar
Mathieu Giraud committed
388
                 "trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt  (0: no trim)") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
389

390
391
392
393
394
395
396
397
398
399
  app.add_option("-s",
                 [&](CLI::results_t res) {
                   seed = res[0] ;
                   options_s_k++ ;
                   seed_changed = true;
                   return true;
                 },
                 "spaced seeds used for the V/J affectation (default: depends on germline)"
                 ) -> group(group) -> level();

Mathieu Giraud's avatar
Mathieu Giraud committed
400
/*
401
#ifdef NO_SPACED_SEEDS
402
403
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
404
405
*/
  
Mikaël Salson's avatar
Mikaël Salson committed
406

407

408

409
  group = "Labeled sequences (windows related to these sequences will be kept even if -r/--ratio thresholds are not reached)";
410
411
412

  app.add_option("-W", windows_labels_explicit, "label the given sequence") -> group(group);

413
  app.add_option("-l", windows_labels_file, "label a set of sequences given in <file>") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
414
  app.add_flag("-F", only_labeled_windows, "filter -- keep only the windows related to the labeled sequences") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
415
416


Mathieu Giraud's avatar
Mathieu Giraud committed
417
418
  //  if (advanced)
  group = "Fine segmentation options (second pass)";
419

420
421
422
423
424
  app.add_option("-f",
                 [&segment_cost](CLI::results_t res) {
                   segment_cost = strToCost(res[0].c_str(), VDJ); 
                   return true;
                 },
425
                 "use custom Cost for fine segmenter : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_SEGMENT_COST) + ")"
426
                 ) -> group(group);
427

428
  app.add_option("-E", expected_value_D, "maximal e-value for determining if a D segment can be trusted", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
429

430
431
  app.add_option("-Z", kmer_threshold, "typical number of V genes, selected by k-mer comparison, to compare to the read ('" NO_LIMIT "': all genes, default)", true) -> group(group)
    -> transform(string_NO_LIMIT) -> level() -> set_type_name("NB") ;
Mathieu Giraud's avatar
Mathieu Giraud committed
432
433
  
  group = "Clone analysis (second pass)";
434
  app.add_flag("-3,--cdr3", detect_CDR3, "CDR3/JUNCTION detection (requires gapped V/J germlines)") -> group(group);
435

Mathieu Giraud's avatar
Mathieu Giraud committed
436
437
438
439
440
  
  // if (advanced)
  group = "Additional clustering (experimental)" ;

  app.add_flag_function("-q", [&](size_t n) { indexType = AC_AUTOMATON; });
441
  app.add_option("-n", epsilon, "minimum required neighbors for automatic clustering. No automatic clusterisation if =0.", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
442

443
444
445
446
  app.add_option("-N", minPts, "", true) -> group(group) -> level();
  app.add_option("-S", save_comp, "generate and save comparative matrix for clustering") -> group(group) -> level();
  app.add_option("-L", load_comp, "load comparative matrix for clustering") -> group(group) -> level();
  app.add_option("--forced-edges", forced_edges, "manual clustering -- a file used to force some specific edges") -> group(group) -> level();
447
448
449
450
451
452

  app.add_option("-C",
                 [&cluster_cost](CLI::results_t res) {
                   cluster_cost = strToCost(res[0].c_str(), Cluster);
                   return true;
                 },
453
                 "use custom Cost for automatic clustering : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_CLUSTER_COST) + ")"
454
                 ) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
455
456
457
458
459
460

  group = "Limits to report a clone (or a window)";
  app.add_option("-r", min_reads_clone, "minimal number of reads supporting a clone", true) -> group(group);
  app.add_option("--ratio", ratio_reads_clone, "minimal percentage of reads supporting a clone", true) -> group(group);

  group = "Limits to further analyze some clones";
461
462
  app.add_option("-y", max_representatives,
                 "maximal number of clones computed with a consensus sequence ('" NO_LIMIT "': no limit)", true) -> group(group) -> transform(string_NO_LIMIT);
463
464
465
466
467
468
469
470
471

  app.add_option("-z",
                 [&max_clones, &max_representatives](CLI::results_t res) {
                   max_clones = atoi_NO_LIMIT(res[0].c_str());
                   if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
                     max_representatives = max_clones ;
                   return true;
                   // TODO: return false on bad input
                 },
472
473
                 "maximal number of clones to be analyzed with a full V(D)J designation ('" NO_LIMIT "': no limit, do not use)"
                 ) -> group(group) -> set_type_name("INT=" + string_of_int(max_clones));
Mathieu Giraud's avatar
Mathieu Giraud committed
474
475
476
477
478
479
480

  app.add_flag_function("-A", [&](size_t n) {
      ratio_reads_clone = 0 ;
      min_reads_clone = 1 ;
      max_representatives = NO_LIMIT_VALUE ;
      max_clones = NO_LIMIT_VALUE ;
    },
481
    "reports and segments all clones (-r 0 --ratio 0 -y " NO_LIMIT " -z " NO_LIMIT "), to be used only on very small datasets (for example -AX 20)") -> group(group);
482

Mathieu Giraud's avatar
Mathieu Giraud committed
483
  app.add_option("-x", max_reads_processed,
484
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), only first reads") -> group(group) -> transform(string_NO_LIMIT);
Mathieu Giraud's avatar
Mathieu Giraud committed
485
  app.add_option("-X", max_reads_processed_sample,
486
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads") -> group(group) -> transform(string_NO_LIMIT);
487

Mathieu Giraud's avatar
Mathieu Giraud committed
488
489
490
  
  group = "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)";
  app.add_flag("-U", output_segmented, "output segmented reads (in " SEGMENTED_FILENAME " file)") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
491
492
493
494
495
496
497
498
499
500
  app.add_flag_function("-u", [&](size_t n) {
      output_unsegmented = output_unsegmented_detail_full ;       // -uuu
      output_unsegmented_detail_full = output_unsegmented_detail; // -uu
      output_unsegmented_detail = true;                           // -u
    }, R"Z(
        -u          output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files)
        -uu         output unsegmented reads, gathered by unsegmentation cause, all reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files) (use only for debug)
        -uuu        output unsegmented reads, all reads, including a )Z" UNSEGMENTED_FILENAME R"Z( file (use only for debug)
)Z") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
501
  app.add_flag("-K", output_affects, "output detailed k-mer affectation on all reads (in " AFFECTS_FILENAME " file) (use only for debug, for example -KX 100)") -> group(group);
502

503

Mathieu Giraud's avatar
Mathieu Giraud committed
504
  group = "Output";
505
506
  app.add_option("-o", out_dir, "output directory", true) -> group(group) -> set_type_name("PATH");
  app.add_option("-b", f_basename, "output basename (by default basename of the input file)") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
507
  app.add_flag("-a", output_sequences_by_cluster, "output all sequences by cluster (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
508
  app.add_flag_function("-v", [&](size_t n) { verbose += n ; }, "verbose mode") -> group(group);
509

510

511
512
513
514
515
  app.add_flag_function("-H", [&](size_t n) { throw CLI::CallForAdvancedHelp() ; },
                        "help, including advanced and experimental options"
                        "\n                              "
                        "The full help is available in the doc/algo.org file."
                        );
516

Mikaël Salson's avatar
Mikaël Salson committed
517
518


Mathieu Giraud's avatar
Mathieu Giraud committed
519
520
  //$$ options: parsing
  CLI11_PARSE(app, argc, argv);
Mathieu Giraud's avatar
Mathieu Giraud committed
521

Mathieu Giraud's avatar
Mathieu Giraud committed
522
  //$$ options: post-processing+display
Mikaël Salson's avatar
Mikaël Salson committed
523

Mathieu Giraud's avatar
Mathieu Giraud committed
524
525
526
527
528
529
530
531
532
533
534
535
  if (cmd == COMMAND_CLONES)
    command = CMD_CLONES;
  else if (cmd == COMMAND_SEGMENT)
    command = CMD_SEGMENT;
  else if (cmd == COMMAND_WINDOWS)
    command = CMD_WINDOWS;
  else if (cmd == COMMAND_GERMLINES)
    command = CMD_GERMLINES;
  else {
    cerr << "Unknwown command " << optarg << endl;
    usage(argv[0], false);
  }
536

Mathieu Giraud's avatar
Mathieu Giraud committed
537
538
539
  list <string> f_reps_V(v_reps_V.begin(), v_reps_V.end());
  list <string> f_reps_D(v_reps_D.begin(), v_reps_D.end());
  list <string> f_reps_J(v_reps_J.begin(), v_reps_J.end());
540

541

Mathieu Giraud's avatar
Mathieu Giraud committed
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
  for (string arg: multi_germlines)
    {
      multi_germline = true;

      struct stat buffer;
      if (stat(arg.c_str(), &buffer) == 0)
        {
          if( buffer.st_mode & S_IFDIR )
            {
              // argument is a directory
              multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
              break ;
                }
        }

      // argument is not a directory (and basename can include ':' with a filter)
      multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
    }


562
  if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
563
    {
564
      cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
565
      return 1;
566
567
    }

568
569
  if (options_s_k > 1)
    {
570
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
571
      return 1;
572
573
    }

574
575
576
  for(string lab : windows_labels_explicit)
    windows_labels[lab] = string("-W");
  
Mikaël Salson's avatar
Mikaël Salson committed
577
578
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
579
580
581
  if (verbose)
    cout << "# verbose " << verbose << endl ;

582
  if (f_reads == DEFAULT_READS)
Mikaël Salson's avatar
Mikaël Salson committed
583
584
585
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
586

587
588
589
590
591
  //  else
  //  {
  //    cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
  //    return 1;
  //  }
Mathieu Giraud's avatar
Mathieu Giraud committed
592

593
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
594

595
596
  // Default seeds

597
#ifdef NO_SPACED_SEEDS
598
  if (! seed_changed)
599
    {
600
      cerr << ERROR_STRING << PROGNAME << " was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
601
      return 1;
602
603
604
605
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
606
607
608
609
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
610
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
611
      return 1;
Mathieu Giraud's avatar
Mathieu Giraud committed
612
613
614
    }
#endif

615

Mathieu Giraud's avatar
Mathieu Giraud committed
616
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
617
    {
618
      cerr << ERROR_STRING << "Too small -w. The window size should be positive" << endl;
619
      return 1;
620
621
    }

Mikaël Salson's avatar
Mikaël Salson committed
622
623
624
625
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
626
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
627
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
628
629
  }

Mikaël Salson's avatar
Mikaël Salson committed
630
631
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
632
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
633
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
634
635
  }

636
637
638
639
640
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
641
642
643
  out_dir += "/" ;

  /// Load labels ;
644
  load_into_map(windows_labels, windows_labels_file, "-l");
Mikaël Salson's avatar
Mikaël Salson committed
645
646

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
647
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
648
    break;
649
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
650
651
652
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
653
654
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
655
656
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
657
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
658
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
659
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
660
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
661
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
662
663
664
665
666
667
668
669
670
671
672
673

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
674
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
675
676


677
678
  //////////////////////////////////
  // Warning for non-optimal use
679

680
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
681
682
    {
      cout << endl
683
	   << "* WARNING: " << PROGNAME << " was run with '-A' option or with a large '-z' option" << endl ;
684
685
686
687
688
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
689
	   << "* WARNING: " << PROGNAME << " was run with '-c segment' option" << endl ;
690
691
    }
  
692
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
693
    {
694
      cout << "* " << PROGNAME << " efficientl extracts windows overlapping the CDR3" << endl
695
           << "* to cluster reads into clones ('-c clones')." << endl
696
697
           << "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
           << "* is slow and should be done only on small datasets or for testing purposes." << endl
698
699
700
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
701

702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723

  /////////////////////////////////////////
  //            JSON OUTPUT              //
  /////////////////////////////////////////

  string f_json = out_dir + f_basename + JSON_SUFFIX ;

  ostringstream stream_cmdline;
  for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";

  json j = {
    {"vidjil_json_version", VIDJIL_JSON_VERSION},
    {"samples", {
        {"number", 1},
        {"original_names", {f_reads}},
        {"run_timestamp", {time_buffer}},
        {"producer", {soft_version}},
        {"commandline", {stream_cmdline.str()}}
      }}
  };


724
725
726
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
727

728
729
730
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
731
      multi_germline_one_unique_index = true ;
732
733
    }

734
  MultiGermline *multigermline = new MultiGermline(indexType, !multi_germline_one_unique_index);
735

736
    {
737
      cout << "Load germlines and build Kmer indexes" << endl ;
738
739
740
    
      if (multi_germline)
	{
741
          for (pair <string, string> path_file: multi_germline_paths_and_files)
742
743
            {
              try {
744
                multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR,
745
                                               FIRST_IF_UNCHANGED("", seed, seed_changed),
746
                                               FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
747
              } catch (std::exception& e) {
748
                cerr << ERROR_STRING << PROGNAME << " cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
749
                delete multigermline;
750
                return 1;
751
752
              }
            }
753
754
755
756
757
	}
      else
	{
	  // Custom germline
	  Germline *germline;
758
	  germline = new Germline("custom", 'X',
759
760
                                  f_reps_V, f_reps_D, f_reps_J,
                                  seed, trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
761

762
          germline->new_index(indexType);
763

764
765
766
	  multigermline->insert(germline);
	}
    }
767

768
    cout << endl ;
769

770
    if (multi_germline_one_unique_index) {
771
      multigermline->build_with_one_index(seed, true);
772
    }
773

774
      if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
775
776
777
        if (!multigermline->index) {
          multigermline->build_with_one_index(seed, false);
        }
778
      }
779

780
      if (multi_germline_unexpected_recombinations_12) {
781
        Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
782
        pseudo->seg_method = SEG_METHOD_MAX12 ;
783
        pseudo->set_index(multigermline->index);
784
        multigermline->germlines.push_back(pseudo);
785
786
787
      }

      if (multi_germline_unexpected_recombinations_1U) {
788
        Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
789
        pseudo_u->seg_method = SEG_METHOD_MAX1U ;
790
        // TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
791
        pseudo_u->set_index(multigermline->index);
792
        multigermline->germlines.push_back(pseudo_u);
793
794
    }

795
      // Should come after the initialization of regular (and possibly pseudo) germlines
796
    {
797
      for (pair <string, string> path_file: multi_germline_paths_and_files)
798
        multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE,
799
                                       FIRST_IF_UNCHANGED("", seed, seed_changed),
800
                                       FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
801
      if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
802
803
        multigermline->insert_in_one_index(multigermline->index, true);
      }
804
805
    }

806
807
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
808
809

    multigermline->finish();
810
    cout << "Germlines loaded: " ;
811
812
    cout << *multigermline ;
    cout << endl ;
813
814

    // Number of reads for e-value computation
815
    unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
816

817
    
818
819
  //////////////////////////////////
  //$$ Read sequence files
820
821
822
823

    int only_nth_read = 1 ;
    if (max_reads_processed_sample != NO_LIMIT_VALUE)
      {
824
        only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
825
826
827
        if (only_nth_read == 0)
          only_nth_read = 1 ;

828
        max_reads_processed = max_reads_processed_sample ;
829
830
831
832
833

        if (only_nth_read > 1)
          cout << "Processing every " << only_nth_read
               << (only_nth_read == 2 ? "nd" : (only_nth_read == 3 ? "rd" : "th"))
               << " read" << endl ;
834
835
      }

836
  OnlineBioReader *reads;
837
838

  try {
839
    reads = OnlineBioReaderFactory::create(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
840
  } catch (const invalid_argument e) {
841
    cerr << ERROR_STRING << PROGNAME << " cannot open reads file " << f_reads << ": " << e.what() << endl;
842
    return 1;
843
844
845
846
847
  }

  out_dir += "/";


848
849
850
851
852
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
853
      map <char, int> stats_kmer, stats_max;
854
      IKmerStore<KmerAffect> *index = multigermline->index ;
855
856

      // Initialize statistics, with two additional categories
857
858
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), BIOREADER_AMBIGUOUS));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), BIOREADER_UNKNOWN));
859
      
860
      for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
861
	{
862
863
864
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
865
866
	}
      
867
      // init forbidden for .max()
868
869
870
      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());
871
      
872
873
874
      // Loop through all reads

      int nb_reads = 0 ;