vidjil.cpp 56.4 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
  This file is part of Vidjil-algo <http://www.vidjil.org>
  Copyright (C) 2011-2018 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9

10
  "Vidjil-algo" is free software: you can redistribute it and/or modify
Mikaël Salson's avatar
Mikaël Salson committed
11
12
13
14
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

15
  "Vidjil-algo" is distributed in the hope that it will be useful,
Mikaël Salson's avatar
Mikaël Salson committed
16
17
18
19
20
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
21
  along with "Vidjil-algo". If not, see <http://www.gnu.org/licenses/>
Mikaël Salson's avatar
Mikaël Salson committed
22
23
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

38
#include "core/check-compiler.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
#include "core/tools.h"
WebTogz's avatar
WebTogz committed
40
#include "core/json.h"
41
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
42
43
#include "core/kmerstore.h"
#include "core/fasta.h"
44
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
45
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
46
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
47
48
49
50
51
52
53
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
55
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
56

Mathieu Giraud's avatar
Mathieu Giraud committed
57
#include "lib/CLI11.hpp"
58
59
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
60
61
62
63
64
65
66
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

67
68
69
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

70
#define PROGNAME "vidjil-algo"
71
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
74
//$$ #define (mainly default options)

75
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
76
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
77

78
#define DEFAULT_READ_HEADER_SEPARATOR " "
79
#define DEFAULT_READS  "./demo/Stanford_S22.fasta"
80
#define DEFAULT_MIN_READS_CLONE 5
81
#define DEFAULT_MAX_REPRESENTATIVES 100
82
#define DEFAULT_MAX_CLONES 100
83
#define DEFAULT_RATIO_READS_CLONE 0.0
84
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
85

Mathieu Giraud's avatar
Mathieu Giraud committed
86
#define COMMAND_WINDOWS "windows"
87
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
88
#define COMMAND_SEGMENT "segment"
89
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
90
 
91
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
92

93
94
95
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
96
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
97
#define CLONE_FILENAME "clone.fa-"
98
99
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
100
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
101
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
102
#define AFFECTS_FILENAME ".affects"
103
104
105
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
106

107
#define DEFAULT_K      0
108
#define DEFAULT_W      50
109
#define DEFAULT_SEED   DEFAULT_GERMLINE_SEED
Mikaël Salson's avatar
Mikaël Salson committed
110

Mikaël Salson's avatar
Mikaël Salson committed
111
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mathieu Giraud's avatar
Mathieu Giraud committed
114
115
#define DEFAULT_KMER_THRESHOLD NO_LIMIT_VALUE

Mikaël Salson's avatar
Mikaël Salson committed
116
117
118
119
120
121
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

122
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
123

124
#define MAX_CLONES_FOR_SIMILARITY 20
125

126
// warn
127
#define WARN_MAX_CLONES 100
128
#define WARN_PERCENT_SEGMENTED 40
129
#define WARN_COVERAGE 0.6
130
#define WARN_NUM_CLONES_SIMILAR 10
131

Mikaël Salson's avatar
Mikaël Salson committed
132
133
134
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
135

Mikaël Salson's avatar
Mikaël Salson committed
136
137

using namespace std ;
138
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
139

Mathieu Giraud's avatar
Mathieu Giraud committed
140
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
141

Mathieu Giraud's avatar
Mathieu Giraud committed
142
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
143
144
145

extern int optind, optopt, opterr;

146
int usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
147
{
Mathieu Giraud's avatar
Mathieu Giraud committed
148
  cout
Mikaël Salson's avatar
Mikaël Salson committed
149
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
150
       << "Examples (see doc/algo.org)" << endl
151
152
153
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -r 1  demo/Demo-X5.fa           # (basic usage, detect the locus for each read," << endl
       << "                                                                                               #  cluster reads and report clones starting from the first read (-r 1)," << endl
       << "                                                                                               #  including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
154
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g:IGH    -3     demo/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
155
156
157
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -z 20 demo/LIL-L4.fastq.gz      # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
       << "  " << progname << " -c windows  -g germline/homo-sapiens.g   -y 0 -uu -U demo/LIL-L4.fastq.gz      # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
       << "  " << progname << " -c segment  -g germline/homo-sapiens.g   -2 -3 -X 50 demo/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing, here on 50 sampled reads)" << endl
158
       << "  " << progname << " -c germlines -g germline/homo-sapiens.g              demo/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
159
    ;
160
  return 1;
Mikaël Salson's avatar
Mikaël Salson committed
161
162
}

Mathieu Giraud's avatar
Mathieu Giraud committed
163

164
int atoi_NO_LIMIT(const char *optarg)
Mathieu Giraud's avatar
Mathieu Giraud committed
165
{
166
167
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
168
double atof_NO_LIMIT(const char *optarg)
169
170
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
171
172
}

173
174
175
176
177
178
179
180
181
182
string string_NO_LIMIT(string s)
{
  if (!strcmp(NO_LIMIT, s.c_str()))
    return NO_LIMIT_VALUE_STRING ;

  return s;
}



Mikaël Salson's avatar
Mikaël Salson committed
183
184
int main (int argc, char **argv)
{
185
  cout << "# " << PROGNAME << " -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
186
       << "# Copyright (C) 2011-2018 by the Vidjil team" << endl
187
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
188
       << endl
189
       << "# " << PROGNAME << " is free software, and you are welcome to redistribute it" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
190
191
192
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
193
       << endl
194
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use " << PROGNAME << "." << endl
Mikaël Salson's avatar
Mikaël Salson committed
195
196
       << endl ;

197
198
199
  //////////////////////////////////
  // Display version information or git log

200
201
  string soft_version = PROGNAME ;
  soft_version += " " ;
202
#ifdef RELEASE_TAG
203
  cout << "# version: " PROGNAME << " " << RELEASE_TAG << endl ;
204
205
206
207
208
209
210
211
212
213
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
214
215
  CLI::App app{"Vidjil"};

Mathieu Giraud's avatar
Mathieu Giraud committed
216
217
  //$$ options: defaults

Mathieu Giraud's avatar
Mathieu Giraud committed
218
219
220
221
  vector <string> v_reps_V ;
  vector <string> v_reps_D ;
  vector <string> v_reps_J ;

222
  list <pair <string, string>> multi_germline_paths_and_files ;
223

224
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
225
226
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
227
  bool seed_changed = false;
228
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
229

230
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
231
232
233
  
  string comp_filename = COMP_FILENAME;

234
  int wmer_size = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
235

236
  IndexTypes indexType = KMER_INDEX;
237

Mikaël Salson's avatar
Mikaël Salson committed
238
239
240
241
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
242
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
243
  
Mathieu Giraud's avatar
Mathieu Giraud committed
244
245
  bool save_comp = false;
  bool load_comp = false;
Mikaël Salson's avatar
Mikaël Salson committed
246
247
  
  int verbose = 0 ;
248
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
249

250
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
251
252
253
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
254
255
  // int average_deletion = 4;     // Average number of deletion in V or J

256
257
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
258

Mathieu Giraud's avatar
Mathieu Giraud committed
259
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
260
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
261

Mikaël Salson's avatar
Mikaël Salson committed
262
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
263

264
265
  bool trim_sequences_changed = false;
  
Mikaël Salson's avatar
Mikaël Salson committed
266
  bool output_sequences_by_cluster = false;
267
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
268
  bool output_unsegmented = false;
269
  bool output_unsegmented_detail = false;
270
  bool output_unsegmented_detail_full = false;
271
  bool output_affects = false;
272
273
  bool keep_unsegmented_as_clone = false;

274
275
  bool several_D = false;

Mathieu Giraud's avatar
Mathieu Giraud committed
276
  vector <string> multi_germlines ;
277
  bool multi_germline = false;
278
  bool multi_germline_mark = false;
279
  bool multi_germline_one_unique_index = false;
280
281
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
282

Mikaël Salson's avatar
Mikaël Salson committed
283
284
  string forced_edges = "" ;

285
  map <string, string> windows_labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
286
  string windows_labels_file = "" ;
287
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
288
289
290

  char c ;

291
292
  int options_s_k = 0 ;

293
  double expected_value = THRESHOLD_NB_EXPECTED;
294
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
295

296
297
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
298
  bool jsonLevenshteinComputed = false ;
WebTogz's avatar
WebTogz committed
299

Mathieu Giraud's avatar
Mathieu Giraud committed
300
301
  int kmer_threshold = DEFAULT_KMER_THRESHOLD;

Mathieu Giraud's avatar
Mathieu Giraud committed
302
303
  //$$ options: definition wiht CLI11
  string group = "";
Mathieu Giraud's avatar
Mathieu Giraud committed
304

Mathieu Giraud's avatar
Mathieu Giraud committed
305
  // cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
306

Mathieu Giraud's avatar
Mathieu Giraud committed
307
  group = "Files";
308
  app.add_option("reads", f_reads, "reads.fa/.fq/.gz") -> group(group) -> set_type_name("FILE");
309

310

Mathieu Giraud's avatar
Mathieu Giraud committed
311
  group = "Command selection";
Mathieu Giraud's avatar
Mathieu Giraud committed
312
313
314
315
316
317
318
  string cmd = COMMAND_CLONES;
  app.add_option("-c", cmd, "command"
                 "\n  \t\t" COMMAND_CLONES    "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
                 "\n  \t\t" COMMAND_WINDOWS   "  \t locus detection, window extraction"
                 "\n  \t\t" COMMAND_SEGMENT   "  \t detailed V(D)J designation (not recommended)"
                 "\n  \t\t" COMMAND_GERMLINES "  \t statistics on k-mers in different germlines")
    -> group(group) -> set_type_name("COMMAND");
Mikaël Salson's avatar
Mikaël Salson committed
319

Mathieu Giraud's avatar
Mathieu Giraud committed
320
  group = "Input" ;
321
  app.add_option("--separator", read_header_separator, "separator for headers in the reads file", true) -> group(group) -> level() -> set_type_name("CHAR")  ;
Mikaël Salson's avatar
Mikaël Salson committed
322

Mathieu Giraud's avatar
Mathieu Giraud committed
323
324
  
  group = "Germline presets (at least one -g or -V/(-D)/-J option must be given for all commands except -c " COMMAND_GERMLINES ")";
Mathieu Giraud's avatar
Mathieu Giraud committed
325
326
327
328
329
330
331
332
333
  app.add_option("-g", multi_germlines, R"Z(
         -g <.g file>(:filter)
                    multiple locus/germlines, with tuned parameters.
                    Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'
                    The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'
         -g <path>
                    multiple locus/germlines, shortcut for '-g <path>/)Z" DEFAULT_MULTI_GERMLINE_FILE R"Z(',
                    processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombination
)Z") -> group(group) -> expected(1) -> set_type_name("GERMLINES");
334

335
336
337
  app.add_option("-V", v_reps_V, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
  app.add_option("-D", v_reps_D, "custom D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components") -> group(group) -> set_type_name("FILE");
  app.add_option("-J", v_reps_J, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
338

Mathieu Giraud's avatar
Mathieu Giraud committed
339
340
  group = "Locus/recombinations";
  app.add_flag("-d", several_D, "try to detect several D (experimental)") -> group(group);
341
  app.add_flag("-2",  multi_germline_unexpected_recombinations_12, "try to detect unexpected recombinations (must be used with -g)") -> group(group);
342

343

Mathieu Giraud's avatar
Mathieu Giraud committed
344
  group = "Experimental options (do not use)";
345
  app.add_flag("-I", multi_germline_mark, "ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
346
  app.add_flag("-1", multi_germline_one_unique_index,
347
348
349
               "use a unique index for all germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("-4", multi_germline_unexpected_recombinations_1U, "try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("--keep", keep_unsegmented_as_clone, "keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --keep -AX 20)") -> group(group) -> level();
350

Mathieu Giraud's avatar
Mathieu Giraud committed
351
352
353
354
355
356
357
  group = "Window prediction";
#ifndef NO_SPACED_SEEDS
  /*
       << "  (use either -s or -k option, but not both)" << endl
           << "  (all these options, except -w, are overriden when using -g)" << endl
  */
#endif
Mikaël Salson's avatar
Mikaël Salson committed
358

359
360
361
362
363
364
365
366
367
368
369
370
371
  app.add_option("-k",
                 [&](CLI::results_t res) {
                   int kmer_size ;
                   bool worked = CLI::detail::lexical_cast(res[0], kmer_size);
                   if (worked) {
                     seed = seed_contiguous(kmer_size);
                     seed_changed = true;
                     options_s_k++ ;
                   }
                   return worked;
                 },
                 "k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
372
373
374
375
    
#ifndef NO_SPACED_SEEDS
  //      << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
376
377
378
379
  app.add_option("-w", wmer_size,
                 "w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-e", expected_value,
                 "maximal e-value for determining if a V-J segmentation can be trusted", true) -> group(group) -> transform(string_NO_LIMIT);
Mathieu Giraud's avatar
Mathieu Giraud committed
380
381
  app.add_option("-t", trim_sequences, // trim_sequences_changed = true
                 "trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt  (0: no trim)") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
382

383
384
385
386
387
388
389
390
391
392
  app.add_option("-s",
                 [&](CLI::results_t res) {
                   seed = res[0] ;
                   options_s_k++ ;
                   seed_changed = true;
                   return true;
                 },
                 "spaced seeds used for the V/J affectation (default: depends on germline)"
                 ) -> group(group) -> level();

Mathieu Giraud's avatar
Mathieu Giraud committed
393
/*
394
#ifdef NO_SPACED_SEEDS
395
396
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
397
398
*/
  
Mikaël Salson's avatar
Mikaël Salson committed
399

400

401

Mathieu Giraud's avatar
Mathieu Giraud committed
402
403
404
405
406
407
  group = "Labeled sequences (windows related to these sequences will be kept even if -r/-% thresholds are not reached)";
  /*
  app.add_option("-W",
                 // TODO:  windows_labels[string(optarg)] = string("-W");
                 , "label the given sequence") -> group(group);
  */
408
  app.add_option("-l", windows_labels_file, "label a set of sequences given in <file>") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
409
  app.add_flag("-F", only_labeled_windows, "filter -- keep only the windows related to the labeled sequences") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
410
411


Mathieu Giraud's avatar
Mathieu Giraud committed
412
413
  //  if (advanced)
  group = "Fine segmentation options (second pass)";
414

415
416
417
418
419
  app.add_option("-f",
                 [&segment_cost](CLI::results_t res) {
                   segment_cost = strToCost(res[0].c_str(), VDJ); 
                   return true;
                 },
420
                 "use custom Cost for fine segmenter : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_SEGMENT_COST) + ")"
421
                 ) -> group(group);
422

423
  app.add_option("-E", expected_value_D, "maximal e-value for determining if a D segment can be trusted", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
424

425
426
  app.add_option("-Z", kmer_threshold, "typical number of V genes, selected by k-mer comparison, to compare to the read ('" NO_LIMIT "': all genes, default)", true) -> group(group)
    -> transform(string_NO_LIMIT) -> level() -> set_type_name("NB") ;
Mathieu Giraud's avatar
Mathieu Giraud committed
427
428
  
  group = "Clone analysis (second pass)";
429
  app.add_flag("-3,--cdr3", detect_CDR3, "CDR3/JUNCTION detection (requires gapped V/J germlines)") -> group(group);
430

Mathieu Giraud's avatar
Mathieu Giraud committed
431
432
433
434
435
  
  // if (advanced)
  group = "Additional clustering (experimental)" ;

  app.add_flag_function("-q", [&](size_t n) { indexType = AC_AUTOMATON; });
436
  app.add_option("-n", epsilon, "minimum required neighbors for automatic clustering. No automatic clusterisation if =0.", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
437

438
439
440
441
  app.add_option("-N", minPts, "", true) -> group(group) -> level();
  app.add_option("-S", save_comp, "generate and save comparative matrix for clustering") -> group(group) -> level();
  app.add_option("-L", load_comp, "load comparative matrix for clustering") -> group(group) -> level();
  app.add_option("--forced-edges", forced_edges, "manual clustering -- a file used to force some specific edges") -> group(group) -> level();
442
443
444
445
446
447

  app.add_option("-C",
                 [&cluster_cost](CLI::results_t res) {
                   cluster_cost = strToCost(res[0].c_str(), Cluster);
                   return true;
                 },
448
                 "use custom Cost for automatic clustering : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_CLUSTER_COST) + ")"
449
                 ) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
450
451
452
453
454
455

  group = "Limits to report a clone (or a window)";
  app.add_option("-r", min_reads_clone, "minimal number of reads supporting a clone", true) -> group(group);
  app.add_option("--ratio", ratio_reads_clone, "minimal percentage of reads supporting a clone", true) -> group(group);

  group = "Limits to further analyze some clones";
456
457
  app.add_option("-y", max_representatives,
                 "maximal number of clones computed with a consensus sequence ('" NO_LIMIT "': no limit)", true) -> group(group) -> transform(string_NO_LIMIT);
458
459
460
461
462
463
464
465
466

  app.add_option("-z",
                 [&max_clones, &max_representatives](CLI::results_t res) {
                   max_clones = atoi_NO_LIMIT(res[0].c_str());
                   if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
                     max_representatives = max_clones ;
                   return true;
                   // TODO: return false on bad input
                 },
Mathieu Giraud's avatar
Mathieu Giraud committed
467
                 "maximal number of clones to be analyzed with a full V(D)J designation ('" NO_LIMIT "': no limit, do not use)", true)-> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
468
469
470
471
472
473
474
475

  app.add_flag_function("-A", [&](size_t n) {
      ratio_reads_clone = 0 ;
      min_reads_clone = 1 ;
      max_representatives = NO_LIMIT_VALUE ;
      max_clones = NO_LIMIT_VALUE ;
    },
    "reports and segments all clones (-r 0 -% 0 -y " NO_LIMIT " -z " NO_LIMIT "), to be used only on very small datasets (for example -AX 20)") -> group(group);
476

477
478
479
480
  app.add_option("-x", max_reads_processed_sample,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), only first reads") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-X", max_reads_processed,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads") -> group(group) -> transform(string_NO_LIMIT);
481

Mathieu Giraud's avatar
Mathieu Giraud committed
482
483
484
  
  group = "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)";
  app.add_flag("-U", output_segmented, "output segmented reads (in " SEGMENTED_FILENAME " file)") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
485
486
487
488
489
490
491
492
493
494
  app.add_flag_function("-u", [&](size_t n) {
      output_unsegmented = output_unsegmented_detail_full ;       // -uuu
      output_unsegmented_detail_full = output_unsegmented_detail; // -uu
      output_unsegmented_detail = true;                           // -u
    }, R"Z(
        -u          output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files)
        -uu         output unsegmented reads, gathered by unsegmentation cause, all reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files) (use only for debug)
        -uuu        output unsegmented reads, all reads, including a )Z" UNSEGMENTED_FILENAME R"Z( file (use only for debug)
)Z") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
495
  app.add_flag("-K", output_affects, "output detailed k-mer affectation on all reads (in " AFFECTS_FILENAME " file) (use only for debug, for example -KX 100)") -> group(group);
496

497

Mathieu Giraud's avatar
Mathieu Giraud committed
498
  group = "Output";
499
500
  app.add_option("-o", out_dir, "output directory", true) -> group(group) -> set_type_name("PATH");
  app.add_option("-b", f_basename, "output basename (by default basename of the input file)") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
501
  app.add_flag("-a", output_sequences_by_cluster, "output all sequences by cluster (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
502
  app.add_flag_function("-v", [&](size_t n) { verbose += n ; }, "verbose mode") -> group(group);
503

504

505
506
507
508
509
  app.add_flag_function("-H", [&](size_t n) { throw CLI::CallForAdvancedHelp() ; },
                        "help, including advanced and experimental options"
                        "\n                              "
                        "The full help is available in the doc/algo.org file."
                        );
510

Mikaël Salson's avatar
Mikaël Salson committed
511
512


Mathieu Giraud's avatar
Mathieu Giraud committed
513
514
  //$$ options: parsing
  CLI11_PARSE(app, argc, argv);
Mathieu Giraud's avatar
Mathieu Giraud committed
515

Mathieu Giraud's avatar
Mathieu Giraud committed
516
  //$$ options: post-processing+display
Mikaël Salson's avatar
Mikaël Salson committed
517

Mathieu Giraud's avatar
Mathieu Giraud committed
518
519
520
521
522
523
524
525
526
527
528
529
  if (cmd == COMMAND_CLONES)
    command = CMD_CLONES;
  else if (cmd == COMMAND_SEGMENT)
    command = CMD_SEGMENT;
  else if (cmd == COMMAND_WINDOWS)
    command = CMD_WINDOWS;
  else if (cmd == COMMAND_GERMLINES)
    command = CMD_GERMLINES;
  else {
    cerr << "Unknwown command " << optarg << endl;
    usage(argv[0], false);
  }
530

Mathieu Giraud's avatar
Mathieu Giraud committed
531
532
533
  list <string> f_reps_V(v_reps_V.begin(), v_reps_V.end());
  list <string> f_reps_D(v_reps_D.begin(), v_reps_D.end());
  list <string> f_reps_J(v_reps_J.begin(), v_reps_J.end());
534

535

Mathieu Giraud's avatar
Mathieu Giraud committed
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
  for (string arg: multi_germlines)
    {
      multi_germline = true;

      struct stat buffer;
      if (stat(arg.c_str(), &buffer) == 0)
        {
          if( buffer.st_mode & S_IFDIR )
            {
              // argument is a directory
              multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
              break ;
                }
        }

      // argument is not a directory (and basename can include ':' with a filter)
      multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
    }


556
  if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
557
    {
558
      cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
559
      return 1;
560
561
    }

562
563
  if (options_s_k > 1)
    {
564
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
565
      return 1;
566
567
    }

Mikaël Salson's avatar
Mikaël Salson committed
568
569
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
570
571
572
  if (verbose)
    cout << "# verbose " << verbose << endl ;

573
  if (f_reads == DEFAULT_READS)
Mikaël Salson's avatar
Mikaël Salson committed
574
575
576
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
577

578
579
580
581
582
  //  else
  //  {
  //    cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
  //    return 1;
  //  }
Mathieu Giraud's avatar
Mathieu Giraud committed
583

584
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
585

586
587
  // Default seeds

588
#ifdef NO_SPACED_SEEDS
589
  if (! seed_changed)
590
    {
591
      cerr << ERROR_STRING << PROGNAME << " was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
592
      return 1;
593
594
595
596
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
597
598
599
600
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
601
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
602
      return 1;
Mathieu Giraud's avatar
Mathieu Giraud committed
603
604
605
    }
#endif

606

Mathieu Giraud's avatar
Mathieu Giraud committed
607
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
608
    {
609
      cerr << ERROR_STRING << "Too small -w. The window size should be positive" << endl;
610
      return 1;
611
612
    }

Mikaël Salson's avatar
Mikaël Salson committed
613
614
615
616
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
617
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
618
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
619
620
  }

Mikaël Salson's avatar
Mikaël Salson committed
621
622
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
623
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
624
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
625
626
  }

627
628
629
630
631
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
632
633
634
  out_dir += "/" ;

  /// Load labels ;
635
  load_into_map(windows_labels, windows_labels_file, "-l");
Mikaël Salson's avatar
Mikaël Salson committed
636
637

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
638
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
639
    break;
640
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
641
642
643
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
644
645
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
646
647
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
648
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
649
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
650
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
651
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
652
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
653
654
655
656
657
658
659
660
661
662
663
664

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
665
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
666
667


668
669
  //////////////////////////////////
  // Warning for non-optimal use
670

671
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
672
673
    {
      cout << endl
674
	   << "* WARNING: " << PROGNAME << " was run with '-A' option or with a large '-z' option" << endl ;
675
676
677
678
679
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
680
	   << "* WARNING: " << PROGNAME << " was run with '-c segment' option" << endl ;
681
682
    }
  
683
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
684
    {
685
      cout << "* " << PROGNAME << " efficientl extracts windows overlapping the CDR3" << endl
686
           << "* to cluster reads into clones ('-c clones')." << endl
687
688
           << "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
           << "* is slow and should be done only on small datasets or for testing purposes." << endl
689
690
691
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
692

693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714

  /////////////////////////////////////////
  //            JSON OUTPUT              //
  /////////////////////////////////////////

  string f_json = out_dir + f_basename + JSON_SUFFIX ;

  ostringstream stream_cmdline;
  for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";

  json j = {
    {"vidjil_json_version", VIDJIL_JSON_VERSION},
    {"samples", {
        {"number", 1},
        {"original_names", {f_reads}},
        {"run_timestamp", {time_buffer}},
        {"producer", {soft_version}},
        {"commandline", {stream_cmdline.str()}}
      }}
  };


715
716
717
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
718

719
720
721
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
722
      multi_germline_one_unique_index = true ;
723
724
    }

725
  MultiGermline *multigermline = new MultiGermline(indexType, !multi_germline_one_unique_index);
726

727
    {
728
      cout << "Load germlines and build Kmer indexes" << endl ;
729
730
731
    
      if (multi_germline)
	{
732
          for (pair <string, string> path_file: multi_germline_paths_and_files)
733
734
            {
              try {
735
                multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR,
736
                                               FIRST_IF_UNCHANGED("", seed, seed_changed),
737
                                               FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
738
              } catch (std::exception& e) {
739
                cerr << ERROR_STRING << PROGNAME << " cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
740
                delete multigermline;
741
                return 1;
742
743
              }
            }
744
745
746
747
748
	}
      else
	{
	  // Custom germline
	  Germline *germline;
749
	  germline = new Germline("custom", 'X',
750
751
                                  f_reps_V, f_reps_D, f_reps_J,
                                  seed, trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
752

753
          germline->new_index(indexType);
754

755
756
757
	  multigermline->insert(germline);
	}
    }
758

759
    cout << endl ;
760

761
    if (multi_germline_one_unique_index) {
762
      multigermline->build_with_one_index(seed, true);
763
    }
764

765
      if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
766
767
768
        if (!multigermline->index) {
          multigermline->build_with_one_index(seed, false);
        }
769
      }
770

771
      if (multi_germline_unexpected_recombinations_12) {
772
        Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
773
        pseudo->seg_method = SEG_METHOD_MAX12 ;
774
        pseudo->set_index(multigermline->index);
775
        multigermline->germlines.push_back(pseudo);
776
777
778
      }

      if (multi_germline_unexpected_recombinations_1U) {
779
        Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
780
        pseudo_u->seg_method = SEG_METHOD_MAX1U ;
781
        // TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
782
        pseudo_u->set_index(multigermline->index);
783
        multigermline->germlines.push_back(pseudo_u);
784
785
    }

786
      // Should come after the initialization of regular (and possibly pseudo) germlines
787
    {
788
      for (pair <string, string> path_file: multi_germline_paths_and_files)
789
        multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE,
790
                                       FIRST_IF_UNCHANGED("", seed, seed_changed),
791
                                       FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
792
      if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
793
794
        multigermline->insert_in_one_index(multigermline->index, true);
      }
795
796
    }

797
798
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
799
800

    multigermline->finish();
801
    cout << "Germlines loaded: " ;
802
803
    cout << *multigermline ;
    cout << endl ;
804
805

    // Number of reads for e-value computation
806
    unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
807

808
    
809
810
  //////////////////////////////////
  //$$ Read sequence files
811
812
813
814

    int only_nth_read = 1 ;
    if (max_reads_processed_sample != NO_LIMIT_VALUE)
      {
815
        only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
816
817
818
        if (only_nth_read == 0)
          only_nth_read = 1 ;

819
        max_reads_processed = max_reads_processed_sample ;
820
821
822
823
824

        if (only_nth_read > 1)
          cout << "Processing every " << only_nth_read
               << (only_nth_read == 2 ? "nd" : (only_nth_read == 3 ? "rd" : "th"))
               << " read" << endl ;
825
826
      }

827
  OnlineBioReader *reads;
828
829

  try {
830
    reads = OnlineBioReaderFactory::create(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
831
  } catch (const invalid_argument e) {
832
    cerr << ERROR_STRING << PROGNAME << " cannot open reads file " << f_reads << ": " << e.what() << endl;
833
    return 1;
834
835
836
837
838
  }

  out_dir += "/";


839
840
841
842
843
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
844
      map <char, int> stats_kmer, stats_max;
845
      IKmerStore<KmerAffect> *index = multigermline->index ;
846
847

      // Initialize statistics, with two additional categories
848
849
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), BIOREADER_AMBIGUOUS));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), BIOREADER_UNKNOWN));
850
      
851
      for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
852
	{
853
854
855
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
856
857
	}
      
858
      // init forbidden for .max()
859
860
861
      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());
862
      
863
864
865
      // Loop through all reads

      int nb_reads = 0 ;
866
867
868
      int total_length = 0 ;
      int s = index->getS();

869
870
      int kmer_size = seed_weight(seed);

871
872
873
874
875
      while (reads->hasNext())
	{
	  reads->next();
	  nb_reads++;
	  string seq = reads->getSequence().sequence;
876
	  total_length += seq.length() - s + 1;
877

878
	  KmerAffectAnalyser *kaa = new KmerAffectAnalyser(*index, seq);
879
880
881

	  for (int i = 0; i < kaa->count(); i++) 
	    {