vidjil.cpp 66.4 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil-algo <http://www.vidjil.org>
3
  Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
4 5 6 7 8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9

10
  "Vidjil-algo" is free software: you can redistribute it and/or modify
Mikaël Salson's avatar
Mikaël Salson committed
11 12 13 14
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

15
  "Vidjil-algo" is distributed in the hope that it will be useful,
Mikaël Salson's avatar
Mikaël Salson committed
16 17 18 19 20
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
21
  along with "Vidjil-algo". If not, see <http://www.gnu.org/licenses/>
Mikaël Salson's avatar
Mikaël Salson committed
22 23
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25 26 27 28 29 30 31 32 33 34 35 36 37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

38
#include "core/check-compiler.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
#include "core/tools.h"
WebTogz's avatar
WebTogz committed
40
#include "core/json.h"
41
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
42 43
#include "core/kmerstore.h"
#include "core/fasta.h"
44
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
45
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
46
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
47 48 49 50 51 52 53
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
55
#include "core/windowExtractor.h"
56
#include "core/output.h"
Mikaël Salson's avatar
Mikaël Salson committed
57

Mathieu Giraud's avatar
Mathieu Giraud committed
58
#include "lib/CLI11.hpp"
59
#include "lib/json.hpp"
60
#include "lib/CLI11_json.hpp"
61

Mikaël Salson's avatar
Mikaël Salson committed
62 63 64 65 66 67 68
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

69 70 71
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

72
#define PROGNAME "vidjil-algo"
73
#define VIDJIL_JSON_VERSION "2016b"
74
#define DOCUMENTATION "doc/vidjil-algo.md"
Mikaël Salson's avatar
Mikaël Salson committed
75

Mathieu Giraud's avatar
Mathieu Giraud committed
76 77
//$$ #define (mainly default options)

78
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
79
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
80

81
#define DEFAULT_READ_HEADER_SEPARATOR " "
82
#define DEFAULT_READS  "./demo/Stanford_S22.fasta"
83
#define DEFAULT_MIN_READS_CLONE 5
84
#define DEFAULT_MAX_REPRESENTATIVES 100
85
#define DEFAULT_MAX_CLONES 100
86
#define DEFAULT_RATIO_READS_CLONE 0.0
87
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
88

Mathieu Giraud's avatar
Mathieu Giraud committed
89
#define COMMAND_WINDOWS "windows"
90
#define COMMAND_CLONES "clones"
91
#define COMMAND_SEGMENT "designations"
92
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
93
 
94
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
95

96 97 98
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
99
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
100
#define CLONE_FILENAME "clone.fa-"
101 102
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
103
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
104
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
105
#define AFFECTS_FILENAME ".affects"
106 107
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
108
#define AIRR_SUFFIX ".tsv"
109
#define JSON_SUFFIX ".vidjil"
110
#define GZ_SUFFIX ".gz"
Mikaël Salson's avatar
Mikaël Salson committed
111

112
#define DEFAULT_K      0
113
#define DEFAULT_W      50
Mikaël Salson's avatar
Mikaël Salson committed
114

Mikaël Salson's avatar
Mikaël Salson committed
115
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
116
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
117 118 119 120 121
#define DEFAULT_MIN_COVER_REPRESENTATIVE 3 // At least 3 reads to support a
                                           // representative (consisting of at
                                           // least
                                           // DEFAULT_RATIO_REPRESENTATIVE of
                                           // the clone's reads)
Mathieu Giraud's avatar
Mathieu Giraud committed
122

123
#define DEFAULT_KMER_THRESHOLD 1
Mathieu Giraud's avatar
Mathieu Giraud committed
124

Mikaël Salson's avatar
Mikaël Salson committed
125 126 127 128 129 130
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

131
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
132

133
#define MAX_CLONES_FOR_SIMILARITY 20
134

135
// warn
Mathieu Giraud's avatar
Mathieu Giraud committed
136
#define WARN_MAX_CLONES 5000
137
#define WARN_PERCENT_SEGMENTED 40
138
#define WARN_COVERAGE 0.6
139
#define WARN_NUM_CLONES_SIMILAR 10
140

Mikaël Salson's avatar
Mikaël Salson committed
141
// display
142
#define CLONES_ON_STDOUT 50
Mikaël Salson's avatar
Mikaël Salson committed
143 144
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
145
#define PAD_HELP "\n                              "
Mikaël Salson's avatar
Mikaël Salson committed
146 147

using namespace std ;
148
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
149

Mathieu Giraud's avatar
Mathieu Giraud committed
150
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
151

Mathieu Giraud's avatar
Mathieu Giraud committed
152
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
153 154 155

extern int optind, optopt, opterr;

156
string usage_examples(char *progname)
Mikaël Salson's avatar
Mikaël Salson committed
157
{
158 159
  stringstream ss;
  ss
160
       << "Examples (see " DOCUMENTATION ")" << endl
161
       << "  " << progname << " -c clones       -g germline/homo-sapiens.g   -2 -3 -r 1  demo/Demo-X5.fa           # (basic usage, detect the locus for each read," << endl
162 163
       << "                                                                                               #  cluster reads and report clones starting from the first read (-r 1)," << endl
       << "                                                                                               #  including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
164 165 166 167 168
       << "  " << progname << " -c clones       -g germline/homo-sapiens.g:IGH    -3     demo/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
       << "  " << progname << " -c clones       -g germline/homo-sapiens.g   -2 -3 -z 20 demo/LIL-L4.fastq.gz      # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
       << "  " << progname << " -c windows      -g germline/homo-sapiens.g   -y 0 -uu -U demo/LIL-L4.fastq.gz      # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
       << "  " << progname << " -c designations -g germline/homo-sapiens.g   -2 -3 -X 50 demo/Stanford_S22.fasta   # (full analysis of each read, here on 50 sampled reads)" << endl
       << "  " << progname << " -c germlines    -g germline/homo-sapiens.g               demo/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
169
    ;
170 171

  return ss.str();
Mikaël Salson's avatar
Mikaël Salson committed
172 173
}

174 175 176 177
inline std::string failure_message_doc(const CLI::App *app, const CLI::Error &e) {
    std::string header = ERROR_STRING + std::string(e.what()) + "\n";
    header += "For more information, ";
    if(app->get_help_ptr() != nullptr)
178
        header += "run with " + app->get_help_ptr()->get_name() + " or ";
179 180 181
    header += "see " DOCUMENTATION ".\n";
    return header;
}
Mathieu Giraud's avatar
Mathieu Giraud committed
182

183
int atoi_NO_LIMIT(const char *optarg)
Mathieu Giraud's avatar
Mathieu Giraud committed
184
{
185 186
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
187
double atof_NO_LIMIT(const char *optarg)
188 189
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
190 191
}

192 193 194 195 196 197 198 199 200 201
string string_NO_LIMIT(string s)
{
  if (!strcmp(NO_LIMIT, s.c_str()))
    return NO_LIMIT_VALUE_STRING ;

  return s;
}



Mikaël Salson's avatar
Mikaël Salson committed
202 203
int main (int argc, char **argv)
{
204
  cout << "# " << PROGNAME << " -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
205
       << "# Copyright (C) 2011-2019 by the Vidjil team" << endl
206
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
207
       << "# VidjilNet consortium" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
208
       << endl
209
       << "# " << PROGNAME << " is free software, and you are welcome to redistribute it" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
210 211 212
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
213
       << endl
214
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use " << PROGNAME << "." << endl
Mikaël Salson's avatar
Mikaël Salson committed
215 216
       << endl ;

217 218 219
  //////////////////////////////////
  // Display version information or git log

220 221
  string soft_version = PROGNAME ;
  soft_version += " " ;
222
#ifdef RELEASE_TAG
223
  cout << "# version: " PROGNAME << " " << RELEASE_TAG << endl ;
224 225 226 227 228 229 230 231 232 233
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

234
  CLI::App app{"# vidjil-algo -- V(D)J recombinations analysis", argv[0]};
235
  app.config_formatter(std::make_shared<ConfigJSON>());
236 237
  app.get_formatter()->label("REQUIRED", "");
  app.get_formatter()->label("Positionnals", "");
238
  app.failure_message(failure_message_doc);
Mathieu Giraud's avatar
Mathieu Giraud committed
239

Mathieu Giraud's avatar
Mathieu Giraud committed
240
  //$$ options: defaults
Mathieu Giraud's avatar
Mathieu Giraud committed
241
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
242
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
243
  // int average_deletion = 4;     // Average number of deletion in V or J
Mathieu Giraud's avatar
Mathieu Giraud committed
244

245
  //$$ options: definition with CLI11
Mathieu Giraud's avatar
Mathieu Giraud committed
246
  string group = "";
Mathieu Giraud's avatar
Mathieu Giraud committed
247

248
  // ----------------------------------------------------------------------------------------------------------------------
249
  string f_reads = DEFAULT_READS ;
250 251 252 253 254 255
  app.add_option("reads_file", f_reads, R"Z(reads file, in one of the following formats:
                                  - FASTA (.fa/.fasta, .fa.gz/.fasta.gz)
                                  - FASTQ (.fq/.fastq, .fq.gz/.fastq.gz)
                                  - BAM (.bam)
                              Paired-end reads should be merged before given as an input to vidjil-algo.
                 )Z")
256
    -> required() -> type_name("");
257

258

259
  // ----------------------------------------------------------------------------------------------------------------------
Mathieu Giraud's avatar
Mathieu Giraud committed
260
  group = "Command selection";
261

Mathieu Giraud's avatar
Mathieu Giraud committed
262 263 264 265
  string cmd = COMMAND_CLONES;
  app.add_option("-c", cmd, "command"
                 "\n  \t\t" COMMAND_CLONES    "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
                 "\n  \t\t" COMMAND_WINDOWS   "  \t locus detection, window extraction"
266
                 "\n  \t\t" COMMAND_SEGMENT   "  \t detailed V(D)J designation, without prior clustering (not as efficient)"
Mathieu Giraud's avatar
Mathieu Giraud committed
267
                 "\n  \t\t" COMMAND_GERMLINES "  \t statistics on k-mers in different germlines")
268
    -> group(group) -> type_name("COMMAND");
Mikaël Salson's avatar
Mikaël Salson committed
269

270
  // ----------------------------------------------------------------------------------------------------------------------
Mathieu Giraud's avatar
Mathieu Giraud committed
271
  group = "Input" ;
272

Mathieu Giraud's avatar
Mathieu Giraud committed
273 274 275
  app.set_config("--config", "", "read a (.json) config.vidjil file with options") -> type_name("FILE")
    -> group(group) -> level();

276
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
277
  app.add_option("--header-sep", read_header_separator, "separator for headers in the reads file", false)
278
    -> group(group) -> level() -> type_name("CHAR='" DEFAULT_READ_HEADER_SEPARATOR "'");
Mikaël Salson's avatar
Mikaël Salson committed
279

280 281 282 283 284 285 286 287 288 289 290
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;

  app.add_option("--first-reads,-x", max_reads_processed,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), only first reads")
    -> group(group) -> transform(string_NO_LIMIT);

  app.add_option("--sampled-reads,-X", max_reads_processed_sample,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads")
    -> group(group) -> transform(string_NO_LIMIT);

291 292

  // ----------------------------------------------------------------------------------------------------------------------
Mathieu Giraud's avatar
Mathieu Giraud committed
293
  group = "Germline presets (at least one -g or -V/(-D)/-J option must be given)";
294

295
  vector <string> multi_germlines ;
296
  app.add_option("--germline,-g", multi_germlines, R"Z(
297
         -g <.g FILE>(:FILTER)
Mathieu Giraud's avatar
Mathieu Giraud committed
298 299 300
                    multiple locus/germlines, with tuned parameters.
                    Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'
                    The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'
301 302
         -g PATH
                    multiple locus/germlines, shortcut for '-g PATH/)Z" DEFAULT_MULTI_GERMLINE_FILE R"Z(',
303
                    processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with incomplete/unusal recombinations)Z")
304
    -> group(group) -> type_name("GERMLINES");
305

306 307 308 309
  vector <string> v_reps_V ;
  vector <string> v_reps_D ;
  vector <string> v_reps_J ;
   
310 311
  app.add_option("-V", v_reps_V,
                 "custom V germline multi-fasta file(s)")
312
    -> group(group) -> type_name("FILE");
313

314

315
  app.add_option("-D", v_reps_D,
316
                 "custom D germline multi-fasta file(s), analyze into V(D)J components")
317
    -> group(group) -> type_name("FILE");
318

319 320
  app.add_option("-J", v_reps_J,
                 "custom V germline multi-fasta file(s)")
321
    -> group(group) -> type_name("FILE");
322

323

324
  bool multi_germline_unexpected_recombinations_12 = false;
325
  app.add_flag("-2", multi_germline_unexpected_recombinations_12, "try to detect unexpected recombinations") -> group(group);
326

327

328 329
  // ----------------------------------------------------------------------------------------------------------------------
  group = "Recombination detection (\"window\" prediction, first pass)";
330 331 332
  group += "\n    (use either -s or -k option, but not both)";
  group += "\n    (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" ;
  group += "\n    (all these options, except -w, are overriden when using -g)";
333 334 335

  int options_s_k = 0 ;

336
  IndexTypes indexType = AC_AUTOMATON;
337
  app.add_flag_function("--plain-index",
338
                        [&](size_t n) { UNUSED(n); indexType = KMER_INDEX; },
339
                        "use a plain index (pre-2019 method) instead of the recommended Aho-Corasick-like automaton")
340
    -> group(group) -> level();
341

342 343
  string seed = DEFAULT_SEED ;
  bool seed_changed = false;
344
  app.add_option("--kmer,-k",
345 346 347 348 349 350 351 352 353 354
                 [&](CLI::results_t res) {
                   int kmer_size ;
                   bool worked = CLI::detail::lexical_cast(res[0], kmer_size);
                   if (worked) {
                     seed = seed_contiguous(kmer_size);
                     seed_changed = true;
                     options_s_k++ ;
                   }
                   return worked;
                 },
355
                 "k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)")
356
    -> group(group) -> level() -> type_name("INT");
357

358
  int wmer_size = DEFAULT_W ;
359
  app.add_option("--window,-w", wmer_size,
360
                 "w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)")
361
    -> group(group) -> level() -> transform(string_NO_LIMIT);
362

363 364 365 366 367
  double expected_value_kmer = NO_LIMIT_VALUE;
  app.add_option("--e-value-kmer", expected_value_kmer,
                 "maximal e-value for the k-mer heuristics ('" NO_LIMIT "': use same value than '-e')", true)
    -> group(group) -> level() -> transform(string_NO_LIMIT);

368 369
  int trim_sequences = DEFAULT_TRIM;
  bool trim_sequences_changed = false;
370
  app.add_option("--trim",
371
                 [&](CLI::results_t res) {
Mikaël Salson's avatar
Mikaël Salson committed
372
                   CLI::detail::lexical_cast(res[0], trim_sequences);
373 374 375 376
                   trim_sequences_changed = true;
                   return true;
                 },
                 // trim_sequences,
377
                 "trim V and J genes (resp. 5' and 3' regions) to keep at most <INT> nt  (0: no trim)")
378
    -> group(group) -> level() -> type_name("INT");
Mikaël Salson's avatar
Mikaël Salson committed
379

380
  app.add_option("--seed,-s",
381 382 383 384 385 386
                 [&](CLI::results_t res) {
                   seed = res[0] ;
                   options_s_k++ ;
                   seed_changed = true;
                   return true;
                 },
Mathieu Giraud's avatar
Mathieu Giraud committed
387
                 "seed, possibly spaced, used for the V/J affectation (default: depends on germline), given either explicitely or by an alias"
388
                 PAD_HELP + string_of_map(seedMap, " ")
389
                 )
390
    -> group(group) -> level() -> type_name("SEED=" DEFAULT_SEED);
391

Mikaël Salson's avatar
Mikaël Salson committed
392

393 394
  // ----------------------------------------------------------------------------------------------------------------------
  group = "Recombination detection, experimental options (do not use)";
395

396 397 398 399
  bool multi_germline_mark = false;
  bool multi_germline_one_unique_index = false;
  bool multi_germline_unexpected_recombinations_1U = false;

400
  app.add_flag("-I", multi_germline_mark,
401
               "ignore k-mers common to different germline systems (experimental, do not use)")
402 403
    -> group(group) -> level();

404
  app.add_flag("-1", multi_germline_one_unique_index,
405
               "use a unique index for all germline systems (experimental, do not use)")
406 407 408
    -> group(group) -> level();

  app.add_flag("-4", multi_germline_unexpected_recombinations_1U,
409
               "try to detect unexpected recombinations with translocations (experimental, do not use)")
410 411
    -> group(group) -> level();

412
  bool keep_unsegmented_as_clone = false;
413 414
  app.add_flag("--not-analyzed-as-clones", keep_unsegmented_as_clone,
               "consider not analyzed reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --not-analyzed-as-clones -AX 20)")
415
    -> group(group) -> level();
416

417

418
  // ----------------------------------------------------------------------------------------------------------------------
419
  group = "Labeled sequences (windows related to these sequences will be kept even if -r/--ratio thresholds are not reached)";
420 421 422

  vector <string> windows_labels_explicit ;
  string windows_labels_file = "" ;
Mathieu Giraud's avatar
Mathieu Giraud committed
423
  string windows_labels_json = "" ;
424

425 426
  app.add_option("--label", windows_labels_explicit, "label the given sequence(s)") -> group(group) -> level() -> type_name("SEQUENCE");
  app.add_option("--label-file", windows_labels_file, "label a set of sequences given in <file>") -> group(group) -> level() -> type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
427 428
  app.add_option("--label-json", windows_labels_json, "read a (.json) label.vidjil (experimental)") -> type_name("FILE")
      -> group(group) -> level();
429 430

  bool only_labeled_windows = false ;
431
  app.add_flag("--label-filter", only_labeled_windows, "filter -- keep only the windows related to the labeled sequences") -> group(group) -> level();
Mikaël Salson's avatar
Mikaël Salson committed
432 433


434
  // ----------------------------------------------------------------------------------------------------------------------
435
  group = "Limits to report and to analyze clones (second pass)";
436
  int max_clones_id = NO_LIMIT_VALUE ;
437 438 439
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;

440 441
  app.add_option("--min-reads,-r", min_reads_clone, "minimal number of reads supporting a clone", true) -> group(group);
  app.add_option("--min-ratio", ratio_reads_clone, "minimal percentage of reads supporting a clone", true) -> group(group);
442
  app.add_option("--max-clones", max_clones_id, "maximal number of output clones ('" NO_LIMIT "': no maximum, default)", false) -> group(group);
443

444 445 446
  int max_clones = DEFAULT_MAX_CLONES ;
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;

447
  app.add_option("--max-consensus,-y", max_representatives,
448 449
                 "maximal number of clones computed with a consensus sequence ('" NO_LIMIT "': no limit)", true)
    -> group(group) -> transform(string_NO_LIMIT);
450

451
  app.add_option("--max-designations,-z",
452 453 454 455 456 457 458
                 [&max_clones, &max_representatives](CLI::results_t res) {
                   max_clones = atoi_NO_LIMIT(res[0].c_str());
                   if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
                     max_representatives = max_clones ;
                   return true;
                   // TODO: return false on bad input
                 },
459
                 "maximal number of clones to be analyzed with a full V(D)J designation ('" NO_LIMIT "': no limit, do not use)")
460
    -> group(group) -> type_name("INT=" + string_of_int(max_clones));
461

462
  app.add_flag_function("--all", [&](size_t n) {
463
      UNUSED(n);
464 465 466 467 468
      ratio_reads_clone = 0 ;
      min_reads_clone = 1 ;
      max_representatives = NO_LIMIT_VALUE ;
      max_clones = NO_LIMIT_VALUE ;
    },
469
    "reports and analyzes all clones"
470 471
    PAD_HELP "(--min-reads 1 --min-ratio 0 --max-clones " NO_LIMIT" --max-consensus " NO_LIMIT " --max-designations " NO_LIMIT "),"
    PAD_HELP "to be used only on small datasets (for example --all -X 1000)")
472
    -> group(group);
473

474
  VirtualReadScore *readScorer = &DEFAULT_READ_SCORE;
475 476 477
  ReadQualityScore readQualityScore;
  app.add_flag_function("--consensus-on-longest-sequences",
                        [&readScorer, &readQualityScore](size_t n) {
478
                          UNUSED(n);
479 480
                          readScorer = &readQualityScore;
                        }, "for large clones, use a sample of the longest and highest quality reads to compute the consensus sequence (instead of a random sample)")
481
    ->group(group) -> level();
482 483 484

  // ----------------------------------------------------------------------------------------------------------------------
  group = "Clone analysis (second pass)";
485

486 487 488 489 490
  double expected_value = THRESHOLD_NB_EXPECTED;
  app.add_option("--e-value,-e", expected_value,
                 "maximal e-value for determining if a V-J segmentation can be trusted", true)
    -> group(group) -> transform(string_NO_LIMIT);

491
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
492
  app.add_option("--analysis-cost",
493 494 495 496
                 [&segment_cost](CLI::results_t res) {
                   segment_cost = strToCost(res[0].c_str(), VDJ); 
                   return true;
                 },
497
                 "use custom Cost for clone analysis: format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_SEGMENT_COST) + ")")
498
    -> group(group) -> level() -> type_name("COST");
Mathieu Giraud's avatar
Mathieu Giraud committed
499

500
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
501
  app.add_option("--analysis-e-value-D,-E", expected_value_D,
502 503
                 "maximal e-value for determining if a D segment can be trusted", true)
    -> group(group) -> level();
504

505
  int kmer_threshold = DEFAULT_KMER_THRESHOLD;
506 507
  app.add_option("--analysis-filter", kmer_threshold,
                 "typical number of V genes, filtered by k-mer comparison, to compare to the read ('" NO_LIMIT "': all genes)", true)
508
    -> group(group) -> transform(string_NO_LIMIT) -> level();
509

510 511 512
  bool several_D = false;
  app.add_flag("-d,--several-D", several_D, "try to detect several D (experimental)") -> group(group);

513
  bool detect_CDR3 = false;
514 515
  app.add_flag("-3,--cdr3", detect_CDR3, "CDR3/JUNCTION detection (requires gapped V/J germlines)")
    -> group(group);
516

517
  int alternative_genes = 0;
518
  app.add_option("--alternative-genes", alternative_genes, "number of alternative V(D)J genes to show beyond the most similar one", true)
519
    -> group(group) -> level();
520 521
  // ----------------------------------------------------------------------------------------------------------------------
  group = "Additional clustering (third pass, experimental)" ;
522 523 524

  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
525 526
  app.add_option("--cluster-epsilon", epsilon, "minimum required neighbors for automatic clustering. No automatic clusterisation if =0.", true) -> group(group) -> level();
  app.add_option("--cluster-N", minPts, "minimum required neighbors for automatic clustering", true) -> group(group) -> level();
527 528 529

  bool save_comp = false;
  bool load_comp = false;
530 531
  app.add_flag("--cluster-save-matrix", save_comp, "generate and save comparative matrix for clustering") -> group(group) -> level();
  app.add_flag("--cluster-load-matrix", load_comp, "load comparative matrix for clustering") -> group(group) -> level();
532 533

  string forced_edges = "" ;
534
  app.add_option("--cluster-forced-edges", forced_edges, "manual clustering -- a file used to force some specific edges") -> group(group) -> level() -> type_name("FILE");
535

536
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
537
  app.add_option("--cluster-cost",
538 539 540 541
                 [&cluster_cost](CLI::results_t res) {
                   cluster_cost = strToCost(res[0].c_str(), Cluster);
                   return true;
                 },
542
                 "use custom Cost for automatic clustering : format \"match, subst, indels, del_end, homo\" (default " + string_of_cost(DEFAULT_CLUSTER_COST) + ")")
543
    -> group(group) -> level() -> type_name("COST");
Mathieu Giraud's avatar
Mathieu Giraud committed
544 545

  
546
  // ----------------------------------------------------------------------------------------------------------------------
Mathieu Giraud's avatar
Mathieu Giraud committed
547
  group = "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)";
548 549

  bool output_segmented = false;
550 551
  app.add_flag("--out-analyzed,-U", output_segmented,
               "output analyzed reads (in " SEGMENTED_FILENAME " file)")
552 553
    -> group(group);

554 555 556 557
  bool output_unsegmented = false;
  bool output_unsegmented_detail = false;
  bool output_unsegmented_detail_full = false;

558
  app.add_flag_function("--out-unanalyzed,-u", [&](size_t n) {
559 560 561
      output_unsegmented = (n >= 3);             // -uuu
      output_unsegmented_detail_full = (n >= 2); // -uu
      output_unsegmented_detail = (n >= 1);      // -u
Mathieu Giraud's avatar
Mathieu Giraud committed
562
    }, R"Z(
563 564 565
        -u          output unanalyzed reads, gathered by cause, except for very short and 'too few V/J' reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files)
        -uu         output unanalyzed reads, gathered by cause, all reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files) (use only for debug)
        -uuu        output unanalyzed reads, all reads, including a )Z" UNSEGMENTED_FILENAME R"Z( file (use only for debug))Z")
566
    -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
567

568 569 570
  bool output_sequences_by_cluster = false;
  app.add_flag("--out-reads", output_sequences_by_cluster, "output all reads by clones (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);

571
  bool output_affects = false;
572 573 574
  app.add_flag("--out-affects,-K", output_affects,
               "output detailed k-mer affectation for each read (in " AFFECTS_FILENAME " file) (use only for debug, for example -KX 100)")
    -> group(group) -> level();
575

576

577
  // ----------------------------------------------------------------------------------------------------------------------
Mathieu Giraud's avatar
Mathieu Giraud committed
578
  group = "Output";
579 580 581 582

  string out_dir = DEFAULT_OUT_DIR;
  string f_basename = "";

583 584
  app.add_option("--dir,-o", out_dir, "output directory", true) -> group(group) -> type_name("PATH");
  app.add_option("--base,-b", f_basename, "output basename (by default basename of the input file)") -> group(group) -> type_name("STRING");
585

586
  bool out_gz = false;
587
  app.add_flag("--gz", out_gz, "output compressed .tsv.gz and .vidjil.gz files") -> group(group) -> level();
588

Mathieu Giraud's avatar
Mathieu Giraud committed
589
  bool no_airr = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
590
  bool no_vidjil = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
591
  app.add_flag("--no-airr", no_airr, "do not output AIRR .tsv") -> group(group) -> level();
592
  app.add_flag("--no-vidjil", no_vidjil, "do not output clones in .vidjil") -> group(group) -> level();
593 594

  int verbose = 0 ;
595
  app.add_flag_function("--verbose,-v", [&](size_t n) { verbose += n ; }, "verbose mode") -> group(group);
596

Mathieu Giraud's avatar
Mathieu Giraud committed
597 598
  bool __only_on_exit__clean_memory; // Do not use except on exit, see #3729
  app.add_flag("--clean-memory", __only_on_exit__clean_memory, "clean memory on exit") -> group(group) -> level();
599

600 601 602 603 604 605 606 607 608 609 610 611
  // ----------------------------------------------------------------------------------------------------------------------
  group = "Presets";

  app.add_option("--grep-reads",
    [&only_labeled_windows,&windows_labels_explicit,&output_sequences_by_cluster](CLI::results_t res) {
      only_labeled_windows = true;
      windows_labels_explicit.push_back(res[0].c_str());
      output_sequences_by_cluster = true;
      return true;
    },
    "output, by clone, reads related to the given window sequence, even when they are below the thresholds"
    PAD_HELP "(equivalent to --label SEQUENCE -label-filter --out-reads)")
612
    -> group(group) -> level() -> type_name("SEQUENCE");
613

614 615
  // ----------------------------------------------------------------------------------------------------------------------
  group = "Help";
616
  app.set_help_flag("--help,-h", "help")
617 618
    -> group(group);

619
  app.add_flag_function("--help-advanced,-H", [&](size_t n) { UNUSED(n); throw CLI::CallForAdvancedHelp() ; },
620 621
                        "help, including advanced and experimental options"
                        "\n                              "
622
                        "The full help is available in " DOCUMENTATION ".")
623
    -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
624 625


626 627 628 629 630 631 632 633 634 635
  // Deprecated options
  bool deprecated = false;

#define DEPRECATED(options, text) app.add_flag_function((options), [&](size_t n) { UNUSED(n); deprecated = true ; return app.exit(CLI::ConstructionError((text), 1));}) -> level(3);

  DEPRECATED("-t", "'-t' is deprecated, please use '--trim'");
  DEPRECATED("-A", "'-A' is deprecated, please use '--all'");
  DEPRECATED("-a", "'-a' is deprecated, please use '--out-reads'");
  DEPRECATED("-l", "'-l' is deprecated, please use '--label'");

636
  // ----------------------------------------------------------------------------------------------------------------------
637
  app.footer(usage_examples(argv[0]));
638

Mathieu Giraud's avatar
Mathieu Giraud committed
639 640
  //$$ options: parsing
  CLI11_PARSE(app, argc, argv);
Mathieu Giraud's avatar
Mathieu Giraud committed
641

Mathieu Giraud's avatar
Mathieu Giraud committed
642
  //$$ options: post-processing+display
Mikaël Salson's avatar
Mikaël Salson committed
643

644
  int command = CMD_CLONES;
Mathieu Giraud's avatar
Mathieu Giraud committed
645 646 647 648 649 650 651 652
  if (cmd == COMMAND_CLONES)
    command = CMD_CLONES;
  else if (cmd == COMMAND_SEGMENT)
    command = CMD_SEGMENT;
  else if (cmd == COMMAND_WINDOWS)
    command = CMD_WINDOWS;
  else if (cmd == COMMAND_GERMLINES)
    command = CMD_GERMLINES;
653 654
  else if (cmd == "segment")
    return app.exit(CLI::ConstructionError("'-c segment' is deprecated, please use '-c designations'", 1));
Mathieu Giraud's avatar
Mathieu Giraud committed
655
  else {
656
    return app.exit(CLI::ConstructionError("Unknown command " + cmd, 1));
Mathieu Giraud's avatar
Mathieu Giraud committed
657
  }
658
  if (deprecated) return 1 ;
659

Mathieu Giraud's avatar
Mathieu Giraud committed
660 661 662
  list <string> f_reps_V(v_reps_V.begin(), v_reps_V.end());
  list <string> f_reps_D(v_reps_D.begin(), v_reps_D.end());
  list <string> f_reps_J(v_reps_J.begin(), v_reps_J.end());
663

664

665 666 667
  list <pair <string, string>> multi_germline_paths_and_files ;
  bool multi_germline = false;

Mathieu Giraud's avatar
Mathieu Giraud committed
668 669 670 671 672 673 674 675 676 677 678
  for (string arg: multi_germlines)
    {
      multi_germline = true;

      struct stat buffer;
      if (stat(arg.c_str(), &buffer) == 0)
        {
          if( buffer.st_mode & S_IFDIR )
            {
              // argument is a directory
              multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
679 680
              continue ;
            }
Mathieu Giraud's avatar
Mathieu Giraud committed
681 682 683 684 685 686 687
        }

      // argument is not a directory (and basename can include ':' with a filter)
      multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
    }


688
  if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
689
    {
690
      return app.exit(CLI::ConstructionError("At least one germline must be given with -g or -V/(-D)/-J.", 1));
691 692
    }

693 694
  if (options_s_k > 1)
    {
695
      return app.exit(CLI::ConstructionError("Use at most one -s or -k option.", 1));
696 697
    }

698 699
  map <string, string> windows_labels ;

700
  for(string lab : windows_labels_explicit)
Mathieu Giraud's avatar
Mathieu Giraud committed
701
    windows_labels[lab] = string("--label");
702
  
Mikaël Salson's avatar
Mikaël Salson committed
703 704
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
705 706 707
  if (verbose)
    cout << "# verbose " << verbose << endl ;

708
  if (f_reads == DEFAULT_READS)
Mikaël Salson's avatar
Mikaël Salson committed
709 710 711
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
712

713
  size_t min_cover_representative = (size_t) min(min_reads_clone, DEFAULT_MIN_COVER_REPRESENTATIVE);
714

Mathieu Giraud's avatar
Mathieu Giraud committed
715 716 717
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
718
      return app.exit(CLI::ConstructionError("Seed size is too large (MAX_SEED_SIZE).", 1));
Mathieu Giraud's avatar
Mathieu Giraud committed
719
    }
720

Mathieu Giraud's avatar
Mathieu Giraud committed
721
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
722
    {
723
      return app.exit(CLI::ConstructionError("Too small -w. The window size should be positive.", 1));
724 725
    }

Mikaël Salson's avatar
Mikaël Salson committed
726 727 728 729
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
730
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
731
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
732 733
  }

Mikaël Salson's avatar
Mikaël Salson committed
734 735
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
736
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
737
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
738 739
  }

740 741 742 743 744
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
745 746 747
  out_dir += "/" ;

  /// Load labels ;
748
  load_into_map(windows_labels, windows_labels_file, "-l");
Mathieu Giraud's avatar
Mathieu Giraud committed
749
  json j_labels = load_into_map_from_json(windows_labels, windows_labels_json);
Mikaël Salson's avatar
Mikaël Salson committed
750 751

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
752
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
753
    break;
754
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
755 756 757
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
758 759
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
760 761
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
762
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
763
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
764
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
765
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
766
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
767

768 769
  // Dump configuration
  json j_config = json::parse(app.config_to_str(true, true));
Mathieu Giraud's avatar
Mathieu Giraud committed
770 771
  if (!j_labels.empty())
    j_config["labels"] = j_labels;
772

Mikaël Salson's avatar
Mikaël Salson committed
773 774 775 776 777 778 779 780 781 782 783
  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
784
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
785 786


787 788
  //////////////////////////////////
  // Warning for non-optimal use
789

790
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
791 792
    {
      cout << endl
793
	   << "* WARNING: " <<