vidjil.cpp 55.6 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
  This file is part of Vidjil-algo <http://www.vidjil.org>
  Copyright (C) 2011-2018 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9

10
  "Vidjil-algo" is free software: you can redistribute it and/or modify
Mikaël Salson's avatar
Mikaël Salson committed
11
12
13
14
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

15
  "Vidjil-algo" is distributed in the hope that it will be useful,
Mikaël Salson's avatar
Mikaël Salson committed
16
17
18
19
20
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
21
  along with "Vidjil-algo". If not, see <http://www.gnu.org/licenses/>
Mikaël Salson's avatar
Mikaël Salson committed
22
23
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
30
31
32
33
34
35
36
37

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

38
#include "core/check-compiler.h"
Mikaël Salson's avatar
Mikaël Salson committed
39
#include "core/tools.h"
WebTogz's avatar
WebTogz committed
40
#include "core/json.h"
41
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
42
43
#include "core/kmerstore.h"
#include "core/fasta.h"
44
#include "core/bioreader.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
45
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
46
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
47
48
49
50
51
52
53
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
54
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
55
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
56

Mathieu Giraud's avatar
Mathieu Giraud committed
57
#include "lib/CLI11.hpp"
58
59
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
60
61
62
63
64
65
66
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

67
68
69
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

70
#define PROGNAME "vidjil-algo"
71
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
72

Mathieu Giraud's avatar
Mathieu Giraud committed
73
74
//$$ #define (mainly default options)

75
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
76
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
Mikaël Salson's avatar
Mikaël Salson committed
77

78
#define DEFAULT_READ_HEADER_SEPARATOR " "
79
#define DEFAULT_READS  "./demo/Stanford_S22.fasta"
80
#define DEFAULT_MIN_READS_CLONE 5
81
#define DEFAULT_MAX_REPRESENTATIVES 100
82
#define DEFAULT_MAX_CLONES 100
83
#define DEFAULT_RATIO_READS_CLONE 0.0
84
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
85

Mathieu Giraud's avatar
Mathieu Giraud committed
86
#define COMMAND_WINDOWS "windows"
87
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
88
#define COMMAND_SEGMENT "segment"
89
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
90
 
91
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
92

93
94
95
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
96
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
97
#define CLONE_FILENAME "clone.fa-"
98
99
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
100
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
101
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
102
#define AFFECTS_FILENAME ".affects"
103
104
105
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
106

107
#define DEFAULT_K      0
108
#define DEFAULT_W      50
109
#define DEFAULT_SEED   DEFAULT_GERMLINE_SEED
Mikaël Salson's avatar
Mikaël Salson committed
110

Mikaël Salson's avatar
Mikaël Salson committed
111
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mathieu Giraud's avatar
Mathieu Giraud committed
114
115
#define DEFAULT_KMER_THRESHOLD NO_LIMIT_VALUE

Mikaël Salson's avatar
Mikaël Salson committed
116
117
118
119
120
121
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

122
#define DEFAULT_TRIM 0
Mikaël Salson's avatar
Mikaël Salson committed
123

124
#define MAX_CLONES_FOR_SIMILARITY 20
125

126
// warn
127
#define WARN_MAX_CLONES 100
128
#define WARN_PERCENT_SEGMENTED 40
129
#define WARN_COVERAGE 0.6
130
#define WARN_NUM_CLONES_SIMILAR 10
131

Mikaël Salson's avatar
Mikaël Salson committed
132
133
134
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
135

Mikaël Salson's avatar
Mikaël Salson committed
136
137

using namespace std ;
138
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
139

Mathieu Giraud's avatar
Mathieu Giraud committed
140
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
141

Mathieu Giraud's avatar
Mathieu Giraud committed
142
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
143
144
145

extern int optind, optopt, opterr;

146
int usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
147
{
Mathieu Giraud's avatar
Mathieu Giraud committed
148
  cout
Mikaël Salson's avatar
Mikaël Salson committed
149
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
150
       << "Examples (see doc/algo.org)" << endl
151
152
153
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -r 1  demo/Demo-X5.fa           # (basic usage, detect the locus for each read," << endl
       << "                                                                                               #  cluster reads and report clones starting from the first read (-r 1)," << endl
       << "                                                                                               #  including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
154
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g:IGH    -3     demo/Stanford_S22.fasta   # (restrict to complete recombinations on the IGH locus)" << endl
155
156
157
       << "  " << progname << " -c clones   -g germline/homo-sapiens.g   -2 -3 -z 20 demo/LIL-L4.fastq.gz      # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
       << "  " << progname << " -c windows  -g germline/homo-sapiens.g   -y 0 -uu -U demo/LIL-L4.fastq.gz      # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
       << "  " << progname << " -c segment  -g germline/homo-sapiens.g   -2 -3 -X 50 demo/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing, here on 50 sampled reads)" << endl
158
       << "  " << progname << " -c germlines -g germline/homo-sapiens.g              demo/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
159
    ;
160
  return 1;
Mikaël Salson's avatar
Mikaël Salson committed
161
162
}

Mathieu Giraud's avatar
Mathieu Giraud committed
163
164
165

int atoi_NO_LIMIT(char *optarg)
{
166
167
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
168
double atof_NO_LIMIT(char *optarg)
169
170
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
171
172
}

173
174
175
176
177
178
179
180
181
182
string string_NO_LIMIT(string s)
{
  if (!strcmp(NO_LIMIT, s.c_str()))
    return NO_LIMIT_VALUE_STRING ;

  return s;
}



Mikaël Salson's avatar
Mikaël Salson committed
183
184
int main (int argc, char **argv)
{
185
  cout << "# " << PROGNAME << " -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
186
       << "# Copyright (C) 2011-2018 by the Vidjil team" << endl
187
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
188
       << endl
189
       << "# " << PROGNAME << " is free software, and you are welcome to redistribute it" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
190
191
192
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
193
       << endl
194
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use " << PROGNAME << "." << endl
Mikaël Salson's avatar
Mikaël Salson committed
195
196
       << endl ;

197
198
199
  //////////////////////////////////
  // Display version information or git log

200
201
  string soft_version = PROGNAME ;
  soft_version += " " ;
202
#ifdef RELEASE_TAG
203
  cout << "# version: " PROGNAME << " " << RELEASE_TAG << endl ;
204
205
206
207
208
209
210
211
212
213
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
214
215
  CLI::App app{"Vidjil"};

Mathieu Giraud's avatar
Mathieu Giraud committed
216
217
  //$$ options: defaults

Mathieu Giraud's avatar
Mathieu Giraud committed
218
219
220
221
  vector <string> v_reps_V ;
  vector <string> v_reps_D ;
  vector <string> v_reps_J ;

222
  list <pair <string, string>> multi_germline_paths_and_files ;
223

224
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
225
226
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
227
  bool seed_changed = false;
228
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
229

230
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
231
232
233
  
  string comp_filename = COMP_FILENAME;

234
  int wmer_size = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
235

236
  IndexTypes indexType = KMER_INDEX;
237

Mikaël Salson's avatar
Mikaël Salson committed
238
239
240
241
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
242
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
243
  
Mathieu Giraud's avatar
Mathieu Giraud committed
244
245
  bool save_comp = false;
  bool load_comp = false;
Mikaël Salson's avatar
Mikaël Salson committed
246
247
  
  int verbose = 0 ;
248
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
249

250
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
251
252
253
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
254
255
  // int average_deletion = 4;     // Average number of deletion in V or J

256
257
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
258

Mathieu Giraud's avatar
Mathieu Giraud committed
259
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
260
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
261

Mikaël Salson's avatar
Mikaël Salson committed
262
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
263

264
265
  bool trim_sequences_changed = false;
  
Mikaël Salson's avatar
Mikaël Salson committed
266
  bool output_sequences_by_cluster = false;
267
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
268
  bool output_unsegmented = false;
269
  bool output_unsegmented_detail = false;
270
  bool output_unsegmented_detail_full = false;
271
  bool output_affects = false;
272
273
  bool keep_unsegmented_as_clone = false;

274
275
  bool several_D = false;

Mathieu Giraud's avatar
Mathieu Giraud committed
276
  vector <string> multi_germlines ;
277
  bool multi_germline = false;
278
  bool multi_germline_mark = false;
279
  bool multi_germline_one_unique_index = false;
280
281
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
282

Mikaël Salson's avatar
Mikaël Salson committed
283
284
  string forced_edges = "" ;

285
  map <string, string> windows_labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
286
  string windows_labels_file = "" ;
287
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
288
289
290

  char c ;

291
292
  int options_s_k = 0 ;

293
  double expected_value = THRESHOLD_NB_EXPECTED;
294
  double expected_value_D = THRESHOLD_NB_EXPECTED_D;
295

296
297
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
298
  bool jsonLevenshteinComputed = false ;
WebTogz's avatar
WebTogz committed
299

Mathieu Giraud's avatar
Mathieu Giraud committed
300
301
  int kmer_threshold = DEFAULT_KMER_THRESHOLD;

Mathieu Giraud's avatar
Mathieu Giraud committed
302
303
  //$$ options: definition wiht CLI11
  string group = "";
Mathieu Giraud's avatar
Mathieu Giraud committed
304

Mathieu Giraud's avatar
Mathieu Giraud committed
305
  // cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
306

Mathieu Giraud's avatar
Mathieu Giraud committed
307
  group = "Files";
308
  app.add_option("reads", f_reads, "reads.fa/.fq/.gz") -> group(group) -> set_type_name("FILE");
309

310

Mathieu Giraud's avatar
Mathieu Giraud committed
311
  group = "Command selection";
Mathieu Giraud's avatar
Mathieu Giraud committed
312
313
314
315
316
317
318
  string cmd = COMMAND_CLONES;
  app.add_option("-c", cmd, "command"
                 "\n  \t\t" COMMAND_CLONES    "  \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
                 "\n  \t\t" COMMAND_WINDOWS   "  \t locus detection, window extraction"
                 "\n  \t\t" COMMAND_SEGMENT   "  \t detailed V(D)J designation (not recommended)"
                 "\n  \t\t" COMMAND_GERMLINES "  \t statistics on k-mers in different germlines")
    -> group(group) -> set_type_name("COMMAND");
Mikaël Salson's avatar
Mikaël Salson committed
319

Mathieu Giraud's avatar
Mathieu Giraud committed
320
  group = "Input" ;
321
  app.add_option("--separator", read_header_separator, "separator for headers in the reads file", true) -> group(group) -> level() -> set_type_name("CHAR")  ;
Mikaël Salson's avatar
Mikaël Salson committed
322

Mathieu Giraud's avatar
Mathieu Giraud committed
323
324
  
  group = "Germline presets (at least one -g or -V/(-D)/-J option must be given for all commands except -c " COMMAND_GERMLINES ")";
Mathieu Giraud's avatar
Mathieu Giraud committed
325
326
327
328
329
330
331
332
333
  app.add_option("-g", multi_germlines, R"Z(
         -g <.g file>(:filter)
                    multiple locus/germlines, with tuned parameters.
                    Common values are '-g germline/homo-sapiens.g' or '-g germline/mus-musculus.g'
                    The list of locus/recombinations can be restricted, such as in '-g germline/homo-sapiens.g:IGH,IGK,IGL'
         -g <path>
                    multiple locus/germlines, shortcut for '-g <path>/)Z" DEFAULT_MULTI_GERMLINE_FILE R"Z(',
                    processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombination
)Z") -> group(group) -> expected(1) -> set_type_name("GERMLINES");
334

335
336
337
  app.add_option("-V", v_reps_V, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
  app.add_option("-D", v_reps_D, "custom D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components") -> group(group) -> set_type_name("FILE");
  app.add_option("-J", v_reps_J, "custom V germline multi-fasta file") -> group(group) -> set_type_name("FILE");
338

Mathieu Giraud's avatar
Mathieu Giraud committed
339
340
  group = "Locus/recombinations";
  app.add_flag("-d", several_D, "try to detect several D (experimental)") -> group(group);
341
  app.add_flag("-2",  multi_germline_unexpected_recombinations_12, "try to detect unexpected recombinations (must be used with -g)") -> group(group);
342

343

Mathieu Giraud's avatar
Mathieu Giraud committed
344
  group = "Experimental options (do not use)";
345
  app.add_flag("-I", multi_germline_mark, "ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
346
  app.add_flag("-1", multi_germline_one_unique_index,
347
348
349
               "use a unique index for all germline systems (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("-4", multi_germline_unexpected_recombinations_1U, "try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)") -> group(group) -> level();
  app.add_flag("--keep", keep_unsegmented_as_clone, "keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --keep -AX 20)") -> group(group) -> level();
350

Mathieu Giraud's avatar
Mathieu Giraud committed
351
352
353
354
355
356
357
  group = "Window prediction";
#ifndef NO_SPACED_SEEDS
  /*
       << "  (use either -s or -k option, but not both)" << endl
           << "  (all these options, except -w, are overriden when using -g)" << endl
  */
#endif
Mikaël Salson's avatar
Mikaël Salson committed
358

Mathieu Giraud's avatar
Mathieu Giraud committed
359
360
361
362
363
364
365
366
367
  int kmer_size = 0; // TODO: vierer
  app.add_option("-k", kmer_size, "k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)") -> group(group);
  // seed = seed_contiguous(kmer_size);
  //        seed_changed = true;
  //	options_s_k++ ;
    
#ifndef NO_SPACED_SEEDS
  //      << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
368
369
370
371
  app.add_option("-w", wmer_size,
                 "w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-e", expected_value,
                 "maximal e-value for determining if a V-J segmentation can be trusted", true) -> group(group) -> transform(string_NO_LIMIT);
Mathieu Giraud's avatar
Mathieu Giraud committed
372
373
  app.add_option("-t", trim_sequences, // trim_sequences_changed = true
                 "trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt  (0: no trim)") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
374

375
376
377
378
379
380
381
382
383
384
  app.add_option("-s",
                 [&](CLI::results_t res) {
                   seed = res[0] ;
                   options_s_k++ ;
                   seed_changed = true;
                   return true;
                 },
                 "spaced seeds used for the V/J affectation (default: depends on germline)"
                 ) -> group(group) -> level();

Mathieu Giraud's avatar
Mathieu Giraud committed
385
/*
386
#ifdef NO_SPACED_SEEDS
387
388
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
Mathieu Giraud's avatar
Mathieu Giraud committed
389
390
*/
  
Mikaël Salson's avatar
Mikaël Salson committed
391

392

393

Mathieu Giraud's avatar
Mathieu Giraud committed
394
395
396
397
398
399
  group = "Labeled sequences (windows related to these sequences will be kept even if -r/-% thresholds are not reached)";
  /*
  app.add_option("-W",
                 // TODO:  windows_labels[string(optarg)] = string("-W");
                 , "label the given sequence") -> group(group);
  */
400
  app.add_option("-l", windows_labels_file, "label a set of sequences given in <file>") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
401
  app.add_flag("-F", only_labeled_windows, "filter -- keep only the windows related to the labeled sequences") -> group(group);
Mikaël Salson's avatar
Mikaël Salson committed
402
403


Mathieu Giraud's avatar
Mathieu Giraud committed
404
405
  //  if (advanced)
  group = "Fine segmentation options (second pass)";
406
407
408


  //       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, del_end, homo\" (default "<< DEFAULT_SEGMENT_COST <<" )"<< endl
Mathieu Giraud's avatar
Mathieu Giraud committed
409
  //      case 'f''	segment_cost=strToCost(optarg, VDJ);      break;
410
  app.add_option("-E", expected_value_D, "maximal e-value for determining if a D segment can be trusted", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
411

412
413
  app.add_option("-Z", kmer_threshold, "typical number of V genes, selected by k-mer comparison, to compare to the read ('" NO_LIMIT "': all genes, default)", true) -> group(group)
    -> transform(string_NO_LIMIT) -> level() -> set_type_name("NB") ;
Mathieu Giraud's avatar
Mathieu Giraud committed
414
415
  
  group = "Clone analysis (second pass)";
416
  app.add_flag("-3,--cdr3", detect_CDR3, "CDR3/JUNCTION detection (requires gapped V/J germlines)") -> group(group);
417

Mathieu Giraud's avatar
Mathieu Giraud committed
418
419
420
421
422
  
  // if (advanced)
  group = "Additional clustering (experimental)" ;

  app.add_flag_function("-q", [&](size_t n) { indexType = AC_AUTOMATON; });
423
  app.add_option("-n", epsilon, "minimum required neighbors for automatic clustering. No automatic clusterisation if =0.", true) -> group(group) -> level();
Mathieu Giraud's avatar
Mathieu Giraud committed
424

425
426
427
428
  app.add_option("-N", minPts, "", true) -> group(group) -> level();
  app.add_option("-S", save_comp, "generate and save comparative matrix for clustering") -> group(group) -> level();
  app.add_option("-L", load_comp, "load comparative matrix for clustering") -> group(group) -> level();
  app.add_option("--forced-edges", forced_edges, "manual clustering -- a file used to force some specific edges") -> group(group) -> level();
429
  // << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, del_end, homo\" (default "<< DEFAULT_CLUSTER_COST <<" )"<< endl
Mathieu Giraud's avatar
Mathieu Giraud committed
430
431
432
433
434
435
436
  // cluster_cost=strToCost(optarg, Cluster);

  group = "Limits to report a clone (or a window)";
  app.add_option("-r", min_reads_clone, "minimal number of reads supporting a clone", true) -> group(group);
  app.add_option("--ratio", ratio_reads_clone, "minimal percentage of reads supporting a clone", true) -> group(group);

  group = "Limits to further analyze some clones";
437
438
  app.add_option("-y", max_representatives,
                 "maximal number of clones computed with a consensus sequence ('" NO_LIMIT "': no limit)", true) -> group(group) -> transform(string_NO_LIMIT);
Mathieu Giraud's avatar
Mathieu Giraud committed
439
440
441
442
443
444
445
  app.add_option("-z", max_clones, //
                 // [](CLI::results_t res) {
                 //    max_clones = atoi_NO_LIMIT(optarg);
                 //    if ((max_representatives < max_clones) && (max_representatives != NO_LIMIT_VALUE))
                 //    max_representatives = max_clones ;
                 // }
                 "maximal number of clones to be analyzed with a full V(D)J designation ('" NO_LIMIT "': no limit, do not use)", true)-> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
446
447
448
449
450
451
452
453

  app.add_flag_function("-A", [&](size_t n) {
      ratio_reads_clone = 0 ;
      min_reads_clone = 1 ;
      max_representatives = NO_LIMIT_VALUE ;
      max_clones = NO_LIMIT_VALUE ;
    },
    "reports and segments all clones (-r 0 -% 0 -y " NO_LIMIT " -z " NO_LIMIT "), to be used only on very small datasets (for example -AX 20)") -> group(group);
454

455
456
457
458
  app.add_option("-x", max_reads_processed_sample,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), only first reads") -> group(group) -> transform(string_NO_LIMIT);
  app.add_option("-X", max_reads_processed,
                 "maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads") -> group(group) -> transform(string_NO_LIMIT);
459

Mathieu Giraud's avatar
Mathieu Giraud committed
460
461
462
  
  group = "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)";
  app.add_flag("-U", output_segmented, "output segmented reads (in " SEGMENTED_FILENAME " file)") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
463
464
465
466
467
468
469
470
471
472
  app.add_flag_function("-u", [&](size_t n) {
      output_unsegmented = output_unsegmented_detail_full ;       // -uuu
      output_unsegmented_detail_full = output_unsegmented_detail; // -uu
      output_unsegmented_detail = true;                           // -u
    }, R"Z(
        -u          output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files)
        -uu         output unsegmented reads, gathered by unsegmentation cause, all reads (in *)Z" UNSEGMENTED_DETAIL_FILENAME R"Z( files) (use only for debug)
        -uuu        output unsegmented reads, all reads, including a )Z" UNSEGMENTED_FILENAME R"Z( file (use only for debug)
)Z") -> group(group);

Mathieu Giraud's avatar
Mathieu Giraud committed
473
  app.add_flag("-K", output_affects, "output detailed k-mer affectation on all reads (in " AFFECTS_FILENAME " file) (use only for debug, for example -KX 100)") -> group(group);
474

475

Mathieu Giraud's avatar
Mathieu Giraud committed
476
  group = "Output";
477
478
  app.add_option("-o", out_dir, "output directory", true) -> group(group) -> set_type_name("PATH");
  app.add_option("-b", f_basename, "output basename (by default basename of the input file)") -> group(group) -> set_type_name("FILE");
Mathieu Giraud's avatar
Mathieu Giraud committed
479
  app.add_flag("-a", output_sequences_by_cluster, "output all sequences by cluster (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);
Mathieu Giraud's avatar
Mathieu Giraud committed
480
  app.add_flag_function("-v", [&](size_t n) { verbose += n ; }, "verbose mode") -> group(group);
481

482

483
484
485
486
487
  app.add_flag_function("-H", [&](size_t n) { throw CLI::CallForAdvancedHelp() ; },
                        "help, including advanced and experimental options"
                        "\n                              "
                        "The full help is available in the doc/algo.org file."
                        );
488

Mikaël Salson's avatar
Mikaël Salson committed
489
490


Mathieu Giraud's avatar
Mathieu Giraud committed
491
492
  //$$ options: parsing
  CLI11_PARSE(app, argc, argv);
Mathieu Giraud's avatar
Mathieu Giraud committed
493

Mathieu Giraud's avatar
Mathieu Giraud committed
494
  //$$ options: post-processing+display
Mikaël Salson's avatar
Mikaël Salson committed
495

Mathieu Giraud's avatar
Mathieu Giraud committed
496
497
498
499
500
501
502
503
504
505
506
507
  if (cmd == COMMAND_CLONES)
    command = CMD_CLONES;
  else if (cmd == COMMAND_SEGMENT)
    command = CMD_SEGMENT;
  else if (cmd == COMMAND_WINDOWS)
    command = CMD_WINDOWS;
  else if (cmd == COMMAND_GERMLINES)
    command = CMD_GERMLINES;
  else {
    cerr << "Unknwown command " << optarg << endl;
    usage(argv[0], false);
  }
508

Mathieu Giraud's avatar
Mathieu Giraud committed
509
510
511
  list <string> f_reps_V(v_reps_V.begin(), v_reps_V.end());
  list <string> f_reps_D(v_reps_D.begin(), v_reps_D.end());
  list <string> f_reps_J(v_reps_J.begin(), v_reps_J.end());
512

513

Mathieu Giraud's avatar
Mathieu Giraud committed
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
  for (string arg: multi_germlines)
    {
      multi_germline = true;

      struct stat buffer;
      if (stat(arg.c_str(), &buffer) == 0)
        {
          if( buffer.st_mode & S_IFDIR )
            {
              // argument is a directory
              multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
              break ;
                }
        }

      // argument is not a directory (and basename can include ':' with a filter)
      multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
    }


534
  if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
535
    {
536
      cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
537
      return 1;
538
539
    }

540
541
  if (options_s_k > 1)
    {
542
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
543
      return 1;
544
545
    }

Mikaël Salson's avatar
Mikaël Salson committed
546
547
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
548
549
550
  if (verbose)
    cout << "# verbose " << verbose << endl ;

551
  if (f_reads == DEFAULT_READS)
Mikaël Salson's avatar
Mikaël Salson committed
552
553
554
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
555

556
557
558
559
560
  //  else
  //  {
  //    cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
  //    return 1;
  //  }
Mathieu Giraud's avatar
Mathieu Giraud committed
561

562
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
563

564
565
  // Default seeds

566
#ifdef NO_SPACED_SEEDS
567
  if (! seed_changed)
568
    {
569
      cerr << ERROR_STRING << PROGNAME << " was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
570
      return 1;
571
572
573
574
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
575
576
577
578
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
579
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
580
      return 1;
Mathieu Giraud's avatar
Mathieu Giraud committed
581
582
583
    }
#endif

584

Mathieu Giraud's avatar
Mathieu Giraud committed
585
  if ((wmer_size< 0) && (wmer_size!= NO_LIMIT_VALUE))
586
    {
587
      cerr << ERROR_STRING << "Too small -w. The window size should be positive" << endl;
588
      return 1;
589
590
    }

Mikaël Salson's avatar
Mikaël Salson committed
591
592
593
594
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
595
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
596
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
597
598
  }

Mikaël Salson's avatar
Mikaël Salson committed
599
600
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
601
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
602
    return 2;
Mikaël Salson's avatar
Mikaël Salson committed
603
604
  }

605
606
607
608
609
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
610
611
612
  out_dir += "/" ;

  /// Load labels ;
613
  load_into_map(windows_labels, windows_labels_file, "-l");
Mikaël Salson's avatar
Mikaël Salson committed
614
615

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
616
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
617
    break;
618
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
619
620
621
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
622
623
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
624
625
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
626
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
627
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
628
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
629
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
630
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
631
632
633
634
635
636
637
638
639
640
641
642

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
643
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
644
645


646
647
  //////////////////////////////////
  // Warning for non-optimal use
648

649
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
650
651
    {
      cout << endl
652
	   << "* WARNING: " << PROGNAME << " was run with '-A' option or with a large '-z' option" << endl ;
653
654
655
656
657
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
658
	   << "* WARNING: " << PROGNAME << " was run with '-c segment' option" << endl ;
659
660
    }
  
661
  if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
662
    {
663
      cout << "* " << PROGNAME << " efficientl extracts windows overlapping the CDR3" << endl
664
           << "* to cluster reads into clones ('-c clones')." << endl
665
666
           << "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
           << "* is slow and should be done only on small datasets or for testing purposes." << endl
667
668
669
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
670

671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692

  /////////////////////////////////////////
  //            JSON OUTPUT              //
  /////////////////////////////////////////

  string f_json = out_dir + f_basename + JSON_SUFFIX ;

  ostringstream stream_cmdline;
  for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";

  json j = {
    {"vidjil_json_version", VIDJIL_JSON_VERSION},
    {"samples", {
        {"number", 1},
        {"original_names", {f_reads}},
        {"run_timestamp", {time_buffer}},
        {"producer", {soft_version}},
        {"commandline", {stream_cmdline.str()}}
      }}
  };


693
694
695
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
696

697
698
699
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
700
      multi_germline_one_unique_index = true ;
701
702
    }

703
  MultiGermline *multigermline = new MultiGermline(indexType, !multi_germline_one_unique_index);
704

705
    {
706
      cout << "Load germlines and build Kmer indexes" << endl ;
707
708
709
    
      if (multi_germline)
	{
710
          for (pair <string, string> path_file: multi_germline_paths_and_files)
711
712
            {
              try {
713
                multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR,
714
                                               FIRST_IF_UNCHANGED("", seed, seed_changed),
715
                                               FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
716
              } catch (std::exception& e) {
717
                cerr << ERROR_STRING << PROGNAME << " cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
718
                delete multigermline;
719
                return 1;
720
721
              }
            }
722
723
724
725
726
	}
      else
	{
	  // Custom germline
	  Germline *germline;
727
	  germline = new Germline("custom", 'X',
728
729
                                  f_reps_V, f_reps_D, f_reps_J,
                                  seed, trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
730

731
          germline->new_index(indexType);
732

733
734
735
	  multigermline->insert(germline);
	}
    }
736

737
    cout << endl ;
738

739
    if (multi_germline_one_unique_index) {
740
      multigermline->build_with_one_index(seed, true);
741
    }
742

743
      if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
744
745
746
        if (!multigermline->index) {
          multigermline->build_with_one_index(seed, false);
        }
747
      }
748

749
      if (multi_germline_unexpected_recombinations_12) {
750
        Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
751
        pseudo->seg_method = SEG_METHOD_MAX12 ;
752
        pseudo->set_index(multigermline->index);
753
        multigermline->germlines.push_back(pseudo);
754
755
756
      }

      if (multi_germline_unexpected_recombinations_1U) {
757
        Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, "", trim_sequences, (kmer_threshold != NO_LIMIT_VALUE));
758
        pseudo_u->seg_method = SEG_METHOD_MAX1U ;
759
        // TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
760
        pseudo_u->set_index(multigermline->index);
761
        multigermline->germlines.push_back(pseudo_u);
762
763
    }

764
      // Should come after the initialization of regular (and possibly pseudo) germlines
765
    {
766
      for (pair <string, string> path_file: multi_germline_paths_and_files)
767
        multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE,
768
                                       FIRST_IF_UNCHANGED("", seed, seed_changed),
769
                                       FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed), (kmer_threshold != NO_LIMIT_VALUE));
770
      if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
771
772
        multigermline->insert_in_one_index(multigermline->index, true);
      }
773
774
    }

775
776
    if (multi_germline_mark)
      multigermline->mark_cross_germlines_as_ambiguous();
777
778

    multigermline->finish();
779
    cout << "Germlines loaded: " ;
780
781
    cout << *multigermline ;
    cout << endl ;
782
783

    // Number of reads for e-value computation
784
    unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
785

786
    
787
788
  //////////////////////////////////
  //$$ Read sequence files
789
790
791
792

    int only_nth_read = 1 ;
    if (max_reads_processed_sample != NO_LIMIT_VALUE)
      {
793
        only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
794
795
796
        if (only_nth_read == 0)
          only_nth_read = 1 ;

797
        max_reads_processed = max_reads_processed_sample ;
798
799
800
801
802

        if (only_nth_read > 1)
          cout << "Processing every " << only_nth_read
               << (only_nth_read == 2 ? "nd" : (only_nth_read == 3 ? "rd" : "th"))
               << " read" << endl ;
803
804
      }

805
  OnlineBioReader *reads;
806
807

  try {
808
    reads = OnlineBioReaderFactory::create(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
809
  } catch (const invalid_argument e) {
810
    cerr << ERROR_STRING << PROGNAME << " cannot open reads file " << f_reads << ": " << e.what() << endl;
811
    return 1;
812
813
814
815
816
  }

  out_dir += "/";


817
818
819
820
821
  //////////////////////////////://////////
  //         DISCOVER GERMLINES          //
  /////////////////////////////////////////
  if (command == CMD_GERMLINES)
    {
822
      map <char, int> stats_kmer, stats_max;
823
      IKmerStore<KmerAffect> *index = multigermline->index ;
824
825

      // Initialize statistics, with two additional categories
826
827
      index->labels.push_back(make_pair(KmerAffect::getAmbiguous(), BIOREADER_AMBIGUOUS));
      index->labels.push_back(make_pair(KmerAffect::getUnknown(), BIOREADER_UNKNOWN));
828
      
829
      for (list< pair <KmerAffect, BioReader> >::const_iterator it = index->labels.begin(); it != index->labels.end(); ++it)
830
	{
831
832
833
	  char key = affect_char(it->first.affect) ;
	  stats_kmer[key] = 0 ;
	  stats_max[key] = 0 ;
834
835
	}
      
836
      // init forbidden for .max()
837
838
839
      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());
840
      
841
842
843
      // Loop through all reads

      int nb_reads = 0 ;
844
845
846
      int total_length = 0 ;
      int s = index->getS();

847
848
      int kmer_size = seed_weight(seed);

849
850
851
852
853
      while (reads->hasNext())
	{
	  reads->next();
	  nb_reads++;
	  string seq = reads->getSequence().sequence;
854
	  total_length += seq.length() - s + 1;
855

856
	  KmerAffectAnalyser *kaa = new KmerAffectAnalyser(*index, seq);
857
858
859

	  for (int i = 0; i < kaa->count(); i++) 
	    { 
860
861
	      KmerAffect ksa = kaa->getAffectation(i);
	      stats_kmer[affect_char(ksa.affect)]++ ;
862
	    }
863

864
          delete kaa;
865

866
	  CountKmerAffectAnalyser ckaa(*index, seq);
867
	  ckaa.setAllowedOverlap(kmer_size-1);
868

869
	  stats_max[affect_char(ckaa.max(forbidden).affect)]++ ;
870

871
872
	}