vidjil.cpp 51.4 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
3
  Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016 by Bonsai bioinformatics
4 5 6 7 8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

Mathieu Giraud's avatar
Mathieu Giraud committed
24
//$$ #include
Mikaël Salson's avatar
Mikaël Salson committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38

#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "core/tools.h"
WebTogz's avatar
WebTogz committed
39
#include "core/json.h"
40
#include "core/germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
41 42 43
#include "core/kmerstore.h"
#include "core/fasta.h"
#include "core/segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
44
#include "core/windows.h"
Mikaël Salson's avatar
Mikaël Salson committed
45 46 47 48 49 50 51
#include "core/cluster-junctions.h"
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/compare-all.h"
#include "core/mkdir.h"
#include "core/labels.h"
Mikaël Salson's avatar
Mikaël Salson committed
52
#include "core/list_utils.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
53
#include "core/windowExtractor.h"
Mikaël Salson's avatar
Mikaël Salson committed
54

55 56
#include "lib/json.hpp"

Mikaël Salson's avatar
Mikaël Salson committed
57 58 59 60 61 62 63
#include "vidjil.h"

// RELEASE_TAG may be defined in the "release.h" file.
// If RELEASE_TAG is undefined, the version will be the git hash.
// #define RELEASE_TAG  "2013.04"
#include "release.h"

64 65 66
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"

67
#define VIDJIL_JSON_VERSION "2016b"
Mikaël Salson's avatar
Mikaël Salson committed
68

Mathieu Giraud's avatar
Mathieu Giraud committed
69 70
//$$ #define (mainly default options)

71 72
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
#define DEFAULT_MULTI_GERMLINE_FILE "germlines.data"
Mikaël Salson's avatar
Mikaël Salson committed
73

74
#define DEFAULT_READ_HEADER_SEPARATOR " "
Mathieu Giraud's avatar
Mathieu Giraud committed
75
#define DEFAULT_READS  "./data/Stanford_S22.fasta"
76
#define DEFAULT_MIN_READS_CLONE 5
77
#define DEFAULT_MAX_REPRESENTATIVES 100
78
#define DEFAULT_MAX_CLONES 100
79
#define DEFAULT_RATIO_READS_CLONE 0.0
80
#define NO_LIMIT "all"
Mikaël Salson's avatar
Mikaël Salson committed
81

Mathieu Giraud's avatar
Mathieu Giraud committed
82
#define COMMAND_WINDOWS "windows"
83
#define COMMAND_CLONES "clones"
Mikaël Salson's avatar
Mikaël Salson committed
84
#define COMMAND_SEGMENT "segment"
85
#define COMMAND_GERMLINES "germlines"
Mikaël Salson's avatar
Mikaël Salson committed
86
 
87
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
Mikaël Salson's avatar
Mikaël Salson committed
88

89 90 91
#define DEFAULT_OUT_DIR "./out/" 

// Fixed filenames/suffixes
92
#define CLONES_FILENAME ".vdj.fa"
Mikaël Salson's avatar
Mikaël Salson committed
93
#define CLONE_FILENAME "clone.fa-"
94 95
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
96
#define UNSEGMENTED_FILENAME ".unsegmented.vdj.fa"
97
#define UNSEGMENTED_DETAIL_FILENAME ".fa"
98
#define AFFECTS_FILENAME ".affects"
99 100 101
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
Mikaël Salson's avatar
Mikaël Salson committed
102 103 104

// "tests/data/leukemia.fa" 

105
#define DEFAULT_K      0
106
#define DEFAULT_W      50
107
#define DEFAULT_SEED   ""
Mikaël Salson's avatar
Mikaël Salson committed
108 109 110 111 112

#define DEFAULT_DELTA_MIN  -10

#define DEFAULT_DELTA_MIN_D  0

Mikaël Salson's avatar
Mikaël Salson committed
113
#define DEFAULT_MAX_AUDITIONED 2000
Mathieu Giraud's avatar
Mathieu Giraud committed
114 115
#define DEFAULT_RATIO_REPRESENTATIVE 0.5

Mikaël Salson's avatar
Mikaël Salson committed
116 117 118 119 120 121
#define DEFAULT_EPSILON  0
#define DEFAULT_MINPTS   10

#define DEFAULT_CLUSTER_COST  Cluster
#define DEFAULT_SEGMENT_COST   VDJ

122
#define DEFAULT_TRIM 100
Mikaël Salson's avatar
Mikaël Salson committed
123

124
// warn
125
#define WARN_MAX_CLONES 100
126
#define WARN_PERCENT_SEGMENTED 40
127

Mikaël Salson's avatar
Mikaël Salson committed
128 129 130
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
131

Mikaël Salson's avatar
Mikaël Salson committed
132 133

using namespace std ;
134
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
135

Mathieu Giraud's avatar
Mathieu Giraud committed
136
//$$ options: usage
Mikaël Salson's avatar
Mikaël Salson committed
137

Mathieu Giraud's avatar
Mathieu Giraud committed
138
extern char *optarg;
Mikaël Salson's avatar
Mikaël Salson committed
139 140 141

extern int optind, optopt, opterr;

142
void usage(char *progname, bool advanced)
Mikaël Salson's avatar
Mikaël Salson committed
143
{
144
  cerr << "Usage: " << progname << " [options] <reads.fa/.fq/.gz>" << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
145 146

  cerr << "Command selection" << endl
147 148 149
       << "  -c <command>"
       << "\t"     << COMMAND_CLONES    << "  \t locus detection, window extraction, clone gathering (default command, most efficient, all outputs)" << endl
       << "  \t\t" << COMMAND_WINDOWS   << "  \t locus detection, window extraction" << endl
150
       << "  \t\t" << COMMAND_SEGMENT   << "  \t detailed V(D)J designation (not recommended)" << endl
151
       << "  \t\t" << COMMAND_GERMLINES << "  \t statistics on k-mers in different germlines" << endl
152
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
153

154 155 156 157 158
  if (advanced)
  cerr << "Input" << endl
       << "  -# <string>   separator for headers in the reads file (default: '" << DEFAULT_READ_HEADER_SEPARATOR << "')" << endl
       << endl ;

159
  cerr << "Germline databases (at least one -V/(-D)/-J, or -G, or -g option must be given for all commands except -c " << COMMAND_GERMLINES << ")" << endl
Mikaël Salson's avatar
Mikaël Salson committed
160
       << "  -V <file>     V germline multi-fasta file" << endl
161
       << "  -D <file>     D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components" << endl
Mikaël Salson's avatar
Mikaël Salson committed
162
       << "  -J <file>     J germline multi-fasta file" << endl
163
       << "  -G <prefix>   prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
Mathieu Giraud's avatar
Mathieu Giraud committed
164 165 166
       << "  -g <path>     multiple locus/germlines. In the path <path>, takes 'germlines.data' to select locus and parameters" << endl
       << "                Selecting '-g germline' processes TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations" << endl
       << "                A different 'germlines.data' file can also be provided with -g <file>" << endl
167 168 169
       << endl

       << "Locus/recombinations" << endl
170
       << "  -d            try to detect several D (experimental)" << endl
171 172
       << "  -i            try to detect incomplete/unusual recombinations (locus with '+', must be used with -g)" << endl
       << "  -2            try to detect unexpected recombinations (must be used with -g)" << endl
173
       << endl ;
174

175 176
  if (advanced)
  cerr << "Experimental options (do not use)" << endl
177
       << "  -I            ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)" << endl
178
       << "  -1            use a unique index for all germline systems (experimental, must be used with -g, do not use)" << endl
179
       << "  -4            try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)" << endl
180
       << "  -!            keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example -!AX 20)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
181 182
       << endl

Mathieu Giraud's avatar
Mathieu Giraud committed
183
       << "Window prediction" << endl
Mikaël Salson's avatar
Mikaël Salson committed
184
#ifndef NO_SPACED_SEEDS
185
       << "  (use either -s or -k option, but not both)" << endl
186
       << "  (all these options, except -w, are overriden when using -g)" << endl
187 188
       << "  -s <string>   spaced seed used for the V/J affectation" << endl
       << "                (default: #####-#####, ######-######, #######-#######, depends on germline)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
189
#endif
190
       << "  -k <int>      k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)" << endl
191 192 193
#ifndef NO_SPACED_SEEDS
       << "                (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
194
       << "  -w <int>      w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")" << endl
195
       << "  -e <float>    maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl
196
       << "  -t <int>      trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: " << DEFAULT_TRIM << ") (0: no trim)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
197 198
       << endl

199 200 201
       << "Labeled windows (these windows will be kept even if -r/-% thresholds are not reached)" << endl
       << "  -W <window>   label the given window" << endl
       << "  -l <file>     label a set of windows given in <file>" << endl
202
       << "  -F            filter -- keep only the labeled windows" << endl
203
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
204

205
  cerr << "Limits to report a clone (or a window)" << endl
206 207
       << "  -r <nb>       minimal number of reads supporting a clone (default: " << DEFAULT_MIN_READS_CLONE << ")" << endl
       << "  -% <ratio>    minimal percentage of reads supporting a clone (default: " << DEFAULT_RATIO_READS_CLONE << ")" << endl
208 209
       << endl

210
       << "Limits to further analyze some clones" << endl
211
       << "  -y <nb>       maximal number of clones computed with a representative ('" << NO_LIMIT << "': no limit) (default: " << DEFAULT_MAX_REPRESENTATIVES << ")" << endl
212
       << "  -z <nb>       maximal number of clones to be analyzed with a full V(D)J designation ('" << NO_LIMIT << "': no limit, do not use) (default: " << DEFAULT_MAX_CLONES << ")" << endl
213
       << "  -A            reports and segments all clones (-r 0 -% 0 -y " << NO_LIMIT << " -z " << NO_LIMIT << "), to be used only on very small datasets (for example -AX 20)" << endl
214 215
       << "  -x <nb>       maximal number of reads to process ('" << NO_LIMIT << "': no limit, default), only first reads" << endl
       << "  -X <nb>       maximal number of reads to process ('" << NO_LIMIT << "': no limit, default), sampled reads" << endl
216
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
217

218
  if (advanced)
219
  cerr << "Fine segmentation options (second pass)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
220
       << "  -f <string>   use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
221
       << "  -m <int>      minimal admissible delta between the end of the V and the start of the J (default: " << DEFAULT_DELTA_MIN << ") (default with -D: " << DEFAULT_DELTA_MIN_D << ")" << endl
222
       << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
223

224 225 226 227 228 229
  cerr << "Clone analysis (second pass)" << endl
       << "  -3            CDR3/JUNCTION detection (requires gapped V/J germlines)" << endl
       << endl ;

  if (advanced)
  cerr << "Additional clustering (experimental)" << endl
230
       << "  -E <file>     manual clustering -- a file used to force some specific edges" << endl
231 232 233 234 235
       << "  -n <int>      maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
       << "  -N <int>      minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
       << "  -S            generate and save comparative matrix for clustering" << endl
       << "  -L            load comparative matrix for clustering" << endl
       << "  -C <string>   use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
236
       << endl ;
237

Mathieu Giraud's avatar
Mathieu Giraud committed
238
  cerr << "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)" << endl
239 240
       << "  -U            output segmented reads (in " << SEGMENTED_FILENAME << " file)" << endl
       << "  -u            output unsegmented reads (in " << UNSEGMENTED_FILENAME << " file)" << endl
241
       << "  -uu           output unsegmented reads, gathered by unsegmentation cause (in *" << UNSEGMENTED_DETAIL_FILENAME << " files)" << endl
242
       << "  -K            output detailed k-mer affectation on all reads (in " << AFFECTS_FILENAME << " file) (use only for debug, for example -KX 100)" << endl
243 244
       << endl
 
Mikaël Salson's avatar
Mikaël Salson committed
245
       << "Output" << endl
246
       << "  -o <dir>      output directory (default: " << DEFAULT_OUT_DIR << ")" <<  endl
247
       << "  -b <string>   output basename (by default basename of the input file)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
248
    
249
       << "  -a            output all sequences by cluster (" << CLONE_FILENAME << "*), to be used only on small datasets" << endl
Mikaël Salson's avatar
Mikaël Salson committed
250 251 252
       << "  -v            verbose mode" << endl
       << endl        

253 254
       << "  -h            help" << endl
       << "  -H            help, including experimental and advanced options" << endl
255 256 257
       << "The full help is available in the doc/algo.org file."
       << endl

Mikaël Salson's avatar
Mikaël Salson committed
258
       << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
259
       << "Examples (see doc/algo.org)" << endl
260 261
       << "  " << progname << " -c clones   -G germline/IGH -3       data/Stanford_S22.fasta" << endl
       << "  " << progname << " -c clones   -g germline -i -2 -3     data/Stanford_S22.fasta   # (detect the locus for each read, including unusual/unexpected recombinations)" << endl
262
       << "  " << progname << " -c windows  -g germline -i -2 -u -U  data/Stanford_S22.fasta   # (detect the locus, splits the reads into two (large) files)" << endl
263
       << "  " << progname << " -c segment  -G germline/IGH -3       data/Stanford_S22.fasta   # (full analysis of each read, only for debug/testing)" << endl
264
       << "  " << progname << " -c germlines                         data/Stanford_S22.fasta   # (statistics on the k-mers)" << endl
Mikaël Salson's avatar
Mikaël Salson committed
265 266 267 268
    ;
  exit(1);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
269 270 271

int atoi_NO_LIMIT(char *optarg)
{
272 273
  return strcmp(NO_LIMIT, optarg) ? atoi(optarg) : NO_LIMIT_VALUE ;
}
274
double atof_NO_LIMIT(char *optarg)
275 276
{
  return strcmp(NO_LIMIT, optarg) ? atof(optarg) : NO_LIMIT_VALUE ;
Mathieu Giraud's avatar
Mathieu Giraud committed
277 278
}

Mikaël Salson's avatar
Mikaël Salson committed
279 280
int main (int argc, char **argv)
{
281
  cout << "# Vidjil -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
282
       << "# Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016 by the Vidjil team" << endl
283
       << "# Bonsai bioinformatics at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille" << endl 
Mathieu Giraud's avatar
Mathieu Giraud committed
284 285 286 287 288
       << endl
       << "# Vidjil is free software, and you are welcome to redistribute it" << endl
       << "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
       << "# No lymphocyte was harmed in the making of this software," << endl
       << "# however this software is for research use only and comes with no warranty." << endl
289
       << endl
290
       << "# Please cite http://biomedcentral.com/1471-2164/15/409 if you use Vidjil." << endl
Mikaël Salson's avatar
Mikaël Salson committed
291 292
       << endl ;

293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
  //////////////////////////////////
  // Display version information or git log

  string soft_version = "vidjil ";
#ifdef RELEASE_TAG
  cout << "# version: vidjil " << RELEASE_TAG << endl ;
  soft_version.append(RELEASE_TAG);
#else
  cout << "# development version" << endl ;
#ifdef GIT_VERSION
  cout << "# git: " << GIT_VERSION << endl ;
  soft_version.append("dev ");
  soft_version.append(GIT_VERSION);
#endif
#endif

Mathieu Giraud's avatar
Mathieu Giraud committed
309 310
  //$$ options: defaults

311
  string germline_system = "" ;
312 313 314 315
  
  list <string> f_reps_V ;
  list <string> f_reps_D ;
  list <string> f_reps_J ;
316
  list <pair <string, string>> multi_germline_paths_and_files ;
317

318
  string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
Mikaël Salson's avatar
Mikaël Salson committed
319 320
  string f_reads = DEFAULT_READS ;
  string seed = DEFAULT_SEED ;
321
  string f_basename = "";
Mikaël Salson's avatar
Mikaël Salson committed
322

323
  string out_dir = DEFAULT_OUT_DIR;
Mikaël Salson's avatar
Mikaël Salson committed
324 325 326 327
  
  string comp_filename = COMP_FILENAME;

  int k = DEFAULT_K ;
328
  int w = DEFAULT_W ;
Mikaël Salson's avatar
Mikaël Salson committed
329

Mikaël Salson's avatar
Mikaël Salson committed
330 331 332 333
  int epsilon = DEFAULT_EPSILON ;
  int minPts = DEFAULT_MINPTS ;
  Cost cluster_cost = DEFAULT_CLUSTER_COST ;
  Cost segment_cost = DEFAULT_SEGMENT_COST ;
334
  bool detect_CDR3 = false;
Mikaël Salson's avatar
Mikaël Salson committed
335 336 337 338 339
  
  int save_comp = 0;
  int load_comp = 0;
  
  int verbose = 0 ;
340
  int command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
341

342
  int max_representatives = DEFAULT_MAX_REPRESENTATIVES ;
343 344 345
  int max_clones = DEFAULT_MAX_CLONES ;
  int min_reads_clone = DEFAULT_MIN_READS_CLONE ;
  float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
Mikaël Salson's avatar
Mikaël Salson committed
346 347
  // int average_deletion = 4;     // Average number of deletion in V or J

348 349
  int max_reads_processed = NO_LIMIT_VALUE;
  int max_reads_processed_sample = NO_LIMIT_VALUE;
350

Mathieu Giraud's avatar
Mathieu Giraud committed
351
  float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
Mikaël Salson's avatar
Mikaël Salson committed
352
  unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
Mathieu Giraud's avatar
Mathieu Giraud committed
353

Mikaël Salson's avatar
Mikaël Salson committed
354 355
  // Admissible delta between left and right segmentation points
  int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
Mikaël Salson's avatar
Mikaël Salson committed
356
  int trim_sequences = DEFAULT_TRIM;
Mikaël Salson's avatar
Mikaël Salson committed
357 358

  bool output_sequences_by_cluster = false;
359
  bool output_segmented = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
360
  bool output_unsegmented = false;
361
  bool output_unsegmented_detail = false;
362
  bool output_affects = false;
363 364
  bool keep_unsegmented_as_clone = false;

365 366
  bool several_D = false;

367
  bool multi_germline = false;
368
  bool multi_germline_incomplete = false;
369
  bool multi_germline_mark = false;
370
  bool multi_germline_one_index_per_germline = true;
371 372
  bool multi_germline_unexpected_recombinations_12 = false;
  bool multi_germline_unexpected_recombinations_1U = false;
373

Mikaël Salson's avatar
Mikaël Salson committed
374 375
  string forced_edges = "" ;

376
  map <string, string> windows_labels ;
Mathieu Giraud's avatar
Mathieu Giraud committed
377
  string windows_labels_file = "" ;
378
  bool only_labeled_windows = false ;
Mikaël Salson's avatar
Mikaël Salson committed
379 380 381

  char c ;

382 383
  int options_s_k = 0 ;

384 385
  double expected_value = THRESHOLD_NB_EXPECTED;

386 387
  //json which contains the Levenshtein distances
  json jsonLevenshtein;
WebTogz's avatar
WebTogz committed
388

Mathieu Giraud's avatar
Mathieu Giraud committed
389 390
  //$$ options: getopt

391

392
  while ((c = getopt(argc, argv, "A!x:X:hHadiI124g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:N:s:b:Sn:o:L%:y:z:uUK3E:t:#:")) != EOF)
Mikaël Salson's avatar
Mikaël Salson committed
393 394 395 396

    switch (c)
      {
      case 'h':
397 398 399 400
        usage(argv[0], false);

      case 'H':
        usage(argv[0], true);
401

Mikaël Salson's avatar
Mikaël Salson committed
402
      case 'c':
403 404
        if (!strcmp(COMMAND_CLONES,optarg))
          command = CMD_CLONES;
Mikaël Salson's avatar
Mikaël Salson committed
405 406
        else if (!strcmp(COMMAND_SEGMENT,optarg))
          command = CMD_SEGMENT;
Mathieu Giraud's avatar
Mathieu Giraud committed
407 408
        else if (!strcmp(COMMAND_WINDOWS,optarg))
          command = CMD_WINDOWS;
409 410
        else if (!strcmp(COMMAND_GERMLINES,optarg))
          command = CMD_GERMLINES;
Mikaël Salson's avatar
Mikaël Salson committed
411 412
        else {
          cerr << "Unknwown command " << optarg << endl;
413
	  usage(argv[0], false);
Mikaël Salson's avatar
Mikaël Salson committed
414 415
        }
        break;
416

417 418 419 420
      // Input
      case '#':
        read_header_separator = string(optarg);
        break;
Mikaël Salson's avatar
Mikaël Salson committed
421 422 423 424

      // Germline

      case 'V':
425
	f_reps_V.push_back(optarg);
426
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
427 428 429
	break;

      case 'D':
430
	f_reps_D.push_back(optarg);
431
	delta_min = DEFAULT_DELTA_MIN_D ;
Mikaël Salson's avatar
Mikaël Salson committed
432 433 434
	break;
        
      case 'J':
435
	f_reps_J.push_back(optarg);
436
	germline_system = "custom" ;
Mikaël Salson's avatar
Mikaël Salson committed
437 438
	break;

439
      case 'g':
440
	multi_germline = true;
441 442 443 444 445 446 447 448
        {
          string arg = string(optarg);
          struct stat buffer;
          if (stat(arg.c_str(), &buffer) == 0)
            {
              if( buffer.st_mode & S_IFDIR )
                {
                  // argument is a directory
449
                  multi_germline_paths_and_files.push_back(make_pair(arg, DEFAULT_MULTI_GERMLINE_FILE)) ;
450 451 452
                }
              else
                {
453
                  multi_germline_paths_and_files.push_back(make_pair(extract_dirname(arg), extract_basename(arg, false)));
454 455 456 457 458 459 460 461
                }
            }
          else
             {
               cerr << ERROR_STRING << "A path or a file must be given with -g" << endl ;
               exit(1);
             }
        }
462
	germline_system = "multi" ;
463
	break;
464

465 466 467 468
      case 'd':
        several_D = true;
        break;

469 470 471
      case 'i':
	multi_germline_incomplete = true;
	break;
472 473 474 475

      case 'I':
        multi_germline_mark = true;
	break;
476 477 478 479

      case '1':
        multi_germline_one_index_per_germline = false ;
        break;
480 481

      case '2':
482 483 484 485 486
        multi_germline_unexpected_recombinations_12 = true ;
        break;

      case '4':
        multi_germline_unexpected_recombinations_1U = true ;
487 488
        break;

Mikaël Salson's avatar
Mikaël Salson committed
489
      case 'G':
Mikaël Salson's avatar
Mikaël Salson committed
490
	germline_system = string(optarg);
491
	f_reps_V.push_back((germline_system + "V.fa").c_str()) ;
492 493 494 495 496
        // Takes D only if it exists
        {
          struct stat buffer; 
          string putative_f_rep_D = germline_system + "D.fa" ;
          if (stat(putative_f_rep_D.c_str(), &buffer) == 0)
497 498 499 500
            {
              f_reps_D.push_back(putative_f_rep_D.c_str()) ;
              delta_min = DEFAULT_DELTA_MIN_D ;
            }
501
        }
502
	f_reps_J.push_back((germline_system + "J.fa").c_str()) ;
503
	germline_system = extract_basename(germline_system);
504

Mikaël Salson's avatar
Mikaël Salson committed
505 506 507 508
	break;

      // Algorithm

509 510 511 512 513 514 515 516 517 518
      case 's':
#ifndef NO_SPACED_SEEDS
	seed = string(optarg);
	k = seed_weight(seed);
	options_s_k++ ;
#else
        cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
        break;

Mikaël Salson's avatar
Mikaël Salson committed
519 520 521
      case 'k':
	k = atoi(optarg);
	seed = seed_contiguous(k);
522
	options_s_k++ ;
Mikaël Salson's avatar
Mikaël Salson committed
523 524 525 526 527 528 529 530 531 532
        break;

      case 'w':
	w = atoi(optarg);
        break;

      case 'm':
	delta_min = atoi(optarg);
        break;

533 534 535 536
      case '!':
        keep_unsegmented_as_clone = true;
        break;

537
      case 'e':
538
        expected_value = atof_NO_LIMIT(optarg);
539 540
        break;

541 542
      // Output 

Mikaël Salson's avatar
Mikaël Salson committed
543 544 545 546
      case 'o':
        out_dir = optarg ;
        break;

547 548
      case 'b':
        f_basename = optarg;
Mikaël Salson's avatar
Mikaël Salson committed
549 550
        break;

551 552 553 554
      case 'a':
	output_sequences_by_cluster = true;
	break;

Mikaël Salson's avatar
Mikaël Salson committed
555 556 557 558
      case 't':
        trim_sequences = atoi(optarg);
        break;

559 560 561 562
      case 'v':
	verbose += 1 ;
	break;

563 564
      // Limits

Mikaël Salson's avatar
Mikaël Salson committed
565 566 567 568
      case '%':
	ratio_reads_clone = atof(optarg);
	break;

569
      case 'r':
Mikaël Salson's avatar
Mikaël Salson committed
570 571 572
	min_reads_clone = atoi(optarg);
        break;

573
      case 'y':
Mathieu Giraud's avatar
Mathieu Giraud committed
574
	max_representatives = atoi_NO_LIMIT(optarg);
575 576
        break;

Mikaël Salson's avatar
Mikaël Salson committed
577
      case 'z':
Mathieu Giraud's avatar
Mathieu Giraud committed
578
	max_clones = atoi_NO_LIMIT(optarg);
579 580
        if (max_representatives < max_clones)
          max_representatives = max_clones ;
Mikaël Salson's avatar
Mikaël Salson committed
581
        break;
582 583 584

      case 'A': // --all
	ratio_reads_clone = 0 ;
585 586 587
	min_reads_clone = 1 ;
	max_representatives = -1 ;
	max_clones = -1 ;
588 589
	break ;

590
      case 'X':
591 592 593 594
        max_reads_processed_sample = atoi_NO_LIMIT(optarg);
        break;

      case 'x':
Mathieu Giraud's avatar
Mathieu Giraud committed
595
        max_reads_processed = atoi_NO_LIMIT(optarg);
596
        break;
597

598 599 600
      // Labels

      case 'W':
Mathieu Giraud's avatar
Mathieu Giraud committed
601
        windows_labels[string(optarg)] = string("-W");
602 603
        break;

604 605 606 607
      case 'l':
	windows_labels_file = optarg; 
	break;

608 609 610 611
      case 'F':
        only_labeled_windows = true;
        break;

612
      // Clustering
613

614
      case 'E':
615 616
	forced_edges = optarg;
	break;
Mikaël Salson's avatar
Mikaël Salson committed
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
	
      case 'n':
	epsilon = atoi(optarg);
        break;

      case 'N':
	minPts = atoi(optarg);
        break;
	
      case 'S':
	save_comp=1;
        break;

      case 'L':
	load_comp=1;
        break;
	
      case 'C':
	cluster_cost=strToCost(optarg, Cluster);
        break;
	
638
      // Fine segmentation
639 640 641 642
      case '3':
        detect_CDR3 = true;
        break;
        
643
      case 'f':
Mikaël Salson's avatar
Mikaël Salson committed
644 645
	segment_cost=strToCost(optarg, VDJ);
        break;
Mathieu Giraud's avatar
Mathieu Giraud committed
646 647

      case 'u':
648
        output_unsegmented_detail = output_unsegmented; // -uu
Mathieu Giraud's avatar
Mathieu Giraud committed
649 650
        output_unsegmented = true;
        break;
651 652 653
      case 'U':
        output_segmented = true;
        break;
654 655 656
      case 'K':
        output_affects = true;
        break;
Mikaël Salson's avatar
Mikaël Salson committed
657 658
      }

659 660 661

  //$$ options: post-processing+display

662 663 664

  if (!germline_system.size() && (command != CMD_GERMLINES))
    {
665
      cerr << ERROR_STRING << "At least one germline must be given with -V/(-D)/-J, or -G, or -g." << endl ;
666 667 668
      exit(1);
    }

669 670
  if (options_s_k > 1)
    {
671
      cerr << ERROR_STRING << "Use at most one -s or -k option." << endl ;
672 673 674
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
675 676
  string out_seqdir = out_dir + "/seq/" ;

Mikaël Salson's avatar
Mikaël Salson committed
677 678 679 680 681 682 683 684 685 686 687 688 689
  if (verbose)
    cout << "# verbose " << verbose << endl ;

  if (optind == argc)
    {
      cout << "# using default sequence file: " << f_reads << endl ;
    }
  else if (optind+1 == argc)
    {
      f_reads = argv[optind] ; 
    }
  else
    {
690
      cerr << ERROR_STRING << "Wrong number of arguments." << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
691 692
      exit(1);
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
693

694
  size_t min_cover_representative = (size_t) (min_reads_clone < (int) max_auditionned ? min_reads_clone : max_auditionned) ;
695

696 697 698 699 700 701
  // Default seeds

#ifndef NO_SPACED_SEEDS
  if (k == DEFAULT_K)
    {
      if (germline_system.find("TRA") != string::npos)
702
	seed = SEED_S13 ;
703 704 705

      else if ((germline_system.find("TRB") != string::npos)
	       || (germline_system.find("IGH") != string::npos))
706
	seed = SEED_S12 ;
707
      else // TRD, TRG, IGK, IGL, custom, multi
708
	seed = SEED_S10 ;
709 710 711 712 713

      k = seed_weight(seed);
    }
#else
  {
714
    cerr << ERROR_STRING << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
715 716 717 718 719
    exit(1) ;
  }
#endif
	  

Mathieu Giraud's avatar
Mathieu Giraud committed
720 721 722 723
#ifndef NO_SPACED_SEEDS
  // Check seed buffer  
  if (seed.size() >= MAX_SEED_SIZE)
    {
724
      cerr << ERROR_STRING << "Seed size is too large (MAX_SEED_SIZE)." << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
725 726 727 728
      exit(1);
    }
#endif

729 730 731

  if (w < seed_weight(seed))
    {
732
      cerr << ERROR_STRING << "Too small -w. The window size should be at least equal to the seed size (" << seed_weight(seed) << ")." << endl;
733 734 735
      exit(1);
    }

Mikaël Salson's avatar
Mikaël Salson committed
736 737 738 739
  // Check that out_dir is an existing directory or creates it
  const char *out_cstr = out_dir.c_str();

  if (mkpath(out_cstr, 0755) == -1) {
740
    cerr << ERROR_STRING << "Directory creation: " << out_dir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
741 742 743
    exit(2);
  }

Mikaël Salson's avatar
Mikaël Salson committed
744 745
  const char *outseq_cstr = out_seqdir.c_str();
  if (mkpath(outseq_cstr, 0755) == -1) {
746
    cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
Mikaël Salson's avatar
Mikaël Salson committed
747 748 749
    exit(2);
  }

750 751 752 753 754
  // Compute basename if not given as an option
  if (f_basename == "") {
    f_basename = extract_basename(f_reads);
  }

Mikaël Salson's avatar
Mikaël Salson committed
755 756 757
  out_dir += "/" ;

  /// Load labels ;
758
  load_into_map(windows_labels, windows_labels_file);
Mikaël Salson's avatar
Mikaël Salson committed
759 760

  switch(command) {
Mathieu Giraud's avatar
Mathieu Giraud committed
761
  case CMD_WINDOWS: cout << "Extracting windows" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
762
    break;
763
  case CMD_CLONES: cout << "Analysing clones" << endl; 
Mikaël Salson's avatar
Mikaël Salson committed
764 765 766
    break;
  case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
    break;
767 768
  case CMD_GERMLINES: cout << "Discovering germlines" << endl;
    break;
Mikaël Salson's avatar
Mikaël Salson committed
769 770
  }

Mathieu Giraud's avatar
Mathieu Giraud committed
771
  cout << "Command line: ";
Mikaël Salson's avatar
Mikaël Salson committed
772
  for (int i=0; i < argc; i++) {
Mathieu Giraud's avatar
Mathieu Giraud committed
773
    cout << argv[i] << " ";
Mikaël Salson's avatar
Mikaël Salson committed
774
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
775
  cout << endl;
Mikaël Salson's avatar
Mikaël Salson committed
776 777 778 779 780 781 782 783 784 785 786 787

  //////////////////////////////////
  // Display time and date
  time_t rawtime;
  struct tm *timeinfo;
  char time_buffer[80];

  time (&rawtime );
  timeinfo = localtime (&rawtime);

  strftime (time_buffer, 80,"%F %T", timeinfo);

Mathieu Giraud's avatar
Mathieu Giraud committed
788
  cout << "# " << time_buffer << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
789 790


791 792
  //////////////////////////////////
  // Warning for non-optimal use
793

794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-A' option or with a large '-z' option" << endl ;
    }
  
  if (command == CMD_SEGMENT)
    {
      cout << endl
	   << "* WARNING: vidjil was run with '-c segment' option" << endl ;
    }
  
  if (max_clones == -1 || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
    {
      cout << "* Vidjil purpose is to extract very quickly windows overlapping the CDR3" << endl
	   << "* and to gather reads into clones (-c clones), and not to provide an accurate V(D)J segmentation." << endl
810
	   << "* The full V(D)J designations are slow to compute and are provided only for convenience." << endl
811 812 813
	   << "* More information is provided in the 'doc/algo.org' file." << endl 
	   << endl ;
    }
814

815 816 817
  /////////////////////////////////////////
  //            LOAD GERMLINES           //
  /////////////////////////////////////////
818

819 820 821 822
  if (command == CMD_GERMLINES)
    {
      multi_germline = true ;
      multi_germline_one_index_per_germline = false ;
823 824 825

      if (multi_germline_paths_and_files.size() == 0)
        multi_germline_paths_and_files.push_back(make_pair(DEFAULT_MULTI_GERMLINE_PATH, DEFAULT_MULTI_GERMLINE_FILE));
826 827 828
    }

  MultiGermline *multigermline = new MultiGermline(multi_germline_one_index_per_germline);
829

830
    {
831
      cout << "Load germlines and build Kmer indexes" << endl ;
832 833 834
    
      if (multi_germline)
	{
835 836
          for (pair <string, string> path_file: multi_germline_paths_and_files)
            multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR, trim_sequences);
837 838 839 840 841 842
	}
      else
	{
	  // Custom germline
	  Germline *germline;
	  germline = new Germline(germline_system, 'X',
843
                                  f_reps_V, f_reps_D, f_reps_J, 
844
                                  delta_min, trim_sequences);
845

846
          germline->new_index(seed);
Mathieu Giraud's avatar