tools.h 8.71 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2
#ifndef TOOLS_H
#define TOOLS_H
3
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
4

5 6
using namespace std ;
typedef string junction ;
7

8 9 10 11 12
// Progress bar
#define PROGRESS_POINT 25000
#define PROGRESS_POINT_CLONES 25
#define PROGRESS_LINE 40

13 14 15 16
// error
#define ERROR_STRING "[error] "
#define WARNING_STRING "[warning] "

Mikaël Salson's avatar
Mikaël Salson committed
17
#define GZ_SUFFIX ".gz"
18

19
#define NO_LIMIT_VALUE  -1  // Value for 'all' on command-line options
20
#define NO_LIMIT_VALUE_STRING  "-1"
21 22


Mathieu Giraud's avatar
Mathieu Giraud committed
23
#define MAX_SEED_SIZE  50 // Spaced seed buffer
24
#define FIRST_POS  1      // Numbering of the base pairs for external output
Mikaël Salson's avatar
Mikaël Salson committed
25

26
#define PERCENT_TOO_MANY_N 30    /* Percent above which we consider there are too
27 28
                                  many Ns in the sequence and the
                                  corresponding subsequence should be
29 30
                                  trimmed. The constant *must* be an integer.
                                   */
31
#define MIN(x,y) ((x) < (y) ? (x) : (y))
32

Mathieu Giraud's avatar
Mathieu Giraud committed
33 34 35 36 37 38
#define LEVEL_DEBUG "debug"
#define LEVEL_INFO  "info"
#define LEVEL_WARN  "warn"
#define LEVEL_ERROR "error"
#define LEVEL_FATAL "fatal"

39 40
#define ALL_KMERS_VALUE 0 /* Use in -Z 0 (filtering on all k-mers with at least
                             one match. */
Mikaël Salson's avatar
Mikaël Salson committed
41 42
#include <sstream>
#include <iostream>
43
#include <iomanip>
Mikaël Salson's avatar
Mikaël Salson committed
44 45
#include <string>
#include <cassert>
46
#include <signal.h>
Mikaël Salson's avatar
Mikaël Salson committed
47
#include <vector>
48
#include "bioreader.hpp"
49
#include "../lib/gzstream.h"
Cyprien Borée's avatar
Cyprien Borée committed
50
#include "kmeraffect.h"
51
#include "../lib/json_fwd.hpp"
52
#include "warnings.h"
53
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
54 55 56 57
using namespace std;

#define PRINT_VAR(v) cerr << #v << " = " << v << endl

58
#define NB_N_CHOOSE_K_STORED 500
Mikaël Salson's avatar
Mikaël Salson committed
59 60 61

#define SEED_YES '#'

62
// Common seeds
63
#define DEFAULT_SEED "10s"
64
extern map<string, string> seedMap;
Mathieu Giraud's avatar
Mathieu Giraud committed
65
string expand_seed(const string &seed);
66

Mikaël Salson's avatar
Mikaël Salson committed
67 68 69 70
string seed_contiguous(int k);

int seed_weight(const string &seed);

71 72
// https://stackoverflow.com/posts/3599170/revisions
#define UNUSED(x) ((void)(x))
Mikaël Salson's avatar
Mikaël Salson committed
73

74 75
#define FIRST_IF_UNCHANGED(first, second, changed) ((changed) ? (second) : (first))

Mikaël Salson's avatar
Mikaël Salson committed
76 77 78 79 80 81 82
/**
 * Return a spaced key from a contiguous key and a seed model
 * @param input: contiguous key
 * @param seed: spaced seed model, like "###-###"
 * @return the spaced key
 */
string spaced(const string &input, const string &seed);
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101

inline int spaced_int(int *input, const string &seed) {

  // cout << input << endl << seed << endl ;
  // assert(input.length() == seed.length()); // length is not equal, pointer

  int index_word = 0;

  for (size_t i = 0; i < seed.length(); i++) 
    if (seed[i] == SEED_YES)
	index_word = (index_word << 2) | input[i] ;

#ifdef DEBUG_SPACED
  cout << input << " => |" << index_word << "|" <<  endl ;
#endif

  return index_word;

}
Mikaël Salson's avatar
Mikaël Salson committed
102

103 104 105 106 107 108 109
/* Signal handling */


extern bool global_interrupted;

void sigintHandler(int sig_num);

110
/* 
111 112 113 114
  Extract the gene name from a label.
  If there is a pipe '|', consider only what is after the (first) pipe.
  If there is a star '*', consider only what is before the start
  M99686|IGHV5-51*01|Homo sapiens|...   -> IGHV5-51
115 116 117 118
	IGHV-01*05	->	IGHV-01
	IGHV-7500AB	->	IGHV-7500AB
*/
string extractGeneName(string label);
Mikaël Salson's avatar
Mikaël Salson committed
119 120 121 122 123 124 125 126 127 128 129

/**
 * Sort the number of occurrence stored as the second element of a pair.
 * @param a: a pair containing an element and an associated number of occurrence
 * @param b: a pair containing an element and an associated number of occurrence
 * @return true iff a has a number of occurrence greater or equal to b.
 */
template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);


130
string string_of_int(int number, int pad_to_width=0);
131
string fixed_string_of_float(float number, int precision);
132
string scientific_string_of_double(double number, int precision=2);
Mathieu Giraud's avatar
Mathieu Giraud committed
133
string string_of_map(map <string, string> m, const string &before);
134 135 136 137 138 139 140 141 142 143

/**
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
 * @return is nuc an extended nucleotide ?
 */

bool is_extended_nucleotide(char nuc);
bool has_extended_nucleotides(string s);


Mikaël Salson's avatar
Mikaël Salson committed
144
/**
145
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
Mikaël Salson's avatar
Mikaël Salson committed
146 147 148 149 150 151 152 153 154
 * @return the complementary nucleotide of nuc
 */
char complement_nucleotide(char nuc);

/**
 * @return the complementary sequence of dna
 */
string complement(const string &dna);

155
/**
156 157
 * @param nuc: nucleotide in  either up- or down-casemajuscule (ACGTacgt)
 * @return integer representation (respectively 0, 1, 2 ou 3)
158
 */
159
inline int nuc_to_int(char nuc) {
160 161 162 163
  // A/a : 01*0 0001
  // C/c : 01*0 0011
  // G/g : 01*0 0111
  // T/t : 01*1 0100
164
  // pos :    3210
165 166 167 168
  // Bit de poids fort : b_2
  // Bit de poids faible : xor entre b_2 et b_1
  return ((nuc & 4) >> 1) // poids fort
    | (((nuc & 4) >> 2) ^ ((nuc & 2) >> 1));
169
}
Mikaël Salson's avatar
Mikaël Salson committed
170

Mikaël Salson's avatar
Mikaël Salson committed
171
/**
172
 * Convert size nucleotides from a DNA string to an integer or to an hash.
Mikaël Salson's avatar
Mikaël Salson committed
173 174
 */
int dna_to_int(const string &, int size);
175
uint64_t dna_to_hash(const string &, int size);
Mikaël Salson's avatar
Mikaël Salson committed
176

Mathieu Giraud's avatar
Mathieu Giraud committed
177 178 179 180 181 182 183 184 185 186 187 188 189
#define GENETIC_CODE \
  "KNKN" "TTTT" "RSRS" "IIMI" \
  "QHQH" "PPPP" "RRRR" "LLLL" \
  "EDED" "AAAA" "GGGG" "VVVV" \
  "*Y*Y" "SSSS" "*CWC" "LFLF"

#define GENETIC_CODE_OUT_OF_FRAME '#'

/**
 * Convert nucleotides to amino acids
 */
string nuc_to_aa(const string &nuc);

190 191 192 193 194 195 196 197
/**
 * Extract a field from a separated string
 * @param field: number of the field to be extracted (starts at 1,
 *               if 0: returns the whole string)
 * @param separator: the separator used in the string
 * @param str
 * @return the field to be extracted from the string
 */
Mikaël Salson's avatar
Mikaël Salson committed
198 199
string extract_from_label(string str, int field, string separator);

200
/**
201 202 203 204 205 206
 * @return Extract dirname of a file
 */
string extract_dirname(string path);

/**
 * @return Extract basename of a file and extracts extension (by default)
207 208 209
 */
string extract_basename(string path, bool remove_ext = true);

Mikaël Salson's avatar
Mikaël Salson committed
210 211 212 213 214 215
/**
 * Generate all the possible (nucleotide) strings from the (spaced) seed
 * provided in parameter.
 */
vector<string> generate_all_seeds(const string &str, const string &seed);

Mikaël Salson's avatar
Mikaël Salson committed
216 217 218 219 220 221 222 223
/**
 * remove_trailing_whitespaces removes the whitespaces (ie. ' ', '\t', '\r')
 * that may be at the end of the string
 * @param str: the string 
 * @return the number of whitespaces removed
 */
int remove_trailing_whitespaces(string &str);

224 225 226 227 228
/**
 * @return subsequence delimited by biological positions (starting from 1), including both positions
 */
string subsequence(const string &text, int start, int end);

Mikaël Salson's avatar
Mikaël Salson committed
229 230 231 232 233
/**
 * @return reverse(complement(dna)) if do_revcomp, otherwise dna
 */
string revcomp(const string &dna, bool do_revcomp = true);

234 235 236 237 238 239
/**
 * @return the int value corresponding to the revcomp of the DNA sequence
 *         represented by word, whose length (in number of nucleotides) is size.
 */
int revcomp_int(int word, int size);

Mikaël Salson's avatar
Mikaël Salson committed
240 241 242 243 244
/**
 * @return the reverse of text (ie. text read from right to left)
 */
string reverse(const string &text);

245 246 247 248 249 250 251 252 253
/**
 * @param sequence is a DNA sequence in the correct orientation in uppercase
 *      (ie. no revcomp will be tried)
 * @param frame (0, 1 or 2) depending on where the position of the first codon
 *        in the sequence starts
 * @return true iff a stop codon is in-frame.
 */ 
bool hasInFrameStopCodon(const string &sequence, int frame);

Mathieu Giraud's avatar
Mathieu Giraud committed
254 255 256 257 258
/**
 * @return a Sequence whose fields are given by the parameters
 */
Sequence create_sequence(string label_full, string label, string sequence, string quality);

259
extern double nChoosek_stored[NB_N_CHOOSE_K_STORED][NB_N_CHOOSE_K_STORED];
Mikaël Salson's avatar
Mikaël Salson committed
260 261 262 263 264 265
/**
 * @return the combinatorial of k among n
 * @see http://stackoverflow.com/a/9331125/1192742
 */
double nChoosek(unsigned n, unsigned k);

266 267 268 269 270
/**
 * Remove the ends of the sequence if they contain too many N.
 * The sequence will be considered starting at position start_pos
 * for length letters.
 * The values will be updated correspondingly after trimming.
271 272 273 274
 *
 * More precisely, the purpose of the function is to find the longest
 * substring whose prefixes and suffixes all have a ratio of N that
 * is less than or equal to RATIO_TOO_MANY_N
275 276 277 278
 *
 * The parameters required_start and required_end give the positions
 * of the required sequence that must not be cut out.
 * @post start_pos <= required_start && length >= required_length
279
 */
280 281
void trimSequence(string &sequence, size_t &start_pos, size_t &length,
                  size_t required_start=string::npos, size_t required_length=0);
Mikaël Salson's avatar
Mikaël Salson committed
282

Mathieu Giraud's avatar
Mathieu Giraud committed
283 284 285 286
const Sequence NULL_SEQUENCE = create_sequence("", "", "NULL", "");

bool operator==(const Sequence &s1, const Sequence &s2);
bool operator!=(const Sequence &s1, const Sequence &s2);
Mikaël Salson's avatar
Mikaël Salson committed
287

288 289 290 291 292

/***
 Outputs
 ***/

293
void output_label_average(ostream &out, string label, long long int nb, double average, int precision=1);
294

295
void json_add_warning(json &clone, string code, string msg, string level=LEVEL_WARN);
296

297 298 299
/*
   Opens a ostream, possibly gz-compressed
*/
Mikaël Salson's avatar
Mikaël Salson committed
300
std::ostream* new_ofgzstream(string &f, bool gz, string message="");
301

302

Mikaël Salson's avatar
Mikaël Salson committed
303 304 305 306 307 308 309 310 311 312
//////////////////////////////////////////////////
// Template code
//////////////////////////////////////////////////

template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b) {
  return a.second >= b.second;
}

#endif