tools.h 7.87 KB
Newer Older
Mikael Salson's avatar
Mikael Salson committed
1 2 3
#ifndef TOOLS_H
#define TOOLS_H

4

5 6 7 8 9
// error
#define ERROR_STRING "[error] "
#define WARNING_STRING "[warning] "


10
#define NO_LIMIT_VALUE  -1  // Value for 'all' on command-line options
11
#define NO_LIMIT_VALUE_STRING  "-1"
12 13


Mathieu Giraud's avatar
Mathieu Giraud committed
14
#define MAX_SEED_SIZE  50 // Spaced seed buffer
15
#define FIRST_POS  1      // Numbering of the base pairs for external output
Mikael Salson's avatar
Mikael Salson committed
16

17
#define PERCENT_TOO_MANY_N 30    /* Percent above which we consider there are too
18 19
                                  many Ns in the sequence and the
                                  corresponding subsequence should be
20 21
                                  trimmed. The constant *must* be an integer.
                                   */
22
#define MIN(x,y) ((x) < (y) ? (x) : (y))
23

Mathieu Giraud's avatar
Mathieu Giraud committed
24 25 26 27 28 29
#define LEVEL_DEBUG "debug"
#define LEVEL_INFO  "info"
#define LEVEL_WARN  "warn"
#define LEVEL_ERROR "error"
#define LEVEL_FATAL "fatal"

30 31
#define ALL_KMERS_VALUE 0 /* Use in -Z 0 (filtering on all k-mers with at least
                             one match. */
Mikael Salson's avatar
Mikael Salson committed
32 33
#include <sstream>
#include <iostream>
34
#include <iomanip>
Mikael Salson's avatar
Mikael Salson committed
35 36
#include <string>
#include <cassert>
Mikael Salson's avatar
Mikael Salson committed
37
#include <vector>
38
#include "bioreader.hpp"
39
#include "kmeraffect.h"
40
#include "../lib/json_fwd.hpp"
41
using json = nlohmann::json;
Mikael Salson's avatar
Mikael Salson committed
42 43 44 45
using namespace std;

#define PRINT_VAR(v) cerr << #v << " = " << v << endl

46
#define NB_N_CHOOSE_K_STORED 500
Mikael Salson's avatar
Mikael Salson committed
47 48 49

#define SEED_YES '#'

50
// Common seeds
51
#define DEFAULT_SEED "10s"
52
extern map<string, string> seedMap;
Mathieu Giraud's avatar
Mathieu Giraud committed
53
string expand_seed(const string &seed);
54

Mikael Salson's avatar
Mikael Salson committed
55 56 57 58
string seed_contiguous(int k);

int seed_weight(const string &seed);

59 60
// https://stackoverflow.com/posts/3599170/revisions
#define UNUSED(x) ((void)(x))
Mikael Salson's avatar
Mikael Salson committed
61

62 63
#define FIRST_IF_UNCHANGED(first, second, changed) ((changed) ? (second) : (first))

Mikael Salson's avatar
Mikael Salson committed
64 65 66 67 68 69 70
/**
 * Return a spaced key from a contiguous key and a seed model
 * @param input: contiguous key
 * @param seed: spaced seed model, like "###-###"
 * @return the spaced key
 */
string spaced(const string &input, const string &seed);
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89

inline int spaced_int(int *input, const string &seed) {

  // cout << input << endl << seed << endl ;
  // assert(input.length() == seed.length()); // length is not equal, pointer

  int index_word = 0;

  for (size_t i = 0; i < seed.length(); i++) 
    if (seed[i] == SEED_YES)
	index_word = (index_word << 2) | input[i] ;

#ifdef DEBUG_SPACED
  cout << input << " => |" << index_word << "|" <<  endl ;
#endif

  return index_word;

}
Mikael Salson's avatar
Mikael Salson committed
90

91 92 93 94 95 96 97 98
/* 
	Extract the gene name from a label. This take the whole part
	before the star and returns it. If there is no star in the
	name the whole label is returned.
	IGHV-01*05	->	IGHV-01
	IGHV-7500AB	->	IGHV-7500AB
*/
string extractGeneName(string label);
Mikael Salson's avatar
Mikael Salson committed
99 100 101 102 103 104 105 106 107 108 109 110

/**
 * Sort the number of occurrence stored as the second element of a pair.
 * @param a: a pair containing an element and an associated number of occurrence
 * @param b: a pair containing an element and an associated number of occurrence
 * @return true iff a has a number of occurrence greater or equal to b.
 */
template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);


string string_of_int(int number);
111
string fixed_string_of_float(float number, int precision);
112
string scientific_string_of_double(double number);
Mathieu Giraud's avatar
Mathieu Giraud committed
113
string string_of_map(map <string, string> m, const string &before);
114 115 116 117 118 119 120 121 122 123

/**
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
 * @return is nuc an extended nucleotide ?
 */

bool is_extended_nucleotide(char nuc);
bool has_extended_nucleotides(string s);


Mikael Salson's avatar
Mikael Salson committed
124
/**
125
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
Mikael Salson's avatar
Mikael Salson committed
126 127 128 129 130 131 132 133 134
 * @return the complementary nucleotide of nuc
 */
char complement_nucleotide(char nuc);

/**
 * @return the complementary sequence of dna
 */
string complement(const string &dna);

135
/**
136 137
 * @param nuc: nucleotide in  either up- or down-casemajuscule (ACGTacgt)
 * @return integer representation (respectively 0, 1, 2 ou 3)
138
 */
139
inline int nuc_to_int(char nuc) {
140 141 142 143
  // A/a : 01*0 0001
  // C/c : 01*0 0011
  // G/g : 01*0 0111
  // T/t : 01*1 0100
144
  // pos :    3210
145 146 147 148
  // Bit de poids fort : b_2
  // Bit de poids faible : xor entre b_2 et b_1
  return ((nuc & 4) >> 1) // poids fort
    | (((nuc & 4) >> 2) ^ ((nuc & 2) >> 1));
149
}
Mikael Salson's avatar
Mikael Salson committed
150

Mikael Salson's avatar
Mikael Salson committed
151
/**
152
 * Convert size nucleotides from a DNA string to an integer or to an hash.
Mikael Salson's avatar
Mikael Salson committed
153 154
 */
int dna_to_int(const string &, int size);
155
uint64_t dna_to_hash(const string &, int size);
Mikael Salson's avatar
Mikael Salson committed
156

157 158 159 160 161 162 163 164 165 166 167 168 169
#define GENETIC_CODE \
  "KNKN" "TTTT" "RSRS" "IIMI" \
  "QHQH" "PPPP" "RRRR" "LLLL" \
  "EDED" "AAAA" "GGGG" "VVVV" \
  "*Y*Y" "SSSS" "*CWC" "LFLF"

#define GENETIC_CODE_OUT_OF_FRAME '#'

/**
 * Convert nucleotides to amino acids
 */
string nuc_to_aa(const string &nuc);

170 171 172 173 174 175 176 177
/**
 * Extract a field from a separated string
 * @param field: number of the field to be extracted (starts at 1,
 *               if 0: returns the whole string)
 * @param separator: the separator used in the string
 * @param str
 * @return the field to be extracted from the string
 */
Mikael Salson's avatar
Mikael Salson committed
178 179
string extract_from_label(string str, int field, string separator);

180
/**
181 182 183 184 185 186
 * @return Extract dirname of a file
 */
string extract_dirname(string path);

/**
 * @return Extract basename of a file and extracts extension (by default)
187 188 189
 */
string extract_basename(string path, bool remove_ext = true);

Mikael Salson's avatar
Mikael Salson committed
190 191 192 193 194 195
/**
 * Generate all the possible (nucleotide) strings from the (spaced) seed
 * provided in parameter.
 */
vector<string> generate_all_seeds(const string &str, const string &seed);

Mikael Salson's avatar
Mikael Salson committed
196 197 198 199 200 201 202 203
/**
 * remove_trailing_whitespaces removes the whitespaces (ie. ' ', '\t', '\r')
 * that may be at the end of the string
 * @param str: the string 
 * @return the number of whitespaces removed
 */
int remove_trailing_whitespaces(string &str);

204 205 206 207 208
/**
 * @return subsequence delimited by biological positions (starting from 1), including both positions
 */
string subsequence(const string &text, int start, int end);

Mikael Salson's avatar
Mikael Salson committed
209 210 211 212 213
/**
 * @return reverse(complement(dna)) if do_revcomp, otherwise dna
 */
string revcomp(const string &dna, bool do_revcomp = true);

214 215 216 217 218 219
/**
 * @return the int value corresponding to the revcomp of the DNA sequence
 *         represented by word, whose length (in number of nucleotides) is size.
 */
int revcomp_int(int word, int size);

Mikael Salson's avatar
Mikael Salson committed
220 221 222 223 224
/**
 * @return the reverse of text (ie. text read from right to left)
 */
string reverse(const string &text);

225 226 227 228 229 230 231 232 233
/**
 * @param sequence is a DNA sequence in the correct orientation in uppercase
 *      (ie. no revcomp will be tried)
 * @param frame (0, 1 or 2) depending on where the position of the first codon
 *        in the sequence starts
 * @return true iff a stop codon is in-frame.
 */ 
bool hasInFrameStopCodon(const string &sequence, int frame);

Mathieu Giraud's avatar
Mathieu Giraud committed
234 235 236 237 238
/**
 * @return a Sequence whose fields are given by the parameters
 */
Sequence create_sequence(string label_full, string label, string sequence, string quality);

239
extern double nChoosek_stored[NB_N_CHOOSE_K_STORED][NB_N_CHOOSE_K_STORED];
Mikael Salson's avatar
Mikael Salson committed
240 241 242 243 244 245
/**
 * @return the combinatorial of k among n
 * @see http://stackoverflow.com/a/9331125/1192742
 */
double nChoosek(unsigned n, unsigned k);

246 247 248 249 250
/**
 * Remove the ends of the sequence if they contain too many N.
 * The sequence will be considered starting at position start_pos
 * for length letters.
 * The values will be updated correspondingly after trimming.
251 252 253 254
 *
 * More precisely, the purpose of the function is to find the longest
 * substring whose prefixes and suffixes all have a ratio of N that
 * is less than or equal to RATIO_TOO_MANY_N
255 256 257
 */
void trimSequence(string &sequence, size_t &start_pos, size_t &length);

Mikael Salson's avatar
Mikael Salson committed
258

Mathieu Giraud's avatar
Mathieu Giraud committed
259 260 261 262
const Sequence NULL_SEQUENCE = create_sequence("", "", "NULL", "");

bool operator==(const Sequence &s1, const Sequence &s2);
bool operator!=(const Sequence &s1, const Sequence &s2);
Mikael Salson's avatar
Mikael Salson committed
263

264 265 266 267 268

/***
 Outputs
 ***/

269
void output_label_average(ostream &out, string label, long long int nb, double average, int precision=1);
270

271
void json_add_warning(json &clone, string code, string msg, string level=LEVEL_WARN);
272 273


Mikael Salson's avatar
Mikael Salson committed
274 275 276 277 278 279 280 281 282 283
//////////////////////////////////////////////////
// Template code
//////////////////////////////////////////////////

template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b) {
  return a.second >= b.second;
}

#endif