tools.h 7.94 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3
#ifndef TOOLS_H
#define TOOLS_H

4 5
using namespace std ;
typedef string junction ;
6

7 8 9 10 11
// error
#define ERROR_STRING "[error] "
#define WARNING_STRING "[warning] "


12
#define NO_LIMIT_VALUE  -1  // Value for 'all' on command-line options
13
#define NO_LIMIT_VALUE_STRING  "-1"
14 15


Mathieu Giraud's avatar
Mathieu Giraud committed
16
#define MAX_SEED_SIZE  50 // Spaced seed buffer
17
#define FIRST_POS  1      // Numbering of the base pairs for external output
Mikaël Salson's avatar
Mikaël Salson committed
18

19
#define PERCENT_TOO_MANY_N 30    /* Percent above which we consider there are too
20 21
                                  many Ns in the sequence and the
                                  corresponding subsequence should be
22 23
                                  trimmed. The constant *must* be an integer.
                                   */
24
#define MIN(x,y) ((x) < (y) ? (x) : (y))
25

26 27 28 29 30 31
#define LEVEL_DEBUG "debug"
#define LEVEL_INFO  "info"
#define LEVEL_WARN  "warn"
#define LEVEL_ERROR "error"
#define LEVEL_FATAL "fatal"

32 33
#define ALL_KMERS_VALUE 0 /* Use in -Z 0 (filtering on all k-mers with at least
                             one match. */
Mikaël Salson's avatar
Mikaël Salson committed
34 35
#include <sstream>
#include <iostream>
36
#include <iomanip>
Mikaël Salson's avatar
Mikaël Salson committed
37 38
#include <string>
#include <cassert>
39
#include <vector>
40
#include "bioreader.hpp"
41
#include "kmeraffect.h"
42
#include "../lib/json_fwd.hpp"
43
using json = nlohmann::json;
Mikaël Salson's avatar
Mikaël Salson committed
44 45 46 47
using namespace std;

#define PRINT_VAR(v) cerr << #v << " = " << v << endl

48
#define NB_N_CHOOSE_K_STORED 500
Mikaël Salson's avatar
Mikaël Salson committed
49 50 51

#define SEED_YES '#'

52
// Common seeds
53
#define DEFAULT_SEED "10s"
54
extern map<string, string> seedMap;
55
string expand_seed(const string &seed);
56

Mikaël Salson's avatar
Mikaël Salson committed
57 58 59 60
string seed_contiguous(int k);

int seed_weight(const string &seed);

61 62
// https://stackoverflow.com/posts/3599170/revisions
#define UNUSED(x) ((void)(x))
Mikaël Salson's avatar
Mikaël Salson committed
63

64 65
#define FIRST_IF_UNCHANGED(first, second, changed) ((changed) ? (second) : (first))

Mikaël Salson's avatar
Mikaël Salson committed
66 67 68 69 70 71 72
/**
 * Return a spaced key from a contiguous key and a seed model
 * @param input: contiguous key
 * @param seed: spaced seed model, like "###-###"
 * @return the spaced key
 */
string spaced(const string &input, const string &seed);
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91

inline int spaced_int(int *input, const string &seed) {

  // cout << input << endl << seed << endl ;
  // assert(input.length() == seed.length()); // length is not equal, pointer

  int index_word = 0;

  for (size_t i = 0; i < seed.length(); i++) 
    if (seed[i] == SEED_YES)
	index_word = (index_word << 2) | input[i] ;

#ifdef DEBUG_SPACED
  cout << input << " => |" << index_word << "|" <<  endl ;
#endif

  return index_word;

}
Mikaël Salson's avatar
Mikaël Salson committed
92

93 94 95 96 97 98 99 100
/* 
	Extract the gene name from a label. This take the whole part
	before the star and returns it. If there is no star in the
	name the whole label is returned.
	IGHV-01*05	->	IGHV-01
	IGHV-7500AB	->	IGHV-7500AB
*/
string extractGeneName(string label);
Mikaël Salson's avatar
Mikaël Salson committed
101 102 103 104 105 106 107 108 109 110 111

/**
 * Sort the number of occurrence stored as the second element of a pair.
 * @param a: a pair containing an element and an associated number of occurrence
 * @param b: a pair containing an element and an associated number of occurrence
 * @return true iff a has a number of occurrence greater or equal to b.
 */
template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);


112
string string_of_int(int number, int pad_to_width=0);
113
string fixed_string_of_float(float number, int precision);
114
string scientific_string_of_double(double number);
115
string string_of_map(map <string, string> m, const string &before);
116 117 118 119 120 121 122 123 124 125

/**
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
 * @return is nuc an extended nucleotide ?
 */

bool is_extended_nucleotide(char nuc);
bool has_extended_nucleotides(string s);


Mikaël Salson's avatar
Mikaël Salson committed
126
/**
127
 * @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
Mikaël Salson's avatar
Mikaël Salson committed
128 129 130 131 132 133 134 135 136
 * @return the complementary nucleotide of nuc
 */
char complement_nucleotide(char nuc);

/**
 * @return the complementary sequence of dna
 */
string complement(const string &dna);

137
/**
138 139
 * @param nuc: nucleotide in  either up- or down-casemajuscule (ACGTacgt)
 * @return integer representation (respectively 0, 1, 2 ou 3)
140
 */
141
inline int nuc_to_int(char nuc) {
142 143 144 145
  // A/a : 01*0 0001
  // C/c : 01*0 0011
  // G/g : 01*0 0111
  // T/t : 01*1 0100
146
  // pos :    3210
147 148 149 150
  // Bit de poids fort : b_2
  // Bit de poids faible : xor entre b_2 et b_1
  return ((nuc & 4) >> 1) // poids fort
    | (((nuc & 4) >> 2) ^ ((nuc & 2) >> 1));
151
}
Mikaël Salson's avatar
Mikaël Salson committed
152

Mikaël Salson's avatar
Mikaël Salson committed
153
/**
154
 * Convert size nucleotides from a DNA string to an integer or to an hash.
Mikaël Salson's avatar
Mikaël Salson committed
155 156
 */
int dna_to_int(const string &, int size);
157
uint64_t dna_to_hash(const string &, int size);
Mikaël Salson's avatar
Mikaël Salson committed
158

159 160 161 162 163 164 165 166 167 168 169 170 171
#define GENETIC_CODE \
  "KNKN" "TTTT" "RSRS" "IIMI" \
  "QHQH" "PPPP" "RRRR" "LLLL" \
  "EDED" "AAAA" "GGGG" "VVVV" \
  "*Y*Y" "SSSS" "*CWC" "LFLF"

#define GENETIC_CODE_OUT_OF_FRAME '#'

/**
 * Convert nucleotides to amino acids
 */
string nuc_to_aa(const string &nuc);

172 173 174 175 176 177 178 179
/**
 * Extract a field from a separated string
 * @param field: number of the field to be extracted (starts at 1,
 *               if 0: returns the whole string)
 * @param separator: the separator used in the string
 * @param str
 * @return the field to be extracted from the string
 */
Mikaël Salson's avatar
Mikaël Salson committed
180 181
string extract_from_label(string str, int field, string separator);

182
/**
183 184 185 186 187 188
 * @return Extract dirname of a file
 */
string extract_dirname(string path);

/**
 * @return Extract basename of a file and extracts extension (by default)
189 190 191
 */
string extract_basename(string path, bool remove_ext = true);

192 193 194 195 196 197
/**
 * Generate all the possible (nucleotide) strings from the (spaced) seed
 * provided in parameter.
 */
vector<string> generate_all_seeds(const string &str, const string &seed);

Mikaël Salson's avatar
Mikaël Salson committed
198 199 200 201 202 203 204 205
/**
 * remove_trailing_whitespaces removes the whitespaces (ie. ' ', '\t', '\r')
 * that may be at the end of the string
 * @param str: the string 
 * @return the number of whitespaces removed
 */
int remove_trailing_whitespaces(string &str);

206 207 208 209 210
/**
 * @return subsequence delimited by biological positions (starting from 1), including both positions
 */
string subsequence(const string &text, int start, int end);

Mikaël Salson's avatar
Mikaël Salson committed
211 212 213 214 215
/**
 * @return reverse(complement(dna)) if do_revcomp, otherwise dna
 */
string revcomp(const string &dna, bool do_revcomp = true);

216 217 218 219 220 221
/**
 * @return the int value corresponding to the revcomp of the DNA sequence
 *         represented by word, whose length (in number of nucleotides) is size.
 */
int revcomp_int(int word, int size);

Mikaël Salson's avatar
Mikaël Salson committed
222 223 224 225 226
/**
 * @return the reverse of text (ie. text read from right to left)
 */
string reverse(const string &text);

227 228 229 230 231 232 233 234 235
/**
 * @param sequence is a DNA sequence in the correct orientation in uppercase
 *      (ie. no revcomp will be tried)
 * @param frame (0, 1 or 2) depending on where the position of the first codon
 *        in the sequence starts
 * @return true iff a stop codon is in-frame.
 */ 
bool hasInFrameStopCodon(const string &sequence, int frame);

Mathieu Giraud's avatar
Mathieu Giraud committed
236 237 238 239 240
/**
 * @return a Sequence whose fields are given by the parameters
 */
Sequence create_sequence(string label_full, string label, string sequence, string quality);

241
extern double nChoosek_stored[NB_N_CHOOSE_K_STORED][NB_N_CHOOSE_K_STORED];
Mikaël Salson's avatar
Mikaël Salson committed
242 243 244 245 246 247
/**
 * @return the combinatorial of k among n
 * @see http://stackoverflow.com/a/9331125/1192742
 */
double nChoosek(unsigned n, unsigned k);

248 249 250 251 252
/**
 * Remove the ends of the sequence if they contain too many N.
 * The sequence will be considered starting at position start_pos
 * for length letters.
 * The values will be updated correspondingly after trimming.
253 254 255 256
 *
 * More precisely, the purpose of the function is to find the longest
 * substring whose prefixes and suffixes all have a ratio of N that
 * is less than or equal to RATIO_TOO_MANY_N
257 258 259
 */
void trimSequence(string &sequence, size_t &start_pos, size_t &length);

Mikaël Salson's avatar
Mikaël Salson committed
260

Mathieu Giraud's avatar
Mathieu Giraud committed
261 262 263 264
const Sequence NULL_SEQUENCE = create_sequence("", "", "NULL", "");

bool operator==(const Sequence &s1, const Sequence &s2);
bool operator!=(const Sequence &s1, const Sequence &s2);
Mikaël Salson's avatar
Mikaël Salson committed
265

266 267 268 269 270

/***
 Outputs
 ***/

271
void output_label_average(ostream &out, string label, long long int nb, double average, int precision=1);
272

273
void json_add_warning(json &clone, string code, string msg, string level=LEVEL_WARN);
274 275


Mikaël Salson's avatar
Mikaël Salson committed
276 277 278 279 280 281 282 283 284 285
//////////////////////////////////////////////////
// Template code
//////////////////////////////////////////////////

template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b) {
  return a.second >= b.second;
}

#endif