Commit 85455813 authored by Mikaël Salson's avatar Mikaël Salson

tools: generate_all_seeds

Takes a string and a seed (which spans the whole string) and generates all
the strings that match the input string under the seed.

Tests included.
parent c31997c9
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include "tools.h"
string seed_contiguous(int k)
......@@ -223,6 +224,29 @@ string extract_basename(string path, bool remove_ext) {
return path;
}
/**
 * Expand a string under a spaced seed into every concrete nucleotide string.
 *
 * @param str: the string to expand; it must be as long as the seed
 *             (checked with an assert)
 * @param seed: the seed mask; each '-' marks a position where every
 *              nucleotide is tried, any other character keeps the character
 *              of str found at the same position
 * @return the 4^n strings (n being the number of '-' in the seed). Nucleotides
 *         are tried in the order A, C, G, T and the leftmost '-' is the one
 *         that changes between two consecutive results. Without any '-' the
 *         result only contains str.
 * @throws std::out_of_range when asserts are disabled and a '-' lies beyond
 *         the end of str
 */
vector<string> generate_all_seeds(const string &str, const string &seed) {
  assert(str.length() == seed.length());
  static const string nucleotides = "ACGT";

  // Positions of the '-' characters, from left to right.
  vector<size_t> gaps;
  for (size_t pos = seed.find('-'); pos != string::npos;
       pos = seed.find('-', pos + 1)) {
    gaps.push_back(pos);
  }

  // Each gap multiplies the number of results by 4, i.e. consumes two bits of
  // the counter. The shift is done on a size_t: done on an int it would be
  // undefined from 16 gaps onwards.
  assert(gaps.size() < sizeof(size_t) * 4);
  const size_t nb_sequences = static_cast<size_t>(1) << (gaps.size() * 2);

  // Every result starts as a copy of str, then only the gaps are overwritten.
  vector<string> sequences(nb_sequences, str);
  for (size_t rank = 0; rank < nb_sequences; rank++) {
    // rank is read as a base-4 number whose lowest digit drives the leftmost
    // gap, so that the leftmost gap is the one that varies the fastest.
    size_t digits = rank;
    for (size_t gap : gaps) {
      sequences[rank].at(gap) = nucleotides[digits & 3];
      digits >>= 2;
    }
  }
  return sequences;
}
int remove_trailing_whitespaces(string &str) {
int count = 0;
while (str.size() > 0 && (str[str.size() - 1] == '\r'
......
......@@ -24,6 +24,7 @@
#include <iomanip>
#include <string>
#include <cassert>
#include <vector>
#include "fasta.h"
using namespace std;
......@@ -159,6 +160,12 @@ string extract_dirname(string path);
*/
string extract_basename(string path, bool remove_ext = true);
/**
 * Generate all the possible (nucleotide) strings from the (spaced) seed
 * provided in parameter.
 * @param str: the string to expand; it must have the same length as the seed
 *             (checked with an assert)
 * @param seed: the spaced seed; each '-' marks a position where every
 *              nucleotide (A, C, G, T, in that order) is tried, while at the
 *              other positions the character of str is kept
 * @return the 4^n resulting strings, n being the number of '-' in the seed.
 *         The leftmost '-' is the position that changes between two
 *         consecutive strings. When the seed has no '-', the result only
 *         contains str.
 */
vector<string> generate_all_seeds(const string &str, const string &seed);
/**
* remove_trailing_whitespaces removes the whitespaces (ie. ' ', '\t', '\r')
* that may be at the end of the string
......
......@@ -2,6 +2,7 @@
#include <core/fasta.h>
#include "tests.h"
#include <stdexcept>
#include <vector>
void testOnlineFasta1() {
OnlineFasta fa("../../data/test1.fa");
......@@ -342,6 +343,35 @@ void testExtractBasename() {
TEST_EXTRACT_BASENAME, extract_basename("/", true));
}
/**
 * Checks generate_all_seeds() against hand-written expectations for seeds
 * holding zero, one and two '-' positions, and for a seed made of '-' only.
 */
void testGenerateAllSeeds() {
  // Seed without any '-': the input string is the only result.
  const std::vector<string> expected_no_gap = {"ATAAT"};
  const std::vector<string> got_no_gap =
    generate_all_seeds(expected_no_gap[0], "#####");
  TAP_TEST(got_no_gap == expected_no_gap, TEST_GENERATE_ALL_SEEDS, "");

  // A single '-': one result per nucleotide.
  const std::vector<string> expected_one_gap = {
    "ATAAT", "ATCAT", "ATGAT", "ATTAT"
  };
  const std::vector<string> got_one_gap = generate_all_seeds("ATAAT", "##-##");
  TAP_TEST(got_one_gap == expected_one_gap, TEST_GENERATE_ALL_SEEDS, "");

  // Two adjacent '-': 16 results, the leftmost '-' changing first.
  const std::vector<string> expected_two_gaps = {
    "ATAAT", "ATCAT", "ATGAT", "ATTAT",
    "ATACT", "ATCCT", "ATGCT", "ATTCT",
    "ATAGT", "ATCGT", "ATGGT", "ATTGT",
    "ATATT", "ATCTT", "ATGTT", "ATTTT"
  };
  const std::vector<string> got_two_gaps = generate_all_seeds("ATAAT", "##--#");
  TAP_TEST(got_two_gaps == expected_two_gaps, TEST_GENERATE_ALL_SEEDS, "");

  // Seed made of '-' only: nothing of the input string is kept.
  const std::vector<string> expected_only_gaps = {
    "AA", "CA", "GA", "TA",
    "AC", "CC", "GC", "TC",
    "AG", "CG", "GG", "TG",
    "AT", "CT", "GT", "TT"
  };
  const std::vector<string> got_only_gaps = generate_all_seeds("AA", "--");
  TAP_TEST(got_only_gaps == expected_only_gaps, TEST_GENERATE_ALL_SEEDS, "");
}
void testNChooseK() {
TAP_TEST(nChoosek(1, 10) == 0, TEST_N_CHOOSE_K, "");
TAP_TEST(nChoosek(1, 1) == 1, TEST_N_CHOOSE_K, "");
......@@ -462,5 +492,6 @@ void testTools() {
testExtendedNucleotides();
testExtractBasename();
testNChooseK();
testGenerateAllSeeds();
testTrimSequence();
}
......@@ -30,6 +30,7 @@ enum {
TEST_EXTRACT_BASENAME,
TEST_N_CHOOSE_K,
TEST_TRIM_SEQUENCE,
TEST_GENERATE_ALL_SEEDS,
/* Storage tests */
TEST_ARRAY_KMERSTORE,
......@@ -190,6 +191,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_EXTRACT_BASENAME, "extractBasename()");
RECORD_TAP_TEST(TEST_N_CHOOSE_K, "test nChooseK()");
RECORD_TAP_TEST(TEST_TRIM_SEQUENCE, "test trimSequence()");
RECORD_TAP_TEST(TEST_GENERATE_ALL_SEEDS, "test generate_all_seeds()");
RECORD_TAP_TEST(TEST_ARRAY_KMERSTORE, "Testing ArrayKmerStore");
RECORD_TAP_TEST(TEST_KMERSTORE_INSERT_ONE_SEQ, "Testing IKmerStore::insert() on one sequence");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment