diff --git a/partitioning/SmithWaterman.cpp b/partitioning/SmithWaterman.cpp index a0dc8aaf0d20974782751167cd645431f9ea27d4..2220263c799af58a28a5eb53d63d764ef8a579d5 100644 --- a/partitioning/SmithWaterman.cpp +++ b/partitioning/SmithWaterman.cpp @@ -1,216 +1,69 @@ -////////////////////////////////////////////// -// Simple Ends-Free Smith-Waterman Algorithm -// -// You will be prompted for input sequences -// Penalties and match scores are hard-coded -// -// Program does not perform multiple tracebacks if -// it finds several alignments with the same score -// -// By Nikhil Gopal -// Similar implementation here: https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm -////////////////////////////////////////////// -//g++ SmithWaterman.cpp -o SmithWaterman +#include "SmithWaterman.hpp" -#include "SmithWaterman.h" +SmithWatermanWorker::SmithWatermanWorker(const size_t primer_size_) : primer_size(primer_size_) {} -#include <iostream> -#include <fstream> -#include <cstdlib> -#include <string> -#include <cmath> -#include <sys/time.h> -using namespace std; - -int ind; - -double score_match = 10; -double score_gap = -7; -double score_mismatch = -5; - -double similarityScore(char a, char b); -double findMax(double array[], int length); - -double similarityScore(char a, char b) -{ - double result; - if(a==b) - { - result=score_match; - } - else - { - result=score_mismatch; +SmithWatermanWorker::AlignmentResult SmithWatermanWorker::align(const char *seq, const uint32_t seq_size, + const char *primer) { + // Reset data + for (size_t j = 0; j <= primer_size; ++j) { + line0[j] = 0; // First line } - return result; -} - -double findMax(double array[], int length) -{ - double max = array[0]; - ind = 0; - - for(int i=1; i<length; i++) - { - if(array[i] > max) - { - max = array[i]; - ind=i; - } - } - return max; -} - - -void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop) { - - // initialize some variables - int lengthSeqA = seqA.length(); - int lengthSeqB = seqB.length(); - - // initialize empty matrix - //double matrix[lengthSeqA+1][lengthSeqB+1]; - double **matrix = (double **)malloc((lengthSeqA+1) * sizeof(double*)); - for(int i = 0; i < (lengthSeqA+1); i++) matrix[i] = (double *)malloc((lengthSeqB+1) * sizeof(double)); - - for(int i=0;i<=lengthSeqA;i++) - { - for(int j=0;j<=lengthSeqB;j++) - { - matrix[i][j]=0; + line1[0] = 0; // First column + + auto prev_line = line0; + auto current_line = line1; + score_t *tmp; + + // Fill matrix and remember max + AlignmentResult res = {0, 0}; + for (size_t i = 1; i <= seq_size; ++i) { + for (size_t j = 1; j <= primer_size; ++j) { + current_line[j] = std::max(prev_line[j - 1] + sw_similarity(seq[i - 1], primer[j - 1]), 0); + current_line[j] = std::max(current_line[j], prev_line[j] + SW_GAP_SCORE); + current_line[j] = std::max(current_line[j], current_line[j - 1] + SW_GAP_SCORE); + if (current_line[j] >= res.score) { + res.score = current_line[j]; + res.pos = i; + } } + // Swap the two pointers + tmp = prev_line; + prev_line = current_line; + current_line = tmp; } - double traceback[4]; - - int **I_i = (int **)malloc((lengthSeqA+1) * sizeof(int*)); - for(int i = 0; i < (lengthSeqA+1); i++) I_i[i] = (int *)malloc((lengthSeqB+1) * sizeof(int)); - - int **I_j = (int **)malloc((lengthSeqA+1) * sizeof(int*)); - for(int i = 0; i < (lengthSeqA+1); i++) I_j[i] = (int *)malloc((lengthSeqB+1) * sizeof(int)); + return res; +} - //start populating matrix - for (int i=1;i<=lengthSeqA;i++) - { - for(int j=1;j<=lengthSeqB;j++) - { - traceback[0] = matrix[i-1][j-1]+similarityScore(seqA[i-1],seqB[j-1]); - traceback[1] = matrix[i-1][j]+score_gap; - traceback[2] = matrix[i][j-1]+score_gap; - traceback[3] = 0; - matrix[i][j] = findMax(traceback,4); - switch(ind) - { - case 0: - I_i[i][j] = i-1; - I_j[i][j] = j-1; - break; - case 1: - I_i[i][j] = i-1; - I_j[i][j] = j; - break; - case 2: - I_i[i][j] = i; - I_j[i][j] = j-1; - break; - case 3: - I_i[i][j] = i; - I_j[i][j] = j; - break; - } - } +SmithWatermanWorker::AlignmentResult SmithWatermanWorker::align_reverse(const char *seq, const uint32_t seq_size, + const char *primer) { + // Reset data + for (size_t j = 0; j <= primer_size; ++j) { + line0[j] = 0; // First line } - - // find the max score in the matrix - double matrix_max = 0; - int i_max=0, j_max=0; - for(int i=1;i<=lengthSeqA;i++) - { - for(int j=1;j<=lengthSeqB;j++) - { - if(matrix[i][j]>=matrix_max) - { - matrix_max = matrix[i][j]; - i_max=i; - j_max=j; + line1[primer_size] = 0; // Last column + + auto prev_line = line0; + auto current_line = line1; + score_t *tmp; + + // Fill matrix and remember max + AlignmentResult res = {0, seq_size}; + for (int i = seq_size - 1; i >= 0; --i) { + for (int j = primer_size - 1; j >= 0; --j) { + current_line[j] = std::max(prev_line[j + 1] + sw_similarity(seq[i], primer[j]), 0); + current_line[j] = std::max(current_line[j], prev_line[j] + SW_GAP_SCORE); + current_line[j] = std::max(current_line[j], current_line[j + 1] + SW_GAP_SCORE); + if (current_line[j] >= res.score) { + res.score = current_line[j]; + res.pos = i; } } + // Swap the two pointers + tmp = prev_line; + prev_line = current_line; + current_line = tmp; } - //cout << "Max score in the matrix is " << matrix_max << endl; - - // traceback - - int current_i=i_max, current_j=j_max; - int next_i=I_i[current_i][current_j]; - int next_j=I_j[current_i][current_j]; - int tick=0; - //char consensus_a[lengthSeqA+lengthSeqB+2],consensus_b[lengthSeqA+lengthSeqB+2]; - - while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=0) && (next_i!=0)) - { - - //if(next_i==current_i) consensus_a[tick] = '-'; // deletion in A - //else consensus_a[tick] = seqA[current_i-1]; // match/mismatch in A - - //if(next_j==current_j) consensus_b[tick] = '-'; // deletion in B - //else consensus_b[tick] = seqB[current_j-1]; // match/mismatch in B - - current_i = next_i; - current_j = next_j; - next_i = I_i[current_i][current_j]; - next_j = I_j[current_i][current_j]; - tick++; - } - - for(int i = 0; i < lengthSeqA+1; i++) - { - free(matrix[i]); - free(I_i[i]); - free(I_j[i]); - } - free(matrix); - free(I_i); - free(I_j); - - //print the consensus sequences - //cout<<endl<<" "<<endl; - //cout<<"Alignment:"<<endl<<endl; - //for(int i=0;i<lengthSeqA;i++){cout<<seqA[i];}; cout<<" and"<<endl; - //for(int i=0;i<lengthSeqB;i++){cout<<seqB[i];}; cout<<endl<<endl; - //for(int i=tick-1;i>=0;i--) cout<<consensus_a[i]; - //cout<<endl; - //for(int j=tick-1;j>=0;j--) cout<<consensus_b[j]; - //cout<<endl; - - adjusted_score = matrix_max/lengthSeqB; // makes the score between 0 and 10 - align_start = next_i; - align_stop = i_max; - + return res; } - -/* -int main(int argc, char* argv[]) -{ - - //args = ref_sequence to_align_sequence - if(argc!=3){ - printf("usage : SmithWaterman ref_sequence to_align_sequence\n"); - return 1; - } - - //cout << argv[2] << endl; - string seqA = argv[1]; // sequence A - string seqB = argv[2]; // sequence B - - double adjusted_score; - int align_start; - int align_stop; - - SmithWaterman(seqA, seqB, adjusted_score, align_start, align_stop); - - cout << adjusted_score << " " << align_start << " " << align_stop << endl; - - return 0; -}*/ - diff --git a/partitioning/SmithWaterman.h b/partitioning/SmithWaterman.h deleted file mode 100644 index faf39f10b420c764e00b5b58af141eba267ada19..0000000000000000000000000000000000000000 --- a/partitioning/SmithWaterman.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef SMITH_WATERMAN_H -#define SMITH_WATERMAN_H - -#include <string> - -void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop); - -#endif // SMITH_WATERMAN_H diff --git a/partitioning/SmithWaterman.hpp b/partitioning/SmithWaterman.hpp new file mode 100644 index 0000000000000000000000000000000000000000..313381e244416493b8efa2edc5779caf12068237 --- /dev/null +++ b/partitioning/SmithWaterman.hpp @@ -0,0 +1,46 @@ +#ifndef B84D3238_413F_430F_94CC_E2EF840C19EB +#define B84D3238_413F_430F_94CC_E2EF840C19EB + +#include <algorithm> +#include <array> +#include <cstddef> +#include <cstdint> +#include <vector> + +typedef int score_t; + +constexpr score_t SW_GAP_SCORE = -7; +constexpr score_t SW_MISMATCH_SCORE = -5; +constexpr score_t SW_MATCH_SCORE = 10; + +constexpr float SW_GOOD_SCORE_FACTOR = 0.95; +constexpr float SW_GOOD_ENOUGH_SCORE_FACTOR = 0.75; + +constexpr size_t MAX_PRIMER_SIZE = 8; + +class SmithWatermanWorker { + public: + struct AlignmentResult { + score_t score; + size_t pos; + }; + + SmithWatermanWorker(const size_t primer_size_); + + /// @return The **end** position of the maximum score alignment + AlignmentResult align(const char* seq, const uint32_t seq_size, const char* primer); + + /// @return The **start** position of the maximum score alignment + AlignmentResult align_reverse(const char* seq, const uint32_t seq_size, const char* primer); + + score_t get_max_possible_score() const { return primer_size * SW_MATCH_SCORE; } + + private: + const size_t primer_size; + score_t line0[MAX_PRIMER_SIZE + 1]; + score_t line1[MAX_PRIMER_SIZE + 1]; + + static constexpr score_t sw_similarity(char a, char b) { return a == b ? SW_MATCH_SCORE : SW_MISMATCH_SCORE; } +}; + +#endif /* B84D3238_413F_430F_94CC_E2EF840C19EB */ diff --git a/partitioning/fast_clustering.cpp b/partitioning/fast_clustering.cpp index b7109f8be9328a6a130f5e85e343d11675cd86d7..eb9be28347cb44c266730f2a2e8ce04d9ba31839 100755 --- a/partitioning/fast_clustering.cpp +++ b/partitioning/fast_clustering.cpp @@ -1,176 +1,156 @@ -#include <iostream> -#include <fstream> +#include <algorithm> +#include <chrono> #include <filesystem> +#include <fstream> +#include <iostream> #include <string> #include <vector> -#include <algorithm> -#include <chrono> - -#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm +#include "SmithWaterman.hpp" // include the header file for the Smith-Waterman algorithm std::string reverse_complement(const std::string& sequence) { - // get the reverse complement of a sequence - std::string rc_sequence = sequence; - std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { - switch (c) { - case 'A': return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - default: return c; - } - }); - std::reverse(rc_sequence.begin(), rc_sequence.end()); - return rc_sequence; + // get the reverse complement of a sequence + std::string rc_sequence = sequence; + std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { + switch (c) { + case 'A': + return 'T'; + case 'C': + return 'G'; + case 'G': + return 'C'; + case 'T': + return 'A'; + default: + return c; + } + }); + std::reverse(rc_sequence.begin(), rc_sequence.end()); + return rc_sequence; } -std::vector<std::string> read_fastq(const std::string& filename) { - // function to read FASTQ file and return a vector of sequences - std::ifstream file(filename); - std::vector<std::string> sequences; - std::string line; +std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level, + SmithWatermanWorker& sw_worker, const score_t good_score_threshold, + const score_t good_enough_score_threshold) { + /* + get the <split_level> bases following the primer in the read, also try to find the primer in the reverse complement + sequence + */ - if (!file.is_open()) { - std::cerr << "Error opening file: " << filename << std::endl; - return sequences; - } + // call local alignment algorithm on the forward sequence + auto res_direct = sw_worker.align(read.data(), read.size(), seq_to_find.data()); - // Skip the header line - while (std::getline(file, line)) { + // don't test for the reverse complement if near perfect alignment already found (maximum possible score = 10) + if (res_direct.score >= good_score_threshold && read.size() > res_direct.pos + split_level) { + return read.substr(res_direct.pos, split_level); // extract the cluster id + } - // Read the sequence line - std::getline(file, line); - sequences.push_back(line); + std::string read_rc = reverse_complement(read); // get reverse complement of the read sequence - // Skip the quality score lines - std::getline(file, line); - std::getline(file, line); - } + // test local alignement with the reverse complement sequence + auto res_revcomp = sw_worker.align(read_rc.data(), read_rc.size(), seq_to_find.data()); - file.close(); + if (res_direct.score >= res_revcomp.score) { // primer found in forward sequence + if (res_direct.score >= good_enough_score_threshold && read.size() > res_direct.pos + split_level) { + return read.substr(res_direct.pos, split_level); + } + } else { // primer found in reverse complement sequence + if (res_revcomp.score >= good_enough_score_threshold && read_rc.size() > res_revcomp.pos + split_level) { + return read_rc.substr(res_revcomp.pos, split_level); + } + } - return sequences; + // score too weak, or read too small = ignore the read + return "None"; } - -std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) { - /* - get the <split_level> bases following the primer in the read, also try to find the primer in the reverse complement sequence - */ - - double score_fw, score_rc; // alignment score with the forward and reverse complement sequence - int x_fw, y_fw, x_rc, y_rc; // positions start and stop of the alignments found, x_fw and x_rc not used - - // call local alignment algorithm on the forward sequence - SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw); - - // don't test for the reverse complement if near perfect alignment already found (maximum possible score = 10) - if (score_fw >= 9.5 && read.size() > y_fw + split_level) { - return read.substr(y_fw, split_level); // extract the cluster id - } - - std::string read_rc = reverse_complement(read); // get reverse complement of the read sequence - - // test local alignement with the reverse complement sequence - SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); - - if (score_fw >= score_rc) { // primer found in forward sequence - if (score_fw >= 7.5 && read.size() > y_fw + split_level) { - return read.substr(y_fw, split_level); - } - } else { // primer found in reverse complement sequence - if (score_rc >= 7.5 && read_rc.size() > y_rc + split_level) { - return read_rc.substr(y_rc, split_level); - } - } - - // score too weak, or read too small = ignore the read - return "None"; +void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, + const std::string& output_dir) { + /* split_lvel = nbr of bases to use for the cluster id + */ + + // delete all fasta files in the clusters dir + std::filesystem::path dir_path(output_dir); + try { + for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { + if (entry.path().extension() == ".fasta") { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::filesystem::filesystem_error& ex) { + std::cerr << "Error deleting files: " << ex.what() << '\n'; + } + + int seq_to_find_size = std::min(start_primer.size(), MAX_PRIMER_SIZE); + std::string seq_to_find = + start_primer.substr(start_primer.size() - + seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10 + + std::ifstream input_read_file(reads_path); // read the fastq + + std::string read_name; + std::string sequence; + std::string line; + + SmithWatermanWorker sw_worker(seq_to_find.size()); + auto max_possible_score = sw_worker.get_max_possible_score(); + score_t good_score_threshold = max_possible_score * SW_GOOD_SCORE_FACTOR; + score_t good_enough_score_threshold = max_possible_score * SW_GOOD_ENOUGH_SCORE_FACTOR; + + // browse the reads file and skip the header line + while (std::getline(input_read_file, read_name)) { + // read the sequence line + std::getline(input_read_file, sequence); + + // Skip the quality score lines + std::getline(input_read_file, line); + std::getline(input_read_file, line); + + // find the position for the primer and get the bases following it + std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level, sw_worker, good_score_threshold, + good_enough_score_threshold); + + if (cluster_id == "None") { + continue; // ignore the read + } + // write the sequence and its id to a cluster file corresponding to the bases found after the primer + std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", + std::ios::app); // ios app is to append in the file + if (!output_file.is_open()) { + std::cerr << "Error opening output file\n"; + return; + } + output_file << ">" << read_name << "\n" << sequence << "\n"; + } + + input_read_file.close(); } - -void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) { - /* split_lvel = nbr of bases to use for the cluster id - */ - - //delete all fasta files in the clusters dir - std::filesystem::path dir_path(output_dir); - try { - for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { - if (entry.path().extension() == ".fasta") { - std::filesystem::remove(entry.path()); - } - } - } catch (const std::filesystem::filesystem_error& ex) { - std::cerr << "Error deleting files: " << ex.what() << '\n'; - } - - int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8); - std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10 - - std::ifstream input_read_file(reads_path); // read the fastq - - std::string read_name; - std::string sequence; - std::string line; - - // browse the reads file and skip the header line - while (std::getline(input_read_file, read_name)) { - - // read the sequence line - std::getline(input_read_file, sequence); - - // Skip the quality score lines - std::getline(input_read_file, line); - std::getline(input_read_file, line); - - // find the position for the primer and get the bases following it - std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level); - - if (cluster_id == "None") { - continue; // ignore the read - } - // write the sequence and its id to a cluster file corresponding to the bases found after the primer - std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file - if (!output_file.is_open()) { - std::cerr << "Error opening output file\n"; - return; - } - output_file << ">" << read_name << "\n" << sequence << "\n"; - } - - input_read_file.close(); -} - - - int main(int argc, char* argv[]) { + // check if the input and output file paths are provided as arguments + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl; + return 1; + } // ./fast_clustering reads.fastq clusters_dir_path - // check if the input and output file paths are provided as arguments - if (argc != 3) { - std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl; - return 1; - } // ./fast_clustering reads.fastq clusters_dir_path - - // get the input and output paths from the arguments - std::string input_fastq = argv[1]; - std::string output_dir = argv[2]; // must be an existing dir + // get the input and output paths from the arguments + std::string input_fastq = argv[1]; + std::string output_dir = argv[2]; // must be an existing dir - // start a timer - auto start = std::chrono::high_resolution_clock::now(); + // start a timer + auto start = std::chrono::high_resolution_clock::now(); - std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; + std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; - clustering(input_fastq, start_primer, 3, output_dir); + clustering(input_fastq, start_primer, 3, output_dir); - // end the timer and print the elapsed time - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration<double> elapsed = end - start; - std::cout << elapsed.count() << "s for fast clustering" << std::endl; + // end the timer and print the elapsed time + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed = end - start; + std::cout << elapsed.count() << "s for fast clustering" << std::endl; - return 0; + return 0; } -//g++ -o fast_clustering fast_clustering.cpp SmithWaterman.cpp && ./fast_clustering reads.fastq cluster_dir +// g++ -o fast_clustering fast_clustering.cpp SmithWaterman.cpp -lstdc++fs -std=c++17 -O3 -g -Wall -Wextra && ./fast_clustering reads.fastq cluster_dir