From a3fbdd2c68a5c778941e91625707e0405d09ab1d Mon Sep 17 00:00:00 2001 From: oboulle <olivier.boulle@inria.fr> Date: Thu, 11 Jul 2024 15:14:33 +0200 Subject: [PATCH] ini --- partitioning/SmithWaterman.cpp | 216 +++++++++++++++++++++++++++++++ partitioning/SmithWaterman.h | 8 ++ partitioning/fast_clustering.cpp | 189 +++++++++++++++++++++++++++ 3 files changed, 413 insertions(+) create mode 100644 partitioning/SmithWaterman.cpp create mode 100644 partitioning/SmithWaterman.h create mode 100755 partitioning/fast_clustering.cpp diff --git a/partitioning/SmithWaterman.cpp b/partitioning/SmithWaterman.cpp new file mode 100644 index 0000000..d79b6eb --- /dev/null +++ b/partitioning/SmithWaterman.cpp @@ -0,0 +1,216 @@ +////////////////////////////////////////////// +// Simple Ends-Free Smith-Waterman Algorithm +// +// You will be prompted for input sequences +// Penalties and match scores are hard-coded +// +// Program does not perform multiple tracebacks if +// it finds several alignments with the same score +// +// By Nikhil Gopal +// Similar implementation here: https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm +////////////////////////////////////////////// +//g++ SmithWaterman.cpp -o SmithWaterman + +#include "SmithWaterman.h" + +#include <iostream> +#include <fstream> +#include <cstdlib> +#include <string> +#include <cmath> +#include <sys/time.h> +using namespace std; + +int ind; + +double score_match = 10; +double score_gap = -7; +double score_mismatch = -5; + +double similarityScore(char a, char b); +double findMax(double array[], int length); + +double similarityScore(char a, char b) +{ + double result; + if(a==b) + { + result=score_match; + } + else + { + result=score_mismatch; + } + return result; +} + +double findMax(double array[], int length) +{ + double max = array[0]; + ind = 0; + + for(int i=1; i<length; i++) + { + if(array[i] > max) + { + max = array[i]; + ind=i; + } + } + return max; +} + + +void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop) { + + // initialize some variables + int lengthSeqA = seqA.length(); + int lengthSeqB = seqB.length(); + + // initialize empty matrix + //double matrix[lengthSeqA+1][lengthSeqB+1]; + double **matrix = (double **)malloc((lengthSeqA+1) * sizeof(double*)); + for(int i = 0; i < (lengthSeqA+1); i++) matrix[i] = (double *)malloc((lengthSeqB+1) * sizeof(double)); + + for(int i=0;i<=lengthSeqA;i++) + { + for(int j=0;j<=lengthSeqB;j++) + { + matrix[i][j]=0; + } + } + + double traceback[4]; + + int **I_i = (int **)malloc((lengthSeqA+1) * sizeof(int*)); + for(int i = 0; i < (lengthSeqA+1); i++) I_i[i] = (int *)malloc((lengthSeqB+1) * sizeof(int)); + + int **I_j = (int **)malloc((lengthSeqA+1) * sizeof(int*)); + for(int i = 0; i < (lengthSeqA+1); i++) I_j[i] = (int *)malloc((lengthSeqB+1) * sizeof(int)); + + //start populating matrix + for (int i=1;i<=lengthSeqA;i++) + { + for(int j=1;j<=lengthSeqB;j++) + { + traceback[0] = matrix[i-1][j-1]+similarityScore(seqA[i-1],seqB[j-1]); + traceback[1] = matrix[i-1][j]+score_gap; + traceback[2] = matrix[i][j-1]+score_gap; + traceback[3] = 0; + matrix[i][j] = findMax(traceback,4); + switch(ind) + { + case 0: + I_i[i][j] = i-1; + I_j[i][j] = j-1; + break; + case 1: + I_i[i][j] = i-1; + I_j[i][j] = j; + break; + case 2: + I_i[i][j] = i; + I_j[i][j] = j-1; + break; + case 3: + I_i[i][j] = i; + I_j[i][j] = j; + break; + } + } + } + + // find the max score in the matrix + double matrix_max = 0; + int i_max=0, j_max=0; + for(int i=1;i<=lengthSeqA;i++) + { + for(int j=1;j<=lengthSeqB;j++) + { + if(matrix[i][j]>=matrix_max) + { + matrix_max = matrix[i][j]; + i_max=i; + j_max=j; + } + } + } + + //cout << "Max score in the matrix is " << matrix_max << endl; + + // traceback + + int current_i=i_max, current_j=j_max; + int next_i=I_i[current_i][current_j]; + int next_j=I_j[current_i][current_j]; + int tick=0; + //char consensus_a[lengthSeqA+lengthSeqB+2],consensus_b[lengthSeqA+lengthSeqB+2]; + + while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=0) && (next_i!=0)) + { + + //if(next_i==current_i) consensus_a[tick] = '-'; // deletion in A + //else consensus_a[tick] = seqA[current_i-1]; // match/mismatch in A + + //if(next_j==current_j) consensus_b[tick] = '-'; // deletion in B + //else consensus_b[tick] = seqB[current_j-1]; // match/mismatch in B + + current_i = next_i; + current_j = next_j; + next_i = I_i[current_i][current_j]; + next_j = I_j[current_i][current_j]; + tick++; + } + + for(int i = 0; i < lengthSeqA+1; i++) + { + free(matrix[i]); + free(I_i[i]); + free(I_j[i]); + } + free(matrix); + free(I_i); + free(I_j); + + //print the consensus sequences + //cout<<endl<<" "<<endl; + //cout<<"Alignment:"<<endl<<endl; + //for(int i=0;i<lengthSeqA;i++){cout<<seqA[i];}; cout<<" and"<<endl; + //for(int i=0;i<lengthSeqB;i++){cout<<seqB[i];}; cout<<endl<<endl; + //for(int i=tick-1;i>=0;i--) cout<<consensus_a[i]; + //cout<<endl; + //for(int j=tick-1;j>=0;j--) cout<<consensus_b[j]; + //cout<<endl; + + adjusted_score = matrix_max/lengthSeqB; + align_start = next_i; + align_stop = i_max; + +} + +/* +int main(int argc, char* argv[]) +{ + + //args = ref_sequence to_align_sequence + if(argc!=3){ + printf("usage : SmithWaterman ref_sequence to_align_sequence\n"); + return 1; + } + + //cout << argv[2] << endl; + string seqA = argv[1]; // sequence A + string seqB = argv[2]; // sequence B + + double adjusted_score; + int align_start; + int align_stop; + + SmithWaterman(seqA, seqB, adjusted_score, align_start, align_stop); + + cout << adjusted_score << " " << align_start << " " << align_stop << endl; + + return 0; +}*/ + diff --git a/partitioning/SmithWaterman.h b/partitioning/SmithWaterman.h new file mode 100644 index 0000000..faf39f1 --- /dev/null +++ b/partitioning/SmithWaterman.h @@ -0,0 +1,8 @@ +#ifndef SMITH_WATERMAN_H +#define SMITH_WATERMAN_H + +#include <string> + +void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop); + +#endif // SMITH_WATERMAN_H diff --git a/partitioning/fast_clustering.cpp b/partitioning/fast_clustering.cpp new file mode 100755 index 0000000..237fa79 --- /dev/null +++ b/partitioning/fast_clustering.cpp @@ -0,0 +1,189 @@ +#include <iostream> +#include <fstream> +#include <string> +#include <vector> +#include <list> +#include <bitset> +#include <cmath> +#include <algorithm> +#include <filesystem> +#include <chrono> + +#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm + + +std::string reverse_complement(const std::string& sequence) { + // get the reverse complement of a sequence + std::string rc_sequence = sequence; + std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { + switch (c) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + default: return c; + } + }); + std::reverse(rc_sequence.begin(), rc_sequence.end()); + return rc_sequence; +} + + + +int count_sequences(const std::string& input_fastq) { + // get the number of sequences from the fastq + std::ifstream file(input_fastq); + std::string line; + int seq_number = 0; + + while (std::getline(file, line)) { + if (line[0] == '@') { + seq_number++; + } + } + return seq_number; +} + +// function to read FASTQ file and return a vector of sequences +std::vector<std::string> read_fastq(const std::string& filename) { + std::ifstream file(filename); + std::vector<std::string> sequences; + std::string line; + + if (!file.is_open()) { + std::cerr << "Error opening file: " << filename << std::endl; + return sequences; + } + + // Skip the header line + while (std::getline(file, line)) { + + // Read the sequence line + std::getline(file, line); + sequences.push_back(line); + + // Skip the quality score lines + std::getline(file, line); + std::getline(file, line); + } + + file.close(); + + return sequences; +} + + +std::string get_cluster_id(const std::string& read, const std::string& start_primer, int split_level) { + /* + get the <split_level> bases following the primer in the read, check if the read is in reverse complement + */ + + std::string read_rc = reverse_complement(read); // reverse complement of the read sequence + + // get primer end position, check if read is forward + int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 12); + std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10 + + double score_fw, score_rc; + int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used + + // call local alignment algorithm on the forward sequence + SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw); + + // same for the reverse complement sequence + SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); + + if (score_fw >= score_rc) { // primer found in forward sequence + if (score_fw >= 8 && read.size() > y_fw + split_level) { + return read.substr(y_fw, split_level); + } + } else { // primer found in reverse complement sequence + if (score_rc >= 8 && read_rc.size() > y_rc + split_level) { + return read_rc.substr(y_rc, split_level); + } + } + + // score too weak, or read too small = ignore the read + return "None"; +} + + +void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) { + /* + */ + // start a timer + auto start = std::chrono::high_resolution_clock::now(); + + + std::ifstream input_read_file(reads_path); + + std::string read_name; + std::string sequence; + std::string line; + + + // browse the reads file and skip the header line + while (std::getline(input_read_file, read_name)) { + + // read the sequence line + std::getline(input_read_file, sequence); + + // Skip the quality score lines + std::getline(input_read_file, line); + std::getline(input_read_file, line); + + std::string cluster_id = get_cluster_id(sequence, start_primer, split_level); + + if (cluster_id == "None") { + continue; + } + std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file + if (!output_file.is_open()) { + std::cerr << "Error opening output file\n"; + return; + } + output_file << ">" << read_name << "\n" << sequence << "\n"; + } + + input_read_file.close(); +} + + + +int main(int argc, char* argv[]) { + + // check if the input and output file paths are provided as arguments + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl; + return 1; + } // ./fast_clustering reads.fastq clusters_dir_path + + // get the input and output paths from the arguments + //std::string input_fastq = argv[1]; + //std::string output_dir = argv[2]; + + std::string data_name = "10k"; + + std::string input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq"; + std::string output_dir = "new_tests/"+data_name+"/clusters_"+data_name; + + //delete all fasta files in the clusters dir + std::filesystem::path dir_path(output_dir); + try { + for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { + if (entry.path().extension() == ".fasta") { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::filesystem::filesystem_error& ex) { + std::cerr << "Error deleting files: " << ex.what() << '\n'; + } + + std::string start_primer = "AGCCATCGTACGTAGCGGAT"; + + clustering(input_fastq, start_primer, 3, output_dir); + + return 0; +} + +//g++ -o partitioning partitioning.cpp -- GitLab