From a3fbdd2c68a5c778941e91625707e0405d09ab1d Mon Sep 17 00:00:00 2001
From: oboulle <olivier.boulle@inria.fr>
Date: Thu, 11 Jul 2024 15:14:33 +0200
Subject: [PATCH] ini

---
 partitioning/SmithWaterman.cpp   | 216 +++++++++++++++++++++++++++++++
 partitioning/SmithWaterman.h     |   8 ++
 partitioning/fast_clustering.cpp | 189 +++++++++++++++++++++++++++
 3 files changed, 413 insertions(+)
 create mode 100644 partitioning/SmithWaterman.cpp
 create mode 100644 partitioning/SmithWaterman.h
 create mode 100755 partitioning/fast_clustering.cpp

diff --git a/partitioning/SmithWaterman.cpp b/partitioning/SmithWaterman.cpp
new file mode 100644
index 0000000..d79b6eb
--- /dev/null
+++ b/partitioning/SmithWaterman.cpp
@@ -0,0 +1,216 @@
+//////////////////////////////////////////////
+// Simple Ends-Free Smith-Waterman Algorithm
+//
+// You will be prompted for input sequences
+// Penalties and match scores are hard-coded
+//
+// Program does not perform multiple tracebacks if 
+// it finds several alignments with the same score
+//
+// By Nikhil Gopal
+// Similar implementation here: https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm
+//////////////////////////////////////////////
+//g++ SmithWaterman.cpp -o SmithWaterman
+
+#include "SmithWaterman.h"
+
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <sys/time.h>
+using namespace std;
+
+int ind; 
+
+double score_match = 10;
+double score_gap = -7;
+double score_mismatch = -5;
+
+double similarityScore(char a, char b);
+double findMax(double array[], int length);
+
+double similarityScore(char a, char b)
+{
+	double result;
+	if(a==b)
+	{
+		result=score_match;
+	}
+	else
+	{
+		result=score_mismatch;
+	}
+	return result;
+}
+
+double findMax(double array[], int length)
+{
+	double max = array[0];
+	ind = 0;
+
+	for(int i=1; i<length; i++)
+	{
+		if(array[i] > max)
+		{
+			max = array[i];
+			ind=i;
+		}
+	}
+	return max;
+}
+
+
+void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop) {
+
+    // initialize some variables
+	int lengthSeqA = seqA.length();
+	int lengthSeqB = seqB.length();
+
+	// initialize empty matrix
+	//double matrix[lengthSeqA+1][lengthSeqB+1];
+	double **matrix = (double **)malloc((lengthSeqA+1) * sizeof(double*));
+	for(int i = 0; i < (lengthSeqA+1); i++) matrix[i] = (double *)malloc((lengthSeqB+1) * sizeof(double));
+
+	for(int i=0;i<=lengthSeqA;i++)
+	{
+		for(int j=0;j<=lengthSeqB;j++)
+		{
+			matrix[i][j]=0;
+		}
+	}
+
+	double traceback[4];
+
+	int **I_i = (int **)malloc((lengthSeqA+1) * sizeof(int*));
+	for(int i = 0; i < (lengthSeqA+1); i++) I_i[i] = (int *)malloc((lengthSeqB+1) * sizeof(int));
+
+	int **I_j = (int **)malloc((lengthSeqA+1) * sizeof(int*));
+	for(int i = 0; i < (lengthSeqA+1); i++) I_j[i] = (int *)malloc((lengthSeqB+1) * sizeof(int));
+
+	//start populating matrix
+	for (int i=1;i<=lengthSeqA;i++)
+	{
+		for(int j=1;j<=lengthSeqB;j++)
+        {
+			traceback[0] = matrix[i-1][j-1]+similarityScore(seqA[i-1],seqB[j-1]);
+			traceback[1] = matrix[i-1][j]+score_gap;
+			traceback[2] = matrix[i][j-1]+score_gap;
+			traceback[3] = 0;
+			matrix[i][j] = findMax(traceback,4);
+			switch(ind)
+			{
+				case 0:
+					I_i[i][j] = i-1;
+					I_j[i][j] = j-1;
+					break;
+				case 1:
+					I_i[i][j] = i-1;
+                    I_j[i][j] = j;
+                    break;
+				case 2:
+					I_i[i][j] = i;
+					I_j[i][j] = j-1;
+					break;
+				case 3:
+					I_i[i][j] = i;
+					I_j[i][j] = j;
+					break;
+			}
+        }
+	}
+
+	// find the max score in the matrix
+	double matrix_max = 0;
+	int i_max=0, j_max=0;
+	for(int i=1;i<=lengthSeqA;i++)
+	{
+		for(int j=1;j<=lengthSeqB;j++)
+		{
+			if(matrix[i][j]>=matrix_max)
+			{
+				matrix_max = matrix[i][j];
+				i_max=i;
+				j_max=j;
+			}
+		}
+	}
+
+	//cout << "Max score in the matrix is " << matrix_max << endl;
+
+	// traceback
+
+	int current_i=i_max, current_j=j_max;
+	int next_i=I_i[current_i][current_j];
+	int next_j=I_j[current_i][current_j];
+	int tick=0;
+	//char consensus_a[lengthSeqA+lengthSeqB+2],consensus_b[lengthSeqA+lengthSeqB+2];
+
+	while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=0) && (next_i!=0))
+	{
+
+		//if(next_i==current_i)  consensus_a[tick] = '-';                  // deletion in A
+		//else                   consensus_a[tick] = seqA[current_i-1];   // match/mismatch in A
+
+		//if(next_j==current_j)  consensus_b[tick] = '-';                  // deletion in B
+		//else                   consensus_b[tick] = seqB[current_j-1];   // match/mismatch in B
+
+		current_i = next_i;
+		current_j = next_j;
+		next_i = I_i[current_i][current_j];
+		next_j = I_j[current_i][current_j];
+		tick++;
+	}
+
+	for(int i = 0; i < lengthSeqA+1; i++)
+	{
+	    free(matrix[i]);
+		free(I_i[i]);
+		free(I_j[i]);
+	}
+	free(matrix);
+	free(I_i);
+	free(I_j);
+
+	//print the consensus sequences
+	//cout<<endl<<" "<<endl;
+	//cout<<"Alignment:"<<endl<<endl;
+	//for(int i=0;i<lengthSeqA;i++){cout<<seqA[i];}; cout<<"  and"<<endl;
+	//for(int i=0;i<lengthSeqB;i++){cout<<seqB[i];}; cout<<endl<<endl;
+	//for(int i=tick-1;i>=0;i--) cout<<consensus_a[i];
+	//cout<<endl;
+	//for(int j=tick-1;j>=0;j--) cout<<consensus_b[j];
+	//cout<<endl;
+
+	adjusted_score = matrix_max/lengthSeqB;
+	align_start = next_i;
+	align_stop = i_max;
+
+}
+
+/*
+int main(int argc, char* argv[])
+{
+
+	//args = ref_sequence to_align_sequence
+	if(argc!=3){
+		printf("usage : SmithWaterman ref_sequence to_align_sequence\n");
+		return 1;
+	}
+
+	//cout << argv[2] << endl;
+	string seqA = argv[1]; // sequence A
+	string seqB = argv[2]; // sequence B
+
+    double adjusted_score;
+    int align_start;
+    int align_stop;
+
+	SmithWaterman(seqA, seqB, adjusted_score, align_start, align_stop);
+
+	cout << adjusted_score << " " << align_start << " " << align_stop << endl;
+
+	return 0;
+}*/
+
diff --git a/partitioning/SmithWaterman.h b/partitioning/SmithWaterman.h
new file mode 100644
index 0000000..faf39f1
--- /dev/null
+++ b/partitioning/SmithWaterman.h
@@ -0,0 +1,8 @@
+#ifndef SMITH_WATERMAN_H
+#define SMITH_WATERMAN_H
+
+#include <string>
+
+void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop);
+
+#endif // SMITH_WATERMAN_H
diff --git a/partitioning/fast_clustering.cpp b/partitioning/fast_clustering.cpp
new file mode 100755
index 0000000..237fa79
--- /dev/null
+++ b/partitioning/fast_clustering.cpp
@@ -0,0 +1,189 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <list>
+#include <bitset>
+#include <cmath>
+#include <algorithm>
+#include <filesystem>
+#include <chrono>
+
+#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm
+
+
+std::string reverse_complement(const std::string& sequence) {
+    // get the reverse complement of a sequence
+    std::string rc_sequence = sequence;
+    std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) {
+        switch (c) {
+            case 'A': return 'T';
+            case 'C': return 'G';
+            case 'G': return 'C';
+            case 'T': return 'A';
+            default: return c;
+        }
+    });
+    std::reverse(rc_sequence.begin(), rc_sequence.end());
+    return rc_sequence;
+}
+
+
+
+int count_sequences(const std::string& input_fastq) {
+    // get the number of sequences from the fastq
+    std::ifstream file(input_fastq);
+    std::string line;
+    int seq_number = 0;
+
+    while (std::getline(file, line)) {
+        if (line[0] == '@') {
+            seq_number++;
+        }
+    }
+    return seq_number;
+}
+
+// function to read FASTQ file and return a vector of sequences
+std::vector<std::string> read_fastq(const std::string& filename) {
+    std::ifstream file(filename);
+    std::vector<std::string> sequences;
+    std::string line;
+
+    if (!file.is_open()) {
+        std::cerr << "Error opening file: " << filename << std::endl;
+        return sequences;
+    }
+
+    // Skip the header line
+    while (std::getline(file, line)) {
+
+        // Read the sequence line
+        std::getline(file, line);
+        sequences.push_back(line);
+
+        // Skip the quality score lines
+        std::getline(file, line);
+        std::getline(file, line);
+    }
+
+    file.close();
+
+    return sequences;
+}
+
+
+std::string get_cluster_id(const std::string& read, const std::string& start_primer, int split_level) {
+    /*
+    get the <split_level> bases following the primer in the read, check if the read is in reverse complement
+    */
+
+    std::string read_rc = reverse_complement(read); // reverse complement of the read sequence
+
+    // get primer end position, check if read is forward
+    int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 12);
+    std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10
+
+    double score_fw, score_rc;
+    int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used
+
+    // call local alignment algorithm on the forward sequence
+    SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw);
+
+    // same for the reverse complement sequence
+    SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc);
+
+    if (score_fw >= score_rc) { // primer found in forward sequence
+        if (score_fw >= 8 && read.size() > y_fw + split_level) {
+            return read.substr(y_fw, split_level);
+        }
+    } else { // primer found in reverse complement sequence
+        if (score_rc >= 8 && read_rc.size() > y_rc + split_level) {
+            return read_rc.substr(y_rc, split_level);
+        }
+    }
+
+    // score too weak, or read too small = ignore the read
+    return "None";
+}
+
+
+void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) {
+    /*
+    */
+    // start a timer
+    auto start = std::chrono::high_resolution_clock::now();
+
+
+    std::ifstream input_read_file(reads_path);
+
+    std::string read_name;
+    std::string sequence;
+    std::string line;
+
+
+    // browse the reads file and skip the header line
+    while (std::getline(input_read_file, read_name)) {
+
+        // read the sequence line
+        std::getline(input_read_file, sequence);
+
+        // Skip the quality score lines
+        std::getline(input_read_file, line);
+        std::getline(input_read_file, line);
+
+        std::string cluster_id = get_cluster_id(sequence, start_primer, split_level);
+
+        if (cluster_id == "None") {
+            continue;
+        }
+        std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file
+        if (!output_file.is_open()) {
+            std::cerr << "Error opening output file\n";
+            return;
+        }
+        output_file << ">" << read_name << "\n" << sequence << "\n";
+    }
+
+    input_read_file.close();
+}
+
+
+
+int main(int argc, char* argv[]) {
+
+    // check if the input and output file paths are provided as arguments
+    if (argc != 3) {
+        std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl;
+        return 1;
+    } // ./fast_clustering reads.fastq clusters_dir_path
+
+    // get the input and output paths from the arguments
+    //std::string input_fastq = argv[1];
+    //std::string output_dir = argv[2];
+
+    std::string data_name = "10k";
+
+    std::string input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq";
+    std::string output_dir = "new_tests/"+data_name+"/clusters_"+data_name;
+
+    //delete all fasta files in the clusters dir
+    std::filesystem::path dir_path(output_dir);
+    try {
+        for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
+            if (entry.path().extension() == ".fasta") {
+                std::filesystem::remove(entry.path());
+            }
+        }
+    } catch (const std::filesystem::filesystem_error& ex) {
+        std::cerr << "Error deleting files: " << ex.what() << '\n';
+    }
+
+    std::string start_primer = "AGCCATCGTACGTAGCGGAT";
+
+    clustering(input_fastq, start_primer, 3, output_dir);
+
+    return 0;
+}
+
+//g++ -o partitioning partitioning.cpp
-- 
GitLab