Mentions légales du service

Skip to content
Snippets Groups Projects
Commit a3fbdd2c authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

ini

parent 182c7ce4
No related branches found
No related tags found
No related merge requests found
//////////////////////////////////////////////
// Simple Ends-Free Smith-Waterman Algorithm
//
// You will be prompted for input sequences
// Penalties and match scores are hard-coded
//
// Program does not perform multiple tracebacks if
// it finds several alignments with the same score
//
// By Nikhil Gopal
// Similar implementation here: https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm
//////////////////////////////////////////////
//g++ SmithWaterman.cpp -o SmithWaterman
#include "SmithWaterman.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <sys/time.h>
using namespace std;
int ind;
double score_match = 10;
double score_gap = -7;
double score_mismatch = -5;
double similarityScore(char a, char b);
double findMax(double array[], int length);
double similarityScore(char a, char b)
{
double result;
if(a==b)
{
result=score_match;
}
else
{
result=score_mismatch;
}
return result;
}
double findMax(double array[], int length)
{
double max = array[0];
ind = 0;
for(int i=1; i<length; i++)
{
if(array[i] > max)
{
max = array[i];
ind=i;
}
}
return max;
}
void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop) {
// initialize some variables
int lengthSeqA = seqA.length();
int lengthSeqB = seqB.length();
// initialize empty matrix
//double matrix[lengthSeqA+1][lengthSeqB+1];
double **matrix = (double **)malloc((lengthSeqA+1) * sizeof(double*));
for(int i = 0; i < (lengthSeqA+1); i++) matrix[i] = (double *)malloc((lengthSeqB+1) * sizeof(double));
for(int i=0;i<=lengthSeqA;i++)
{
for(int j=0;j<=lengthSeqB;j++)
{
matrix[i][j]=0;
}
}
double traceback[4];
int **I_i = (int **)malloc((lengthSeqA+1) * sizeof(int*));
for(int i = 0; i < (lengthSeqA+1); i++) I_i[i] = (int *)malloc((lengthSeqB+1) * sizeof(int));
int **I_j = (int **)malloc((lengthSeqA+1) * sizeof(int*));
for(int i = 0; i < (lengthSeqA+1); i++) I_j[i] = (int *)malloc((lengthSeqB+1) * sizeof(int));
//start populating matrix
for (int i=1;i<=lengthSeqA;i++)
{
for(int j=1;j<=lengthSeqB;j++)
{
traceback[0] = matrix[i-1][j-1]+similarityScore(seqA[i-1],seqB[j-1]);
traceback[1] = matrix[i-1][j]+score_gap;
traceback[2] = matrix[i][j-1]+score_gap;
traceback[3] = 0;
matrix[i][j] = findMax(traceback,4);
switch(ind)
{
case 0:
I_i[i][j] = i-1;
I_j[i][j] = j-1;
break;
case 1:
I_i[i][j] = i-1;
I_j[i][j] = j;
break;
case 2:
I_i[i][j] = i;
I_j[i][j] = j-1;
break;
case 3:
I_i[i][j] = i;
I_j[i][j] = j;
break;
}
}
}
// find the max score in the matrix
double matrix_max = 0;
int i_max=0, j_max=0;
for(int i=1;i<=lengthSeqA;i++)
{
for(int j=1;j<=lengthSeqB;j++)
{
if(matrix[i][j]>=matrix_max)
{
matrix_max = matrix[i][j];
i_max=i;
j_max=j;
}
}
}
//cout << "Max score in the matrix is " << matrix_max << endl;
// traceback
int current_i=i_max, current_j=j_max;
int next_i=I_i[current_i][current_j];
int next_j=I_j[current_i][current_j];
int tick=0;
//char consensus_a[lengthSeqA+lengthSeqB+2],consensus_b[lengthSeqA+lengthSeqB+2];
while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=0) && (next_i!=0))
{
//if(next_i==current_i) consensus_a[tick] = '-'; // deletion in A
//else consensus_a[tick] = seqA[current_i-1]; // match/mismatch in A
//if(next_j==current_j) consensus_b[tick] = '-'; // deletion in B
//else consensus_b[tick] = seqB[current_j-1]; // match/mismatch in B
current_i = next_i;
current_j = next_j;
next_i = I_i[current_i][current_j];
next_j = I_j[current_i][current_j];
tick++;
}
for(int i = 0; i < lengthSeqA+1; i++)
{
free(matrix[i]);
free(I_i[i]);
free(I_j[i]);
}
free(matrix);
free(I_i);
free(I_j);
//print the consensus sequences
//cout<<endl<<" "<<endl;
//cout<<"Alignment:"<<endl<<endl;
//for(int i=0;i<lengthSeqA;i++){cout<<seqA[i];}; cout<<" and"<<endl;
//for(int i=0;i<lengthSeqB;i++){cout<<seqB[i];}; cout<<endl<<endl;
//for(int i=tick-1;i>=0;i--) cout<<consensus_a[i];
//cout<<endl;
//for(int j=tick-1;j>=0;j--) cout<<consensus_b[j];
//cout<<endl;
adjusted_score = matrix_max/lengthSeqB;
align_start = next_i;
align_stop = i_max;
}
/*
int main(int argc, char* argv[])
{
//args = ref_sequence to_align_sequence
if(argc!=3){
printf("usage : SmithWaterman ref_sequence to_align_sequence\n");
return 1;
}
//cout << argv[2] << endl;
string seqA = argv[1]; // sequence A
string seqB = argv[2]; // sequence B
double adjusted_score;
int align_start;
int align_stop;
SmithWaterman(seqA, seqB, adjusted_score, align_start, align_stop);
cout << adjusted_score << " " << align_start << " " << align_stop << endl;
return 0;
}*/
#ifndef SMITH_WATERMAN_H
#define SMITH_WATERMAN_H
#include <string>
void SmithWaterman(const std::string& seqA, const std::string& seqB, double& adjusted_score, int& align_start, int& align_stop);
#endif // SMITH_WATERMAN_H
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <list>
#include <bitset>
#include <cmath>
#include <algorithm>
#include <filesystem>
#include <chrono>
#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm
std::string reverse_complement(const std::string& sequence) {
// get the reverse complement of a sequence
std::string rc_sequence = sequence;
std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) {
switch (c) {
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
default: return c;
}
});
std::reverse(rc_sequence.begin(), rc_sequence.end());
return rc_sequence;
}
int count_sequences(const std::string& input_fastq) {
// get the number of sequences from the fastq
std::ifstream file(input_fastq);
std::string line;
int seq_number = 0;
while (std::getline(file, line)) {
if (line[0] == '@') {
seq_number++;
}
}
return seq_number;
}
// function to read FASTQ file and return a vector of sequences
std::vector<std::string> read_fastq(const std::string& filename) {
std::ifstream file(filename);
std::vector<std::string> sequences;
std::string line;
if (!file.is_open()) {
std::cerr << "Error opening file: " << filename << std::endl;
return sequences;
}
// Skip the header line
while (std::getline(file, line)) {
// Read the sequence line
std::getline(file, line);
sequences.push_back(line);
// Skip the quality score lines
std::getline(file, line);
std::getline(file, line);
}
file.close();
return sequences;
}
std::string get_cluster_id(const std::string& read, const std::string& start_primer, int split_level) {
/*
get the <split_level> bases following the primer in the read, check if the read is in reverse complement
*/
std::string read_rc = reverse_complement(read); // reverse complement of the read sequence
// get primer end position, check if read is forward
int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 12);
std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10
double score_fw, score_rc;
int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used
// call local alignment algorithm on the forward sequence
SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw);
// same for the reverse complement sequence
SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc);
if (score_fw >= score_rc) { // primer found in forward sequence
if (score_fw >= 8 && read.size() > y_fw + split_level) {
return read.substr(y_fw, split_level);
}
} else { // primer found in reverse complement sequence
if (score_rc >= 8 && read_rc.size() > y_rc + split_level) {
return read_rc.substr(y_rc, split_level);
}
}
// score too weak, or read too small = ignore the read
return "None";
}
void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) {
/*
*/
// start a timer
auto start = std::chrono::high_resolution_clock::now();
std::ifstream input_read_file(reads_path);
std::string read_name;
std::string sequence;
std::string line;
// browse the reads file and skip the header line
while (std::getline(input_read_file, read_name)) {
// read the sequence line
std::getline(input_read_file, sequence);
// Skip the quality score lines
std::getline(input_read_file, line);
std::getline(input_read_file, line);
std::string cluster_id = get_cluster_id(sequence, start_primer, split_level);
if (cluster_id == "None") {
continue;
}
std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file
if (!output_file.is_open()) {
std::cerr << "Error opening output file\n";
return;
}
output_file << ">" << read_name << "\n" << sequence << "\n";
}
input_read_file.close();
}
int main(int argc, char* argv[]) {
// check if the input and output file paths are provided as arguments
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl;
return 1;
} // ./fast_clustering reads.fastq clusters_dir_path
// get the input and output paths from the arguments
//std::string input_fastq = argv[1];
//std::string output_dir = argv[2];
std::string data_name = "10k";
std::string input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq";
std::string output_dir = "new_tests/"+data_name+"/clusters_"+data_name;
//delete all fasta files in the clusters dir
std::filesystem::path dir_path(output_dir);
try {
for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
if (entry.path().extension() == ".fasta") {
std::filesystem::remove(entry.path());
}
}
} catch (const std::filesystem::filesystem_error& ex) {
std::cerr << "Error deleting files: " << ex.what() << '\n';
}
std::string start_primer = "AGCCATCGTACGTAGCGGAT";
clustering(input_fastq, start_primer, 3, output_dir);
return 0;
}
//g++ -o partitioning partitioning.cpp
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment