Mentions légales du service

Skip to content
Snippets Groups Projects
Commit d6cbd2db authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

cleaned includes and functions, refactoring

parent a3fbdd2c
No related branches found
No related tags found
No related merge requests found
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <filesystem>
#include <string> #include <string>
#include <vector> #include <vector>
#include <list>
#include <bitset>
#include <cmath>
#include <algorithm> #include <algorithm>
#include <filesystem>
#include <chrono> #include <chrono>
#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm #include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm
...@@ -29,23 +26,8 @@ std::string reverse_complement(const std::string& sequence) { ...@@ -29,23 +26,8 @@ std::string reverse_complement(const std::string& sequence) {
} }
int count_sequences(const std::string& input_fastq) {
// get the number of sequences from the fastq
std::ifstream file(input_fastq);
std::string line;
int seq_number = 0;
while (std::getline(file, line)) {
if (line[0] == '@') {
seq_number++;
}
}
return seq_number;
}
// function to read FASTQ file and return a vector of sequences
std::vector<std::string> read_fastq(const std::string& filename) { std::vector<std::string> read_fastq(const std::string& filename) {
// function to read FASTQ file and return a vector of sequences
std::ifstream file(filename); std::ifstream file(filename);
std::vector<std::string> sequences; std::vector<std::string> sequences;
std::string line; std::string line;
...@@ -73,24 +55,26 @@ std::vector<std::string> read_fastq(const std::string& filename) { ...@@ -73,24 +55,26 @@ std::vector<std::string> read_fastq(const std::string& filename) {
} }
std::string get_cluster_id(const std::string& read, const std::string& start_primer, int split_level) { std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) {
/* /*
get the <split_level> bases following the primer in the read, check if the read is in reverse complement get the <split_level> bases following the primer in the read, check if the read is in reverse complement
*/ */
std::string read_rc = reverse_complement(read); // reverse complement of the read sequence
// get primer end position, check if read is forward std::string read_rc = reverse_complement(read); // reverse complement of the read sequence
int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 12);
std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10
double score_fw, score_rc; double score_fw, score_rc; // alignment score with the forward and reverse complement sequence
int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used
// call local alignment algorithm on the forward sequence // call local alignment algorithm on the forward sequence
SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw); SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw);
// same for the reverse complement sequence // don't test for the reverse complement if near perfect alignment already found
if (score_fw >= 9.5 && read.size() > y_fw + split_level) {
return read.substr(y_fw, split_level);
}
// test local alignement with the reverse complement sequence
SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc);
if (score_fw >= score_rc) { // primer found in forward sequence if (score_fw >= score_rc) { // primer found in forward sequence
...@@ -111,17 +95,28 @@ std::string get_cluster_id(const std::string& read, const std::string& start_pri ...@@ -111,17 +95,28 @@ std::string get_cluster_id(const std::string& read, const std::string& start_pri
void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) { void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) {
/* /*
*/ */
// start a timer
auto start = std::chrono::high_resolution_clock::now();
//delete all fasta files in the clusters dir
std::filesystem::path dir_path(output_dir);
try {
for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
if (entry.path().extension() == ".fasta") {
std::filesystem::remove(entry.path());
}
}
} catch (const std::filesystem::filesystem_error& ex) {
std::cerr << "Error deleting files: " << ex.what() << '\n';
}
std::ifstream input_read_file(reads_path); int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8);
std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10
std::ifstream input_read_file(reads_path); // read the fastq
std::string read_name; std::string read_name;
std::string sequence; std::string sequence;
std::string line; std::string line;
// browse the reads file and skip the header line // browse the reads file and skip the header line
while (std::getline(input_read_file, read_name)) { while (std::getline(input_read_file, read_name)) {
...@@ -132,11 +127,13 @@ void clustering(const std::string& reads_path, const std::string& start_primer, ...@@ -132,11 +127,13 @@ void clustering(const std::string& reads_path, const std::string& start_primer,
std::getline(input_read_file, line); std::getline(input_read_file, line);
std::getline(input_read_file, line); std::getline(input_read_file, line);
std::string cluster_id = get_cluster_id(sequence, start_primer, split_level); // find the position fo the primer and get the bases following it
std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level);
if (cluster_id == "None") { if (cluster_id == "None") {
continue; continue; // ignore the read
} }
// write the sequence and its id to a cluster file corresponding to the bases found after the primer
std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file
if (!output_file.is_open()) { if (!output_file.is_open()) {
std::cerr << "Error opening output file\n"; std::cerr << "Error opening output file\n";
...@@ -159,31 +156,27 @@ int main(int argc, char* argv[]) { ...@@ -159,31 +156,27 @@ int main(int argc, char* argv[]) {
} // ./fast_clustering reads.fastq clusters_dir_path } // ./fast_clustering reads.fastq clusters_dir_path
// get the input and output paths from the arguments // get the input and output paths from the arguments
//std::string input_fastq = argv[1]; std::string input_fastq = argv[1];
//std::string output_dir = argv[2]; std::string output_dir = argv[2];
std::string data_name = "10k"; // start a timer
auto start = std::chrono::high_resolution_clock::now();
std::string input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq"; std::string data_name = "100k";
std::string output_dir = "new_tests/"+data_name+"/clusters_"+data_name;
//delete all fasta files in the clusters dir input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq";
std::filesystem::path dir_path(output_dir); output_dir = "new_tests/"+data_name+"/clusters_"+data_name;
try {
for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
if (entry.path().extension() == ".fasta") {
std::filesystem::remove(entry.path());
}
}
} catch (const std::filesystem::filesystem_error& ex) {
std::cerr << "Error deleting files: " << ex.what() << '\n';
}
std::string start_primer = "AGCCATCGTACGTAGCGGAT"; std::string start_primer = "AGCCATCGTACGTAGCGGAT";
clustering(input_fastq, start_primer, 3, output_dir); clustering(input_fastq, start_primer, 3, output_dir);
// end the timer and print the elapsed time
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << elapsed.count() << "s for fast clustering" << std::endl;
return 0; return 0;
} }
//g++ -o partitioning partitioning.cpp //g++ -o fast_clustering fast_clustering.cpp SmithWaterman.cpp && ./fast_clustering reads.fastq cluster_dir
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment