Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 93fd0536 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

updated comments

parent 9845f58d
Branches
No related tags found
No related merge requests found
...@@ -9,12 +9,12 @@ ...@@ -9,12 +9,12 @@
#include <mutex> #include <mutex>
#include <unordered_map> #include <unordered_map>
#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm #include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm (for local alignment)
std::unordered_map<std::string, std::mutex> output_mutexes; // contains a mutex for each possible output cluster file std::unordered_map<std::string, std::mutex> output_mutexes; // contains a mutex for each possible output cluster file
std::string reverse_complement(const std::string& sequence) { std::string reverse_complement(const std::string& sequence) {
// get the reverse complement of a sequence // returns the reverse complement of a sequence
std::string rc_sequence = sequence; std::string rc_sequence = sequence;
std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) {
switch (c) { switch (c) {
...@@ -32,11 +32,9 @@ std::string reverse_complement(const std::string& sequence) { ...@@ -32,11 +32,9 @@ std::string reverse_complement(const std::string& sequence) {
std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) { std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) {
/* /*
get the <split_level> bases following the primer in the read, check if the read is in reverse complement get the <split_level> bases following the primer in the read, also try to find the primer in the reverse complement sequence
*/ */
double score_fw, score_rc; // alignment score with the forward and reverse complement sequence double score_fw, score_rc; // alignment score with the forward and reverse complement sequence
int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used
...@@ -48,7 +46,7 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi ...@@ -48,7 +46,7 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi
return read.substr(y_fw, split_level); return read.substr(y_fw, split_level);
} }
std::string read_rc = reverse_complement(read); // reverse complement of the read sequence std::string read_rc = reverse_complement(read); // get reverse complement of the read sequence
// test local alignement with the reverse complement sequence // test local alignement with the reverse complement sequence
SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc);
...@@ -69,6 +67,12 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi ...@@ -69,6 +67,12 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi
void process_file_part(const std::string& input_filename, int part_id, int start_line, int end_line, const std::string& seq_to_find, int split_level, const std::string& output_dir) { void process_file_part(const std::string& input_filename, int part_id, int start_line, int end_line, const std::string& seq_to_find, int split_level, const std::string& output_dir) {
/*
thread used on a part of the input file
loop over the sequences to sort them into clusters depending on the bases following the primer
*/
std::cout << "thread " << part_id << " for lines " << start_line << " to " << end_line << std::endl;
std::ifstream input_read_file(input_filename); std::ifstream input_read_file(input_filename);
if (!input_read_file.is_open()) { if (!input_read_file.is_open()) {
...@@ -77,95 +81,97 @@ void process_file_part(const std::string& input_filename, int part_id, int start ...@@ -77,95 +81,97 @@ void process_file_part(const std::string& input_filename, int part_id, int start
} }
std::string line; std::string line;
// skip the file to the start line std::string read_name;
std::string sequence;
// skip the file lines to reach the start line
input_read_file.seekg(0, std::ios::beg); input_read_file.seekg(0, std::ios::beg);
for (int i = 0; i < start_line; i++) { for (int i = 0; i < start_line; i++) {
std::getline(input_read_file, line); std::getline(input_read_file, line);
} }
std::cout << "thread " << part_id << " for lines " << start_line << " to " << end_line << std::endl; // some var for progression display
// Process the lines in the file part
std::string read_name;
std::string sequence;
int read_count = 0; int read_count = 0;
int total_reads = (end_line - start_line)/4; int total_reads = (end_line - start_line)/4;
int step_count = 0; int step_count = 0;
// process the lines in the file part, add 4 lines to the i counter each loop // process the sequences in the file part, add 4 lines to the i counter each loop because of the fastq format
for (int i = start_line; i < end_line; i+=4) { for (int i = start_line; i < end_line; i+=4) {
// read the sequence name // read the sequence name
std::getline(input_read_file, read_name); std::getline(input_read_file, read_name);
// read the sequence line // read the sequence
std::getline(input_read_file, sequence); std::getline(input_read_file, sequence);
// Skip the quality score lines // skip the quality score lines
std::getline(input_read_file, line); std::getline(input_read_file, line);
std::getline(input_read_file, line); std::getline(input_read_file, line);
// find the position fo the primer and get the bases following it // find the position of the primer and get the bases following it
std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level); std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level);
read_count += 1; read_count += 1;
if (read_count >= total_reads/10 && part_id == 0){ if (read_count >= total_reads/10 && part_id == 0){
// progression display for the thread 0
read_count = 0; read_count = 0;
step_count += 1; step_count += 1;
std::cout << "thread " << part_id << " : " << step_count <<"0%" << std::endl; std::cout << "thread " << part_id << " : " << step_count <<"0%" << std::endl;
} }
// no cluster associated to the read, because of weak score of short length
if (cluster_id == "None") { if (cluster_id == "None") {
continue; // ignore the read continue; // reject the read
} }
// Lock the mutex for this output file // lock the mutex for this output file
std::lock_guard<std::mutex> lock(output_mutexes[cluster_id]); std::lock_guard<std::mutex> lock(output_mutexes[cluster_id]);
// write the sequence and its id to a cluster file corresponding to the bases found after the primer // write the sequence and its id to a cluster file corresponding to the bases found after the primer
std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append to the file
if (!output_file.is_open()) { if (!output_file.is_open()) {
std::cerr << "Error opening output file\n"; std::cerr << "Error opening output file\n";
return; return;
} }
output_file << ">" << read_name << "\n" << sequence << "\n"; output_file << ">" << read_name << "\n" << sequence << "\n"; // output format is fasta
} }
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
// check if the input and output file paths are provided as arguments // check if the number of thread is provided
if (argc < 3 || argc > 4) { if (argc < 4 || argc > 5) {
std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir> <n_thread>" << std::endl; std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir> <start_primer> <optional:n_thread>" << std::endl;
return 1; return 1;
} // ./fast_clustering reads.fastq clusters_dir_path }
// get the input and output paths from the arguments // get the input, output paths and primer from the arguments
std::string input_fastq = argv[1]; std::string input_fastq = argv[1];
std::string output_dir = argv[2]; std::string output_dir = argv[2];
std::string start_primer = argv[3]; //"GTTCAGAGTTCTACAGTCCGACGATCC" for 27M image reads;
int n_thread; // number of threads used, default = maximum possible int n_thread; // number of threads used, default = maximum possible
if (argc == 3) { if (argc == 4) {
n_thread = std::thread::hardware_concurrency(); n_thread = std::thread::hardware_concurrency();
} else { } else {
n_thread = std::stoi(argv[3]); n_thread = std::stoi(argv[4]);
} }
std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; // optimisation : search for a smaller sequence than the full primer if its len is > 8
int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8); int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8);
std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10 // ex : instead of searching "123456789", search just "456789" in the sequences, which is way faster
std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size);
// start a timer // start a timer
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
// create output clusters directory // create output directory to contain all clusters files
std::filesystem::create_directory(output_dir); std::filesystem::create_directory(output_dir);
//delete all previous fasta files in the output clusters dir if any // delete all previous fasta files in the output clusters dir if any
std::filesystem::path dir_path(output_dir); std::filesystem::path dir_path(output_dir);
try { try {
for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
...@@ -191,22 +197,23 @@ int main(int argc, char* argv[]) { ...@@ -191,22 +197,23 @@ int main(int argc, char* argv[]) {
} }
input_read_file.close(); input_read_file.close();
// Calculate the number of lines in each part // calculate the number of lines used for each thread
int lines_per_part = (num_lines + n_thread - 1) / n_thread; int lines_per_part = (num_lines + n_thread - 1) / n_thread;
lines_per_part = (lines_per_part + 3) / 4 * 4; // Round up to the nearest multiple of 4 lines_per_part = (lines_per_part + 3) / 4 * 4; // round up to the nearest multiple of 4
// Create a thread for each file part // create a thread for each file part
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < n_thread; i++) { for (int i = 0; i < n_thread; i++) {
// Calculate the start and end line numbers for this part // get the start and end line numbers for this part
int start_line = i * lines_per_part; int start_line = i * lines_per_part;
int end_line = std::min((i + 1) * lines_per_part, num_lines); int end_line = std::min((i + 1) * lines_per_part, num_lines);
// start the thread
threads.emplace_back(process_file_part, input_fastq, i, start_line, end_line, seq_to_find, 3, output_dir); threads.emplace_back(process_file_part, input_fastq, i, start_line, end_line, seq_to_find, 3, output_dir);
} }
// Wait for all threads to finish // wait for all threads to finish
for (auto& t : threads) { for (auto& t : threads) {
t.join(); t.join();
} }
...@@ -219,4 +226,4 @@ int main(int argc, char* argv[]) { ...@@ -219,4 +226,4 @@ int main(int argc, char* argv[]) {
return 0; return 0;
} }
//g++ -o fast_clustering_t fast_clustering_t.cpp SmithWaterman.cpp && ./fast_clustering_t reads.fastq cluster_dir //g++ -o fast_clustering_t fast_clustering_t.cpp SmithWaterman.cpp && ./fast_clustering_t reads.fastq cluster_dir GTTCAGAGTTCTACAGTCCGACGATCC
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment