updated comments

93fd0536 · BOULLE Olivier · 9845f58d · 93fd0536
Commit 93fd0536 authored 10 months ago by BOULLE Olivier
--- a/partitioning/fast_clustering_t.cpp
+++ b/partitioning/fast_clustering_t.cpp
@@ -9,12 +9,12 @@
 #include <mutex>
 #include <unordered_map>
-#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm
+#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm (for local alignment)
 std::unordered_map<std::string, std::mutex> output_mutexes; // contains a mutex for each possible output cluster file
 std::string reverse_complement(const std::string& sequence) {
-    // get the reverse complement of a sequence
+    // returns the reverse complement of a sequence
    std::string rc_sequence = sequence;
    std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) {
        switch (c) {
@@ -32,11 +32,9 @@ std::string reverse_complement(const std::string& sequence) {
 std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) {
    /*
-    get the <split_level> bases following the primer in the read, check if the read is in reverse complement
+    get the <split_level> bases following the primer in the read, also try to find the primer in the reverse complement sequence
    */
    double score_fw, score_rc; // alignment score with the forward and reverse complement sequence
    int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used
@@ -48,7 +46,7 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi
        return read.substr(y_fw, split_level);
    }
-    std::string read_rc = reverse_complement(read); // reverse complement of the read sequence
+    std::string read_rc = reverse_complement(read); // get reverse complement of the read sequence
    // test local alignement with the reverse complement sequence
    SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc);
@@ -69,6 +67,12 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi
 void process_file_part(const std::string& input_filename, int part_id, int start_line, int end_line, const std::string& seq_to_find, int split_level, const std::string& output_dir) {
+    /*
+    thread used on a part of the input file
+    loop over the sequences to sort them into clusters depending on the bases following the primer
+    */
+    std::cout << "thread " << part_id << " for lines " << start_line << " to " << end_line << std::endl;
    std::ifstream input_read_file(input_filename);
    if (!input_read_file.is_open()) {
@@ -77,95 +81,97 @@ void process_file_part(const std::string& input_filename, int part_id, int start
    }
    std::string line;
-    // skip the file to the start line
+    std::string read_name;
+    std::string sequence;
+    // skip the file lines to reach the start line
    input_read_file.seekg(0, std::ios::beg);
    for (int i = 0; i < start_line; i++) {
        std::getline(input_read_file, line);
    }
-    std::cout << "thread " << part_id << " for lines " << start_line << " to " << end_line << std::endl;
+    // some var for progression display
-    // Process the lines in the file part
-    std::string read_name;
-    std::string sequence;
    int read_count = 0;
    int total_reads = (end_line - start_line)/4;
    int step_count = 0;
-    // process the lines in the file part, add 4 lines to the i counter each loop
+    // process the sequences in the file part, add 4 lines to the i counter each loop because of the fastq format
    for (int i = start_line; i < end_line; i+=4) {
        // read the sequence name
        std::getline(input_read_file, read_name);
-        // read the sequence line
+        // read the sequence
        std::getline(input_read_file, sequence);
-        // Skip the quality score lines
+        // skip the quality score lines
        std::getline(input_read_file, line);
        std::getline(input_read_file, line);
-        // find the position fo the primer and get the bases following it
+        // find the position of the primer and get the bases following it
        std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level);
        read_count += 1;
        if (read_count >= total_reads/10 && part_id == 0){
+            // progression display for the thread 0
            read_count = 0;
            step_count += 1;
            std::cout << "thread " << part_id << " : " << step_count <<"0%" << std::endl;
        }
+        // no cluster associated to the read, because of weak score of short length
        if (cluster_id == "None") {
-            continue; // ignore the read
+            continue; // reject the read
        }
-        // Lock the mutex for this output file
+        // lock the mutex for this output file
        std::lock_guard<std::mutex> lock(output_mutexes[cluster_id]);
        // write the sequence and its id to a cluster file corresponding to the bases found after the primer
-        std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file
+        std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append to the file
        if (!output_file.is_open()) {
            std::cerr << "Error opening output file\n";
            return;
        }
-        output_file << ">" << read_name << "\n" << sequence << "\n";
+        output_file << ">" << read_name << "\n" << sequence << "\n"; // output format is fasta
    }
 }
 int main(int argc, char* argv[]) {
-    // check if the input and output file paths are provided as arguments
+    // check if the number of thread is provided
-    if (argc < 3 || argc > 4) {
+    if (argc < 4 || argc > 5) {
-        std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir> <n_thread>" << std::endl;
+        std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir> <start_primer> <optional:n_thread>" << std::endl;
        return 1;
-    } // ./fast_clustering reads.fastq clusters_dir_path
+    }
-    // get the input and output paths from the arguments
+    // get the input, output paths and primer from the arguments
    std::string input_fastq = argv[1];
    std::string output_dir = argv[2];
+    std::string start_primer = argv[3]; //"GTTCAGAGTTCTACAGTCCGACGATCC" for 27M image reads;
    int n_thread; // number of threads used, default = maximum possible
-    if (argc == 3) {
+    if (argc == 4) {
        n_thread = std::thread::hardware_concurrency();
    } else {
-        n_thread = std::stoi(argv[3]);
+        n_thread = std::stoi(argv[4]);
    }
-    std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC";
+    // optimisation : search for a smaller sequence than the full primer if its len is > 8
    int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8);
-    std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10
+    // ex : instead of searching "123456789", search just "456789" in the sequences, which is way faster
+    std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size);
    // start a timer
    auto start = std::chrono::high_resolution_clock::now();
-    // create output clusters directory
+    // create output directory to contain all clusters files
    std::filesystem::create_directory(output_dir);
-    //delete all previous fasta files in the output clusters dir if any
+    // delete all previous fasta files in the output clusters dir if any
    std::filesystem::path dir_path(output_dir);
    try {
        for (const auto& entry : std::filesystem::directory_iterator(dir_path)) {
@@ -191,22 +197,23 @@ int main(int argc, char* argv[]) {
    }
    input_read_file.close();
-    // Calculate the number of lines in each part
+    // calculate the number of lines used for each thread
    int lines_per_part = (num_lines + n_thread - 1) / n_thread;
-    lines_per_part = (lines_per_part + 3) / 4 * 4; // Round up to the nearest multiple of 4
+    lines_per_part = (lines_per_part + 3) / 4 * 4; // round up to the nearest multiple of 4
-    // Create a thread for each file part
+    // create a thread for each file part
    std::vector<std::thread> threads;
    for (int i = 0; i < n_thread; i++) {
-        // Calculate the start and end line numbers for this part
+        // get the start and end line numbers for this part
        int start_line = i * lines_per_part;
        int end_line = std::min((i + 1) * lines_per_part, num_lines);
+        // start the thread
        threads.emplace_back(process_file_part, input_fastq, i, start_line, end_line, seq_to_find, 3, output_dir);
    }
-    // Wait for all threads to finish
+    // wait for all threads to finish
    for (auto& t : threads) {
        t.join();
    }
@@ -219,4 +226,4 @@ int main(int argc, char* argv[]) {
    return 0;
 }
-//g++ -o fast_clustering_t fast_clustering_t.cpp SmithWaterman.cpp && ./fast_clustering_t reads.fastq cluster_dir
+//g++ -o fast_clustering_t fast_clustering_t.cpp SmithWaterman.cpp && ./fast_clustering_t reads.fastq cluster_dir GTTCAGAGTTCTACAGTCCGACGATCC