diff --git a/partitioning/fast_clustering_t.cpp b/partitioning/fast_clustering_t.cpp new file mode 100755 index 0000000000000000000000000000000000000000..62c36b9dec736ace410022593d3e0439db6e3a95 --- /dev/null +++ b/partitioning/fast_clustering_t.cpp @@ -0,0 +1,212 @@ +#include <iostream> +#include <fstream> +#include <filesystem> +#include <string> +#include <vector> +#include <algorithm> +#include <chrono> +#include <thread> +#include <mutex> +#include <unordered_map> + +#include "SmithWaterman.h" // include the header file for the Smith-Waterman algorithm + +std::unordered_map<std::string, std::mutex> output_mutexes; // contains a mutex for each possible output cluster file + +std::string reverse_complement(const std::string& sequence) { + // get the reverse complement of a sequence + std::string rc_sequence = sequence; + std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { + switch (c) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + default: return c; + } + }); + std::reverse(rc_sequence.begin(), rc_sequence.end()); + return rc_sequence; +} + + +std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) { + /* + get the <split_level> bases following the primer in the read, check if the read is in reverse complement + */ + + + std::string read_rc = reverse_complement(read); // reverse complement of the read sequence + + double score_fw, score_rc; // alignment score with the forward and reverse complement sequence + int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used + + // call local alignment algorithm on the forward sequence + SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw); + + // don't test for the reverse complement if near perfect alignment already found + if (score_fw >= 9.5 && read.size() > y_fw + split_level) { + return read.substr(y_fw, split_level); + } + + // test local alignement with the reverse complement sequence + SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); + + if (score_fw >= score_rc) { // primer found in forward sequence + if (score_fw >= 8 && read.size() > y_fw + split_level) { + return read.substr(y_fw, split_level); + } + } else { // primer found in reverse complement sequence + if (score_rc >= 8 && read_rc.size() > y_rc + split_level) { + return read_rc.substr(y_rc, split_level); + } + } + + // score too weak, or read too small = ignore the read + return "None"; +} + + +void process_file_part(const std::string& input_filename, int part_id, int start_line, int end_line, const std::string& seq_to_find, int split_level, const std::string& output_dir) { + + std::ifstream input_read_file(input_filename); + if (!input_read_file.is_open()) { + std::cerr << "Error opening input file\n"; + return; + } + + std::string line; + // skip the file to the start line + input_read_file.seekg(0, std::ios::beg); + for (int i = 0; i < start_line; i++) { + std::getline(input_read_file, line); + } + + std::cout << "thread " << part_id << " for lines " << start_line << " to " << end_line << std::endl; + + // Process the lines in the file part + std::string read_name; + std::string sequence; + + // process the lines in the file part, add 4 lines to the i counter each loop + for (int i = start_line; i < end_line; i+=4) { + + // read the sequence name + std::getline(input_read_file, read_name); + + // read the sequence line + std::getline(input_read_file, sequence); + + // Skip the quality score lines + std::getline(input_read_file, line); + std::getline(input_read_file, line); + + // find the position fo the primer and get the bases following it + std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level); + + if (cluster_id == "None") { + continue; // ignore the read + } + + // Lock the mutex for this output file + std::lock_guard<std::mutex> lock(output_mutexes[cluster_id]); + + // write the sequence and its id to a cluster file corresponding to the bases found after the primer + std::ofstream output_file(output_dir + "/" + cluster_id + ".fasta", std::ios::app); // ios app is to append in the file + if (!output_file.is_open()) { + std::cerr << "Error opening output file\n"; + return; + } + output_file << ">" << read_name << "\n" << sequence << "\n"; + } +} + + + +int main(int argc, char* argv[]) { + + // check if the input and output file paths are provided as arguments + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_dir>" << std::endl; + return 1; + } // ./fast_clustering reads.fastq clusters_dir_path + + // get the input and output paths from the arguments + std::string input_fastq = argv[1]; + std::string output_dir = argv[2]; + + std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; + + int seq_to_find_size = std::min(static_cast<int>(start_primer.size()), 8); + std::string seq_to_find = start_primer.substr(start_primer.size() - seq_to_find_size); // search for a smaller sequence than the full primer if its len is > 10 + + std::string data_name = "100k"; + + input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq"; + output_dir = "new_tests/"+data_name+"/clusters_"+data_name; + + // number of threads used + int num_parts = 4;//std::thread::hardware_concurrency(); + + + // start a timer + auto start = std::chrono::high_resolution_clock::now(); + + // create output clusters directory + std::filesystem::create_directory(output_dir); + + //delete all previous fasta files in the output clusters dir if any + std::filesystem::path dir_path(output_dir); + try { + for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { + if (entry.path().extension() == ".fasta") { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::filesystem::filesystem_error& ex) { + std::cerr << "Error deleting files: " << ex.what() << '\n'; + } + + // count the number of lines in the input file + std::ifstream input_read_file(input_fastq); + if (!input_read_file.is_open()) { + std::cerr << "Error opening input file\n"; + return 1; + } + + int num_lines = 0; + std::string line; + while (std::getline(input_read_file, line)) { + num_lines++; + } + input_read_file.close(); + + // Calculate the number of lines in each part + int lines_per_part = (num_lines + num_parts - 1) / num_parts; + lines_per_part = (lines_per_part + 3) / 4 * 4; // Round up to the nearest multiple of 4 + + // Create a thread for each file part + std::vector<std::thread> threads; + for (int i = 0; i < num_parts; i++) { + + // Calculate the start and end line numbers for this part + int start_line = i * lines_per_part; + int end_line = std::min((i + 1) * lines_per_part, num_lines); + + threads.emplace_back(process_file_part, input_fastq, i, start_line, end_line, seq_to_find, 3, output_dir); + } + + // Wait for all threads to finish + for (auto& t : threads) { + t.join(); + } + + // end the timer and print the elapsed time + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed = end - start; + std::cout << elapsed.count() << "s for fast clustering" << std::endl; + + return 0; +} + +//g++ -o fast_clustering_t fast_clustering_t.cpp SmithWaterman.cpp && ./fast_clustering_t reads.fastq cluster_dir