diff --git a/partitioning/fast_clustering_t2.cpp b/partitioning/fast_clustering_t2.cpp index 5678ccd6bf2affbb808540f4029edd1c153edf37..adb5bc71631ab7a579509b429ae60cdd0a856259 100755 --- a/partitioning/fast_clustering_t2.cpp +++ b/partitioning/fast_clustering_t2.cpp @@ -160,10 +160,9 @@ void clustering(const std::string& reads_path, const std::string& start_primer, } - int fill_table_with_fastq_file(std::vector<std::array<std::string, 3>>& seq_table, const std::string& fastq_path) { + void fill_table_with_fastq_file(std::string** seq_table, const std::string& fastq_path) { // function to read FASTQ file, return length of filled tab - int num_reads = 0; std::string line; std::string read_name; std::string sequence; @@ -172,26 +171,30 @@ void clustering(const std::string& reads_path, const std::string& start_primer, if (!file.is_open()) { std::cerr << "Error opening file: " << fastq_path << std::endl; - return 0; + return; } + int num_reads = 0; // read the sequence name while (std::getline(file, read_name)) { - // read the sequence std::getline(file, sequence); // fill the tab line - seq_table.push_back( {read_name, sequence, ""} ); + std::array<std::string, 3> row = {read_name, sequence, ""}; + /*for (int i = 0; i < 3; i++) { + seq_table[num_reads][i] = row[i]; + }*/ + //seq_table[num_reads] = row; + std::copy(row.begin(), row.end(), seq_table[num_reads]); num_reads++; // skip the quality score lines std::getline(file, line); std::getline(file, line); + } file.close(); - - return num_reads; } void enumerate_clusters(std::vector<std::string>& cluster_id_list, const std::string& sequence, int split_level, const std::string& bases, int index) { @@ -209,7 +212,7 @@ void enumerate_clusters(std::vector<std::string>& cluster_id_list, const std::st } -void manage_threads(std::vector<std::array<std::string, 3>>& seq_table, int table_size, const std::string& start_primer, const std::string& output_dir_path) { +void manage_threads(std::string** seq_table, int table_size, const std::string& start_primer, const std::string& output_dir_path) { // start a timer auto start = std::chrono::high_resolution_clock::now(); @@ -270,7 +273,6 @@ void manage_threads(std::vector<std::array<std::string, 3>>& seq_table, int tabl auto end2 = std::chrono::high_resolution_clock::now(); elapsed = end2 - end; std::cout << elapsed.count() << "s for writting" << std::endl; - } @@ -286,21 +288,54 @@ int main(int argc, char* argv[]) { std::string input_fastq = argv[1]; std::string output_dir = argv[2]; // must be an existing dir + //delete all fasta files in the clusters dir + std::filesystem::path dir_path(output_dir); + try { + for (const auto& entry : std::filesystem::directory_iterator(dir_path)) { + if (entry.path().extension() == ".fasta") { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::filesystem::filesystem_error& ex) { + std::cerr << "Error deleting files: " << ex.what() << '\n'; + } + // start a timer auto start = std::chrono::high_resolution_clock::now(); std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; - std::vector<std::array<std::string, 3>> seq_table; - int num_reads = fill_table_with_fastq_file(seq_table, input_fastq);// , start_primer, 3, output_dir); - manage_threads(seq_table, num_reads, start_primer, output_dir); + // get line_number + std::ifstream file(input_fastq); + int line_count = 0; + std::string line; + while (std::getline(file, line)) { + line_count++; + } + + int read_count = line_count/4; + + // init read table // contain seq_name; sequence; cluster_id(empty) + std::string** seq_table = new std::string*[read_count]; + for (int i = 0; i < read_count; i++) { + seq_table[i] = new std::string[3]; + } + + fill_table_with_fastq_file(seq_table, input_fastq); + + manage_threads(seq_table, read_count, start_primer, output_dir); // end the timer and print the elapsed time auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << elapsed.count() << "s for fast clustering" << std::endl; + for (int i = 0; i < read_count; i++) { + delete[] seq_table[i]; + } + delete[] seq_table; + return 0; } -//g++ -o fast_clustering fast_clustering.cpp SmithWaterman.cpp && ./fast_clustering reads.fastq cluster_dir +//g++ -o fast_clustering -fopenmp fast_clustering_t2.cpp SmithWaterman.cpp && ./fast_clustering reads.fastq cluster_dir