diff --git a/partitioning/partitioning.cpp b/partitioning/partitioning.cpp
index b6a18c536b2338715adce9da4e864274860683d1..d84efe651c32da1dc29059120f8c3ec2a013f044 100755
--- a/partitioning/partitioning.cpp
+++ b/partitioning/partitioning.cpp
@@ -2,6 +2,7 @@
 #include <fstream>
 #include <string>
 #include <vector>
+#include <list>
 #include <bitset>
 #include <unordered_map>
 #include <cmath>
@@ -110,19 +111,24 @@ std::vector<std::vector<int>> get_minimizers(const std::string& filename) {
         }
     }
 
+    // Remove empty lists from the minimizer list
+    minimizer_list.erase(std::remove_if(minimizer_list.begin(), minimizer_list.end(),
+        [](const std::vector<int>& v) { return v.empty(); }),
+        minimizer_list.end());
+
     // End the timer and print the elapsed time
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed = end - start;
-    std::cout << elapsed.count() << " for minimizer list" << std::endl;
+    std::cout << elapsed.count() << "s for minimizer list" << std::endl;
 
     return minimizer_list;
 }
 
 
-void minlist_to_graph(int reads_number, const std::vector<std::vector<int>>& minimizer_list, const std::string& graph_file_path) {
-
-    // Create a vector of vectors to store the graph
+void minlist_to_graph(const std::vector<std::vector<int>>& minimizer_list, int reads_number, const std::string& graph_file_path) {
+    // Create the adjacency lists of the graph (one vector of neighbours per read)
     std::vector<std::vector<int>> graph_lines(reads_number);
+    //std::vector<std::list<int>> graph_lines(reads_number);
 
     // Start the timer
     auto start = std::chrono::high_resolution_clock::now();
@@ -141,6 +147,11 @@ void minlist_to_graph(int reads_number, const std::vector<std::vector<int>>& min
         }
     }
 
+    // End the timer and print the elapsed time
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed = end - start;
+    std::cout << elapsed.count() << "s for graph creation" << std::endl;
+
     // Calculate the number of edges in the graph
     int edge_number = 0;
     for (const auto& neighbours_list_by_id : graph_lines) {
@@ -153,31 +164,144 @@ void minlist_to_graph(int reads_number, const std::vector<std::vector<int>>& min
 
     // Write the number of vertices and edges to the output file
     output_graph_file << reads_number << " " << edge_number << std::endl;
-    
+
     // Write the graph to the output file
     for (const auto& neighbours_list_by_id : graph_lines) {
-        for (int i = 0; i < neighbours_list_by_id.size(); i++) {
-            output_graph_file << neighbours_list_by_id[i];
-            if (i != neighbours_list_by_id.size() - 1) {
-                output_graph_file << " ";
-            }
+        for (const auto& neighbour : neighbours_list_by_id) {
+            output_graph_file << neighbour << " ";
         }
         output_graph_file << std::endl;
     }
 
+    // End the timer and print the elapsed time
+    end = std::chrono::high_resolution_clock::now();
+    elapsed = end - start;
+    std::cout << elapsed.count() << "s for min_list of graph" << std::endl;
+}
+
+
+/**
+ * Write the minimizer/read incidence matrix to a CSV file.
+ *
+ * The header row is "min,r0,r1,..."; row i starts with "m<i>" and holds one
+ * 0/1 cell per read, 1 meaning the read is listed in minimizer_list[i].
+ * Read ids outside [0, reads_number) are ignored and reported on stderr.
+ *
+ * @param minimizer_list for each minimizer, the ids of the reads listed for it
+ * @param reads_number   number of read columns in the matrix
+ * @param output_file    path of the CSV file to write
+ */
+void minlist_to_csv(const std::vector<std::vector<int>>& minimizer_list, int reads_number, const std::string& output_file) {
+    // Start the timer
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Open the output file; stop here if it cannot be created
+    std::ofstream outfile(output_file);
+    if (!outfile) {
+        std::cerr << "Error: cannot open output file " << output_file << std::endl;
+        return;
+    }
+
+    // Write the column headers to the output file
+    outfile << "min,";
+    for (int j = 0; j < reads_number; j++) {
+        outfile << "r" << j;
+        if (j < reads_number - 1) {
+            outfile << ",";
+        }
+    }
+    outfile << "\n";
+
+    // Keep a single row buffer rather than a dense minimizers x reads matrix,
+    // so memory stays proportional to reads_number
+    std::vector<int> row(reads_number > 0 ? reads_number : 0, 0);
+    std::size_t ignored_ids = 0;
+
+    // Write the matrix to the output file in CSV format, one minimizer per row
+    for (std::size_t i = 0; i < minimizer_list.size(); i++) {
+        // Mark the reads that contain the current minimizer
+        row.assign(row.size(), 0);
+        for (int read_id : minimizer_list[i]) {
+            if (read_id < 0 || read_id >= reads_number) {
+                // No column exists for this id; indexing with it would be out of bounds
+                ignored_ids++;
+                continue;
+            }
+            row[read_id] = 1;
+        }
+
+        outfile << "m" << i << ",";
+        for (int j = 0; j < reads_number; j++) {
+            outfile << row[j];
+            if (j < reads_number - 1) {
+                outfile << ",";
+            }
+        }
+        outfile << "\n";
+    }
+
+    // Close the output file
+    outfile.close();
+
+    // Report the read ids that could not be placed in the matrix
+    if (ignored_ids > 0) {
+        std::cerr << "Warning: " << ignored_ids << " read ids outside the expected range were ignored" << std::endl;
+    }
+
     // End the timer and print the elapsed time
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed = end - start;
-    std::cout << elapsed.count() << " min_list of graph" << std::endl;
+    std::cout << elapsed.count() << "s for minlist_to_csv" << std::endl;
 }
 
 
 
 
-int main() {
-    std::string filename = "partition_C/shuffled_reads.fastq";
-    std::vector<std::vector<int>> minimizer_list = get_minimizers(filename);
-    minlist_to_graph(10000, minimizer_list, "partition_C/reads.graph");
+/**
+ * Entry point: builds the read graph of a FASTQ file.
+ *
+ * Usage: <program> <input_fastq> <output_file>
+ *
+ * @return 0 on success, 1 on a usage error or if the input cannot be opened
+ */
+int main(int argc, char* argv[]) {
+
+    // Check if the input and output file paths are provided as arguments
+    if (argc != 3) {
+        std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_file>" << std::endl;
+        return 1;
+    }
+
+    // Get the input and output file paths from the arguments
+    std::string input_fastq = argv[1];
+    std::string output_file = argv[2];
+
+    // Open the fastq; an unreadable input would otherwise silently count as zero reads
+    std::ifstream file(input_fastq);
+    if (!file) {
+        std::cerr << "Error: cannot open input file " << input_fastq << std::endl;
+        return 1;
+    }
+
+    // Get the number of sequences from the fastq by counting the lines starting with '@'
+    // NOTE(review): a quality line may also start with '@', which would inflate the
+    // count; confirm this matches how get_minimizers numbers the reads
+    std::string line;
+    int seq_number = 0;
+
+    while (std::getline(file, line)) {
+        if (!line.empty() && line[0] == '@') {
+            seq_number++;
+        }
+    }
+    file.close();
+
+    // Extract the minimizers from the input file
+    std::vector<std::vector<int>> minimizer_list = get_minimizers(input_fastq);
+
+    // Write the read graph to the output file (the CSV export is kept disabled)
+    //minlist_to_csv(minimizer_list, seq_number, output_file);
+    minlist_to_graph(minimizer_list, seq_number, output_file);
 
     return 0;
 }