diff --git a/partitioning/partitioning.cpp b/partitioning/partitioning.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b6a18c536b2338715adce9da4e864274860683d1 --- /dev/null +++ b/partitioning/partitioning.cpp @@ -0,0 +1,183 @@ +#include <iostream> +#include <fstream> +#include <string> +#include <vector> +#include <bitset> +#include <unordered_map> +#include <cmath> +#include <algorithm> +#include <chrono> + + +std::string reverse_complement(const std::string& sequence) { + std::string rc_sequence = sequence; + std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { + switch (c) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + default: return c; + } + }); + std::reverse(rc_sequence.begin(), rc_sequence.end()); + return rc_sequence; +} + +// Function to convert a minimizer to the associated number +int minimizer_to_number(const std::string& minimizer) { + std::unordered_map<char, std::string> base_to_num_dict = {{'A', "00"}, {'C', "01"}, {'G', "10"}, {'T', "11"}}; + std::string number_bin = ""; + for (char base : minimizer) { + number_bin += base_to_num_dict[base]; + } + int number = std::bitset<32>(number_bin).to_ulong(); + + return number; +} + +// Function to read FASTQ file and return a vector of sequences +std::vector<std::string> read_fastq(const std::string& filename) { + std::ifstream file(filename); + std::vector<std::string> sequences; + std::string line; + + if (!file.is_open()) { + std::cerr << "Error opening file: " << filename << std::endl; + return sequences; + } + + // Skip the header line + while (std::getline(file, line)) { + + // Read the sequence line + std::getline(file, line); + sequences.push_back(line); + + // Skip the quality score lines + std::getline(file, line); + std::getline(file, line); + } + + file.close(); + + return sequences; +} + +// Function to get a vector of minimizers from the read file +std::vector<std::vector<int>> get_minimizers(const std::string& filename) { + + // Start the timer + auto start = std::chrono::high_resolution_clock::now(); + + int window_size = 10; // window to look for the minimizer + int minimizer_size = 6; // length of minimizer kmer + int minimizers_number = std::pow(4, minimizer_size); // 4 power length of minimizers + std::vector<std::vector<int>> minimizer_list(minimizers_number); // each minimizer possible is associated to an index in the list, + // list at index i = list of reads that contains minimizer i + + std::vector<std::string> sequences = read_fastq(filename); + + for (int read_id = 0; read_id < sequences.size(); read_id++) { + const std::string& read_seq = sequences[read_id]; + int sequence_size = read_seq.size(); + std::string read_seq_rv = reverse_complement(read_seq); // reverse complement of the read sequence + std::string last_minimizer_added = std::string(minimizer_size, 'Z'); // initialize last minimizer added to a string of 'Z' + + for (int i = 0; i <= sequence_size - window_size; i++) { + std::string sub_fw = read_seq.substr(i, window_size); // forward substring of length window_size + std::string sub_rv = read_seq_rv.substr(sequence_size - window_size - i, window_size); // reverse substring of length window_size + + std::string minimizer = std::string(minimizer_size, 'Z'); // initialize minimizer to a string of 'Z' + + for (int j = 0; j <= window_size - minimizer_size; j++) { // this is far from an optimal search + std::string sub_minimizer = sub_fw.substr(j, minimizer_size); // forward substring of length minimizer_size + if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer + minimizer = sub_minimizer; + } + + sub_minimizer = sub_rv.substr(j, minimizer_size); // reverse substring of length minimizer_size + if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer + minimizer = sub_minimizer; + } + } + + if (minimizer != last_minimizer_added) { // avoid duplicates + int minimizer_index = minimizer_to_number(minimizer); // convert minimizer to number + minimizer_list[minimizer_index].push_back(read_id); // add read_id to the list of reads that contains minimizer + last_minimizer_added = minimizer; // update last minimizer added + } + } + } + + // End the timer and print the elapsed time + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed = end - start; + std::cout << elapsed.count() << " for minimizer list" << std::endl; + + return minimizer_list; +} + + +void minlist_to_graph(int reads_number, const std::vector<std::vector<int>>& minimizer_list, const std::string& graph_file_path) { + + // Create a vector of vectors to store the graph + std::vector<std::vector<int>> graph_lines(reads_number); + + // Start the timer + auto start = std::chrono::high_resolution_clock::now(); + + // Iterate over the minimizer list + for (const auto& neighbours_list : minimizer_list) { + // Iterate over the neighbours list + for (int i = 0; i < neighbours_list.size(); i++) { + int read_id = neighbours_list[i]; + // Add the neighbours to the graph + for (int j = 0; j < neighbours_list.size(); j++) { + if (j != i) { + graph_lines[read_id].push_back(neighbours_list[j] + 1); + } + } + } + } + + // Calculate the number of edges in the graph + int edge_number = 0; + for (const auto& neighbours_list_by_id : graph_lines) { + edge_number += neighbours_list_by_id.size(); + } + edge_number /= 2; + + // Open the output file + std::ofstream output_graph_file(graph_file_path); + + // Write the number of vertices and edges to the output file + output_graph_file << reads_number << " " << edge_number << std::endl; + + // Write the graph to the output file + for (const auto& neighbours_list_by_id : graph_lines) { + for (int i = 0; i < neighbours_list_by_id.size(); i++) { + output_graph_file << neighbours_list_by_id[i]; + if (i != neighbours_list_by_id.size() - 1) { + output_graph_file << " "; + } + } + output_graph_file << std::endl; + } + + // End the timer and print the elapsed time + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed = end - start; + std::cout << elapsed.count() << " min_list of graph" << std::endl; +} + + +int main() { + std::string filename = "partition_C/shuffled_reads.fastq"; + std::vector<std::vector<int>> minimizer_list = get_minimizers(filename); + + minlist_to_graph(10000, minimizer_list, "partition_C/reads.graph"); + + + return 0; +}