diff --git a/partitioning/partitioning.cpp b/partitioning/partitioning.cpp index c51381203fe73b71f7942b01fc90628e81b39965..dcbb25cf390fe4d54bd37896afef6f231905468b 100755 --- a/partitioning/partitioning.cpp +++ b/partitioning/partitioning.cpp @@ -15,6 +15,7 @@ int minimizer_size = 6; // length of minimizer kmer std::string reverse_complement(const std::string& sequence) { + // get the reverse complement of a sequence std::string rc_sequence = sequence; std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) { switch (c) { @@ -61,7 +62,21 @@ int minimizer_to_number(const std::string& minimizer) { }*/ -// Function to read FASTQ file and return a vector of sequences +int count_sequences(const std::string& input_fastq) { + // get the number of sequences from the fastq + std::ifstream file(input_fastq); + std::string line; + int seq_number = 0; + + while (std::getline(file, line)) { + if (line[0] == '@') { + seq_number++; + } + } + return seq_number; +} + +// function to read FASTQ file and return a vector of sequences std::vector<std::string> read_fastq(const std::string& filename) { std::ifstream file(filename); std::vector<std::string> sequences; @@ -89,46 +104,43 @@ std::vector<std::string> read_fastq(const std::string& filename) { return sequences; } -// Function to get a vector of minimizers from the read file +// function to get a vector of minimizers from the read file std::vector<std::vector<int>> get_minimizers(const std::string& filename) { - // Start the timer - auto start = std::chrono::high_resolution_clock::now(); - int minimizers_number = sequence_index_map.size(); // get the number of minimizers of the dict std::vector<std::vector<int>> minimizer_list(minimizers_number); // each minimizer possible is associated to an index in the list, // list at index i = list of reads that contains minimizer i + std::string default_minimizer = std::string(minimizer_size, 'Z'); // initialize a default minimizer a string of 'Z' + std::vector<std::string> sequences = read_fastq(filename); for (int read_id = 0; read_id < sequences.size(); read_id++) { const std::string& read_seq = sequences[read_id]; int sequence_size = read_seq.size(); + std::string read_seq_rv = reverse_complement(read_seq); // reverse complement of the read sequence - std::string last_minimizer_added = std::string(minimizer_size, 'Z'); // initialize last minimizer added to a string of 'Z' + std::string last_minimizer_added = default_minimizer; // initialize last minimizer added to a string of 'Z' for (int i = 0; i <= sequence_size - window_size; i++) { - std::string sub_fw = read_seq.substr(i, window_size); // forward substring of length window_size - std::string sub_rv = read_seq_rv.substr(sequence_size - window_size - i, window_size); // reverse substring of length window_size - std::string minimizer = std::string(minimizer_size, 'Z'); // initialize minimizer to a string of 'Z' + std::string minimizer = default_minimizer; // initialize minimizer to a string of 'Z' - for (int j = 0; j <= window_size - minimizer_size; j++) { // this is far from an optimal search - std::string sub_minimizer = sub_fw.substr(j, minimizer_size); // forward substring of length minimizer_size + for (int j = 0; j <= window_size - minimizer_size; j++) { + std::string sub_minimizer = read_seq.substr(i+j, minimizer_size); // forward substring of length minimizer_size if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer minimizer = sub_minimizer; } - sub_minimizer = sub_rv.substr(j, minimizer_size); // reverse substring of length minimizer_size + sub_minimizer = read_seq_rv.substr(sequence_size-i-j-minimizer_size, minimizer_size); // reverse substring of length minimizer_size if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer minimizer = sub_minimizer; } } if (minimizer != last_minimizer_added) { // avoid duplicates - int minimizer_index = sequence_index_map[minimizer]; // convert minimizer to number - minimizer_list[minimizer_index].push_back(read_id); // add read_id to the list of reads that contains minimizer + minimizer_list[sequence_index_map[minimizer]].push_back(read_id); // add read_id to the list of reads that contains minimizer last_minimizer_added = minimizer; // update last minimizer added } } @@ -139,22 +151,26 @@ std::vector<std::vector<int>> get_minimizers(const std::string& filename) { [](const std::vector<int>& v) { return v.empty(); }), minimizer_list.end()); - // End the timer and print the elapsed time - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration<double> elapsed = end - start; - std::cout << elapsed.count() << "s for minimizer list" << std::endl; - return minimizer_list; } -void minlist_to_graph(const std::vector<std::vector<int>>& minimizer_list, int reads_number, const std::string& graph_file_path) { - // Create a vector of lists to store the graph - std::vector<std::vector<int>> graph_lines(reads_number); +void minlist_to_graph(const std::string& input_fastq, const std::string& graph_file_path) { - // Start the timer + // start the timer auto start = std::chrono::high_resolution_clock::now(); + // get the number of sequences from the fastq + int seq_number = count_sequences(input_fastq); + + // compute the minimizers list + std::vector<std::vector<int>> minimizer_list = get_minimizers(input_fastq); + + + // create a vector of lists to store the graph + std::vector<std::vector<int>> graph_lines(seq_number); + + // Iterate over the minimizer list for (const auto& neighbours_list : minimizer_list) { // Iterate over the neighbours list @@ -185,7 +201,7 @@ void minlist_to_graph(const std::vector<std::vector<int>>& minimizer_list, int r std::ofstream output_graph_file(graph_file_path); // Write the number of vertices and edges to the output file - output_graph_file << reads_number << " " << edge_number << std::endl; + output_graph_file << seq_number << " " << edge_number << std::endl; // Write the graph to the output file for (const auto& neighbours_list_by_id : graph_lines) { @@ -202,15 +218,22 @@ void minlist_to_graph(const std::vector<std::vector<int>>& minimizer_list, int r } -void minlist_to_csv(const std::vector<std::vector<int>>& minimizer_list, int reads_number, const std::string& output_file) { - // Start the timer +void minlist_to_csv(const std::string& input_fastq, const std::string& output_file) { + + // start the timer auto start = std::chrono::high_resolution_clock::now(); + // get the number of sequences from the fastq + int seq_number = count_sequences(input_fastq); + + // compute the minimizers list + std::vector<std::vector<int>> minimizer_list = get_minimizers(input_fastq); + // Get the number of minimizers int min_list_size = minimizer_list.size(); // Create a binary matrix to store the minimizer-read relationships - std::vector<std::vector<int>> matrix(min_list_size, std::vector<int>(reads_number, 0)); + std::vector<std::vector<int>> matrix(min_list_size, std::vector<int>(seq_number, 0)); // Iterate over the minimizer list for (int i = 0; i < min_list_size; i++) { @@ -226,9 +249,9 @@ void minlist_to_csv(const std::vector<std::vector<int>>& minimizer_list, int rea // Write the column headers to the output file outfile << "min,"; - for (int j = 0; j < reads_number; j++) { + for (int j = 0; j < seq_number; j++) { outfile << "r" << j; - if (j < reads_number - 1) { + if (j < seq_number - 1) { outfile << ","; } } @@ -237,9 +260,9 @@ void minlist_to_csv(const std::vector<std::vector<int>>& minimizer_list, int rea // Write the matrix to the output file in CSV format for (int i = 0; i < min_list_size; i++) { outfile << "m" << i << ","; - for (int j = 0; j < reads_number; j++) { + for (int j = 0; j < seq_number; j++) { outfile << matrix[i][j]; - if (j < reads_number - 1) { + if (j < seq_number - 1) { outfile << ","; } } @@ -256,15 +279,22 @@ void minlist_to_csv(const std::vector<std::vector<int>>& minimizer_list, int rea } -void minlist_to_csvbm(const std::vector<std::vector<int>>& minimizer_list, int reads_number, const std::string& output_file) { - // Start the timer +void minlist_to_csvbm(const std::string& input_fastq, const std::string& output_file) { + + // start the timer auto start = std::chrono::high_resolution_clock::now(); - // Get the number of minimizers + // get the number of sequences from the fastq + int seq_number = count_sequences(input_fastq); + + // compute the minimizers list + std::vector<std::vector<int>> minimizer_list = get_minimizers(input_fastq); + + // get the number of minimizers int min_list_size = minimizer_list.size(); // Create a binary matrix to store the minimizer-read relationships - std::vector<std::vector<int>> matrix(min_list_size, std::vector<int>(reads_number, 0)); + std::vector<std::vector<int>> matrix(min_list_size, std::vector<int>(seq_number, 0)); for (int i = 0; i < min_list_size; i++) { for (int read_id : minimizer_list[i]) { @@ -272,10 +302,10 @@ void minlist_to_csvbm(const std::vector<std::vector<int>>& minimizer_list, int r } } - // Open the output file + // open the output file std::ofstream outfile(output_file); - // Write the header line + // write the header line int line_number = matrix.size(); int column_number = matrix[0].size(); int ones_counter = 0; @@ -301,10 +331,10 @@ void minlist_to_csvbm(const std::vector<std::vector<int>>& minimizer_list, int r } } - // Close the output file + // close the output file outfile.close(); - // End the timer and print the elapsed time + // end the timer and print the elapsed time auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << elapsed.count() << "s for minlist_to_csvbm" << std::endl; @@ -312,14 +342,14 @@ void minlist_to_csvbm(const std::vector<std::vector<int>>& minimizer_list, int r void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_path) { - // read the fastq file, for each read, get the corresponding binary line of minimizers and write it in the csvbm format - // input : path to a fastq read file - // output : path to a csvbm file to be created - + /* read the fastq file, for each read, get the corresponding binary line of minimizers and write it in the csvbm format + input : path to a fastq read file + output : path to a csvbm file to be created + this format represents a matrix where reads are the rows and minimizers the columns, allowing the streaming + */ // start a timer auto start = std::chrono::high_resolution_clock::now(); - std::string temp_csvbm_path = csvbm_path+"_temp"; // file to write the csvbm lines without the headder // the header content can only be computed after browsing all the reads @@ -332,6 +362,7 @@ void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_pat std::string line; int minimizers_number = sequence_index_map.size(); // get the number of minimizers of the dict + std::string default_minimizer = std::string(minimizer_size, 'Z'); // initialize a default minimizer a string of 'Z' // browse the reads file and skip the header line while (std::getline(input_read_file, line)) { @@ -340,33 +371,29 @@ void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_pat std::getline(input_read_file, line); reads_counter++; - std::vector<int> minimizer_list(minimizers_number, 0); // each minimizer possible is associated to an index in the list + std::vector<int> minimizer_list(minimizers_number, 0); // init a zeros list, each minimizer possible is associated to an index in the list int sequence_size = line.size(); std::string read_seq_rv = reverse_complement(line); // reverse complement of the read sequence - std::string last_minimizer_added = std::string(minimizer_size, 'Z'); // initialize last minimizer added to a string of 'Z' + std::string last_minimizer_added = default_minimizer; // initialize last minimizer added to 'ZZZZZ...' for (int i = 0; i <= sequence_size - window_size; i++) { - std::string sub_fw = line.substr(i, window_size); // forward substring of length window_size - std::string sub_rv = read_seq_rv.substr(sequence_size - window_size - i, window_size); // reverse substring of length window_size - - std::string minimizer = std::string(minimizer_size, 'Z'); // initialize minimizer to a string of 'Z' - for (int j = 0; j <= window_size - minimizer_size; j++) { // this is far from an optimal search - std::string sub_minimizer = sub_fw.substr(j, minimizer_size); // forward substring of length minimizer_size + std::string minimizer = default_minimizer; // initialize minimizer to 'ZZZZZ...' + for (int j = 0; j <= window_size - minimizer_size; j++) { + std::string sub_minimizer = line.substr(i+j, minimizer_size); // forward substring of length minimizer_size if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer minimizer = sub_minimizer; } - sub_minimizer = sub_rv.substr(j, minimizer_size); // reverse substring of length minimizer_size + sub_minimizer = read_seq_rv.substr(sequence_size-i-j-minimizer_size, minimizer_size); // reverse substring of length minimizer_size if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer minimizer = sub_minimizer; } } if (minimizer != last_minimizer_added) { // avoid duplicates - int minimizer_index = sequence_index_map[minimizer]; // convert minimizer to number - minimizer_list[minimizer_index] = 1; // set 1 for the index corresponding to the minimizer + minimizer_list[sequence_index_map[minimizer]] = 1; // set 1 for the index corresponding to the minimizer last_minimizer_added = minimizer; // update last minimizer added } } @@ -374,7 +401,7 @@ void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_pat std::string output_line; // Write the data in the output file for (const auto& minimizers_presence : minimizer_list) { - if (minimizers_presence == 1) { + if (minimizers_presence) { // if '1' in the corresponding index output_line += std::to_string(distance) + "\n"; ones_counter++; distance = 1; @@ -393,16 +420,17 @@ void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_pat input_read_file.close(); temp_output_file.close(); - //TODO Write the header line at the beginning of the csvbm + // write the header line at the beginning of the csvbm std::ifstream temp_input_file(temp_csvbm_path); // reopen the file to read it from the beginning std::ofstream final_output_file(csvbm_path); + // write the header // line number + columns number + "1" counter + is_reversed(=0) final_output_file << minimizers_number << " " << reads_counter << " " << ones_counter << " 0\n"; while (std::getline(temp_input_file, line)) { - final_output_file << line << "\n"; // copie du contenu du fichier d'origine dans le fichier temporaire + final_output_file << line << "\n"; // copy all the lines to the final output file } temp_input_file.close(); @@ -410,52 +438,37 @@ void csvbm_streaming(const std::string& reads_path, const std::string& csvbm_pat std::remove(temp_csvbm_path.c_str()); // remove temp file - // End the timer and print the elapsed time + // end the timer and print the elapsed time auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << elapsed.count() << "s for csvbm streaming" << std::endl; - } int main(int argc, char* argv[]) { - // Check if the input and output file paths are provided as arguments + // check if the input and output file paths are provided as arguments if (argc != 4) { std::cerr << "Usage: " << argv[0] << " <input_fastq> <output_file> [--graph|--csv|--csvbm]" << std::endl; return 1; } // ./partitioning reads.fastq reads.graph --graph - // Get the input and output file paths from the arguments + // get the input and output file paths from the arguments std::string input_fastq = argv[1]; std::string output_file = argv[2]; std::string option = argv[3]; + // read the list of all possible minimizers and the associated indexes read_sequence_index_map("minimizers_6.txt"); - csvbm_streaming(input_fastq, output_file+"_s"); - - // Extract the minimizers from the input file - std::vector<std::vector<int>> minimizer_list = get_minimizers(input_fastq); - - // get the number of sequences from the fastq - std::ifstream file(input_fastq); - std::string line; - int seq_number = 0; - - while (std::getline(file, line)) { - if (line[0] == '@') { - seq_number++; - } - } - - // Call the appropriate function based on the option + // call the appropriate function based on the option if (option == "--graph") { - minlist_to_graph(minimizer_list, seq_number, output_file); + minlist_to_graph(input_fastq, output_file); } else if (option == "--csv") { - minlist_to_csv(minimizer_list, seq_number, output_file); + minlist_to_csv(input_fastq, output_file); } else if (option == "--csvbm") { - minlist_to_csvbm(minimizer_list, seq_number, output_file); + //minlist_to_csvbm(input_fastq, output_file); + csvbm_streaming(input_fastq, output_file); } else { std::cerr << "Invalid option: " << option << std::endl; return 1;