Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 987be5d9 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

init, c++ version of pre metis graph generation

parent 2983d26c
Branches
No related tags found
No related merge requests found
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <bitset>
#include <unordered_map>
#include <cmath>
#include <algorithm>
#include <chrono>
std::string reverse_complement(const std::string& sequence) {
std::string rc_sequence = sequence;
std::transform(rc_sequence.begin(), rc_sequence.end(), rc_sequence.begin(), [](char c) {
switch (c) {
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
default: return c;
}
});
std::reverse(rc_sequence.begin(), rc_sequence.end());
return rc_sequence;
}
// Function to convert a minimizer to the associated number
int minimizer_to_number(const std::string& minimizer) {
std::unordered_map<char, std::string> base_to_num_dict = {{'A', "00"}, {'C', "01"}, {'G', "10"}, {'T', "11"}};
std::string number_bin = "";
for (char base : minimizer) {
number_bin += base_to_num_dict[base];
}
int number = std::bitset<32>(number_bin).to_ulong();
return number;
}
// Function to read FASTQ file and return a vector of sequences
std::vector<std::string> read_fastq(const std::string& filename) {
std::ifstream file(filename);
std::vector<std::string> sequences;
std::string line;
if (!file.is_open()) {
std::cerr << "Error opening file: " << filename << std::endl;
return sequences;
}
// Skip the header line
while (std::getline(file, line)) {
// Read the sequence line
std::getline(file, line);
sequences.push_back(line);
// Skip the quality score lines
std::getline(file, line);
std::getline(file, line);
}
file.close();
return sequences;
}
// Function to get a vector of minimizers from the read file
std::vector<std::vector<int>> get_minimizers(const std::string& filename) {
// Start the timer
auto start = std::chrono::high_resolution_clock::now();
int window_size = 10; // window to look for the minimizer
int minimizer_size = 6; // length of minimizer kmer
int minimizers_number = std::pow(4, minimizer_size); // 4 power length of minimizers
std::vector<std::vector<int>> minimizer_list(minimizers_number); // each minimizer possible is associated to an index in the list,
// list at index i = list of reads that contains minimizer i
std::vector<std::string> sequences = read_fastq(filename);
for (int read_id = 0; read_id < sequences.size(); read_id++) {
const std::string& read_seq = sequences[read_id];
int sequence_size = read_seq.size();
std::string read_seq_rv = reverse_complement(read_seq); // reverse complement of the read sequence
std::string last_minimizer_added = std::string(minimizer_size, 'Z'); // initialize last minimizer added to a string of 'Z'
for (int i = 0; i <= sequence_size - window_size; i++) {
std::string sub_fw = read_seq.substr(i, window_size); // forward substring of length window_size
std::string sub_rv = read_seq_rv.substr(sequence_size - window_size - i, window_size); // reverse substring of length window_size
std::string minimizer = std::string(minimizer_size, 'Z'); // initialize minimizer to a string of 'Z'
for (int j = 0; j <= window_size - minimizer_size; j++) { // this is far from an optimal search
std::string sub_minimizer = sub_fw.substr(j, minimizer_size); // forward substring of length minimizer_size
if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer
minimizer = sub_minimizer;
}
sub_minimizer = sub_rv.substr(j, minimizer_size); // reverse substring of length minimizer_size
if (sub_minimizer < minimizer) { // if sub_minimizer is smaller than current minimizer, update minimizer
minimizer = sub_minimizer;
}
}
if (minimizer != last_minimizer_added) { // avoid duplicates
int minimizer_index = minimizer_to_number(minimizer); // convert minimizer to number
minimizer_list[minimizer_index].push_back(read_id); // add read_id to the list of reads that contains minimizer
last_minimizer_added = minimizer; // update last minimizer added
}
}
}
// End the timer and print the elapsed time
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << elapsed.count() << " for minimizer list" << std::endl;
return minimizer_list;
}
void minlist_to_graph(int reads_number, const std::vector<std::vector<int>>& minimizer_list, const std::string& graph_file_path) {
// Create a vector of vectors to store the graph
std::vector<std::vector<int>> graph_lines(reads_number);
// Start the timer
auto start = std::chrono::high_resolution_clock::now();
// Iterate over the minimizer list
for (const auto& neighbours_list : minimizer_list) {
// Iterate over the neighbours list
for (int i = 0; i < neighbours_list.size(); i++) {
int read_id = neighbours_list[i];
// Add the neighbours to the graph
for (int j = 0; j < neighbours_list.size(); j++) {
if (j != i) {
graph_lines[read_id].push_back(neighbours_list[j] + 1);
}
}
}
}
// Calculate the number of edges in the graph
int edge_number = 0;
for (const auto& neighbours_list_by_id : graph_lines) {
edge_number += neighbours_list_by_id.size();
}
edge_number /= 2;
// Open the output file
std::ofstream output_graph_file(graph_file_path);
// Write the number of vertices and edges to the output file
output_graph_file << reads_number << " " << edge_number << std::endl;
// Write the graph to the output file
for (const auto& neighbours_list_by_id : graph_lines) {
for (int i = 0; i < neighbours_list_by_id.size(); i++) {
output_graph_file << neighbours_list_by_id[i];
if (i != neighbours_list_by_id.size() - 1) {
output_graph_file << " ";
}
}
output_graph_file << std::endl;
}
// End the timer and print the elapsed time
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << elapsed.count() << " min_list of graph" << std::endl;
}
int main() {
std::string filename = "partition_C/shuffled_reads.fastq";
std::vector<std::vector<int>> minimizer_list = get_minimizers(filename);
minlist_to_graph(10000, minimizer_list, "partition_C/reads.graph");
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment