diff --git a/partitioning/fast_clustering.cpp b/partitioning/fast_clustering.cpp index dc7c23052e09f10d0ec541171240dd5143a40643..b7109f8be9328a6a130f5e85e343d11675cd86d7 100755 --- a/partitioning/fast_clustering.cpp +++ b/partitioning/fast_clustering.cpp @@ -57,32 +57,31 @@ std::vector<std::string> read_fastq(const std::string& filename) { std::string get_cluster_id(const std::string& read, const std::string& seq_to_find, int split_level) { /* - get the <split_level> bases following the primer in the read, check if the read is in reverse complement + get the <split_level> bases following the primer in the read, also try to find the primer in the reverse complement sequence */ - - std::string read_rc = reverse_complement(read); // reverse complement of the read sequence - double score_fw, score_rc; // alignment score with the forward and reverse complement sequence - int x_fw, y_fw, x_rc, y_rc; //x_fw and x_rc not used + int x_fw, y_fw, x_rc, y_rc; // positions start and stop of the alignments found, x_fw and x_rc not used // call local alignment algorithm on the forward sequence SmithWaterman(read, seq_to_find, score_fw, x_fw, y_fw); - // don't test for the reverse complement if near perfect alignment already found + // don't test for the reverse complement if near perfect alignment already found (maximum possible score = 10) if (score_fw >= 9.5 && read.size() > y_fw + split_level) { - return read.substr(y_fw, split_level); + return read.substr(y_fw, split_level); // extract the cluster id } + std::string read_rc = reverse_complement(read); // get reverse complement of the read sequence + // test local alignement with the reverse complement sequence SmithWaterman(read_rc, seq_to_find, score_rc, x_rc, y_rc); if (score_fw >= score_rc) { // primer found in forward sequence - if (score_fw >= 8 && read.size() > y_fw + split_level) { + if (score_fw >= 7.5 && read.size() > y_fw + split_level) { return read.substr(y_fw, split_level); } } else { // primer found in reverse complement sequence - if (score_rc >= 8 && read_rc.size() > y_rc + split_level) { + if (score_rc >= 7.5 && read_rc.size() > y_rc + split_level) { return read_rc.substr(y_rc, split_level); } } @@ -93,7 +92,7 @@ std::string get_cluster_id(const std::string& read, const std::string& seq_to_fi void clustering(const std::string& reads_path, const std::string& start_primer, int split_level, const std::string& output_dir) { - /* + /* split_lvel = nbr of bases to use for the cluster id */ //delete all fasta files in the clusters dir @@ -127,7 +126,7 @@ void clustering(const std::string& reads_path, const std::string& start_primer, std::getline(input_read_file, line); std::getline(input_read_file, line); - // find the position fo the primer and get the bases following it + // find the position for the primer and get the bases following it std::string cluster_id = get_cluster_id(sequence, seq_to_find, split_level); if (cluster_id == "None") { @@ -157,16 +156,11 @@ int main(int argc, char* argv[]) { // get the input and output paths from the arguments std::string input_fastq = argv[1]; - std::string output_dir = argv[2]; + std::string output_dir = argv[2]; // must be an existing dir // start a timer auto start = std::chrono::high_resolution_clock::now(); - std::string data_name = "100k"; - - input_fastq = "new_tests/"+data_name+"/shuffled_reads_"+data_name+".fastq"; - output_dir = "new_tests/"+data_name+"/clusters_"+data_name; - std::string start_primer = "GTTCAGAGTTCTACAGTCCGACGATCC"; clustering(input_fastq, start_primer, 3, output_dir);