diff --git a/reads_consensus.py b/reads_consensus.py index eb9e929dfdbf50d342f11dfc286ceeb174376d30..0a58291150bc13721f6096d2c2db1b7ea325d9fa 100755 --- a/reads_consensus.py +++ b/reads_consensus.py @@ -14,7 +14,7 @@ import dna_file_reader as dfr # used to get the consensus sequence from a .fastq reads file of an ordered assembly -KMER_SIZE = 30 # arbitrary constant, increase the risk of finding loop if too short +KMER_SIZE = 35 # arbitrary constant, increase the risk of finding loop if too short def pre_filter(sequence: str) -> bool: @@ -50,7 +50,7 @@ def count_kmers_dsk(input_path: str, min_occ: int) -> dict: returns the dict of kmer occurrences filtered with a threshold of minimum occurrence """ #start = time.time() - if os.stat(input_path).st_size == 0: + if not os.path.isfile(input_path) or os.stat(input_path).st_size == 0: #print(input_path,"is an empty file") return {} @@ -129,10 +129,10 @@ def build_result_sequence(total_path, current_kmer, stop_kmer, kmer_occurrences_ return path_list # try the second path if it has a non negligible weight compared to best path and is not a loop - if second_next_weight > best_next_weight/2 and not overlap_kmers_dict.get(second_next_kmer, False): #best_next_weight/2 - overlap_kmers_dict[second_next_kmer] = True - path_list = build_result_sequence(total_path + second_next_kmer[-1], second_next_kmer, stop_kmer, kmer_occurrences_dict, overlap_kmers_dict.copy(), path_list) - overlap_kmers_dict[second_next_kmer] = False + #if second_next_weight > best_next_weight/2 and not overlap_kmers_dict.get(second_next_kmer, False): + # overlap_kmers_dict[second_next_kmer] = True + # path_list = build_result_sequence(total_path + second_next_kmer[-1], second_next_kmer, stop_kmer, kmer_occurrences_dict, overlap_kmers_dict.copy(), path_list) + # overlap_kmers_dict[second_next_kmer] = False # the chosen next base is placed at the right of the result sequence @@ -351,6 +351,7 @@ def kmer_consensus(input_path: str, output_path: str, start_sequence: str, stop_ # build the resulting sequence from the dictionary of following bases result_sequence_list = build_with_known_extremities(start_sequence, stop_sequence, kmer_occurrences_dict) #result_sequence = build_from_start(start_sequence, kmer_occurrences_dict) + print(result_sequence_list) #result_sequence = build_without_extremities(kmer_occurrences_dict) #build_time = time.time() - start