diff --git a/kmer_counting_dsk.py b/kmer_counting_dsk.py
new file mode 100755
index 0000000000000000000000000000000000000000..f5793915f92a6a5c9cbd2c7bba2b56d0baa5e52e
--- /dev/null
+++ b/kmer_counting_dsk.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+
+import os
+import sys
+import inspect
+import subprocess
+
+
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+sys.path.insert(0, os.path.dirname(currentdir)+"/synthesis_modules")
+import dna_file_reader as dfr
+
+
+def count_kmers(input_path: str, kmer_size: int, min_occ: int) -> dict:
+    """
+    use the dsk tool to count the occurrences of each kmers in the reads
+    returns the dict of kmer occurrences filtered with a threshold of minimum occurrence
+    """
+    #start = time.time()
+    if not os.path.isfile(input_path) or os.stat(input_path).st_size == 0:
+        #print(input_path,"is an empty file")
+        return {}
+
+
+    dsk_script_path = currentdir+"/dsk/build/bin/dsk"
+    dsk_tempfile_path = currentdir+"/dsk/tmp/kmer_count"
+    dsk_fasta_input = currentdir+"/dsk/tmp/reads.fasta"
+    # dsk need a .fasta file in input, so the .fastq we have is converted in the tmp dir
+    # a fastq is supposed to work in input for dsk, but using it causes a weird behavior where only a part of the reads are processed
+
+    dfr.fastq_to_fasta(input_path, dsk_fasta_input)
+
+    kmer_count_command = dsk_script_path+' -file '+dsk_fasta_input+' -out '+dsk_tempfile_path+' -kmer-size '+ str(kmer_size)+' -abundance-min '+str(min_occ)+' -verbose 0'
+    subprocess.call('/bin/bash -c "$DSK"', shell=True, env={'DSK': kmer_count_command})
+
+    # convert the result into a txt file -> lines of "kmer occurrences"
+    dsk_ascii_script_path = currentdir+"/dsk/build/bin/dsk2ascii" # -file dsk_tempfile_path -out test.txt
+    dsk_tempfile_txt_path = currentdir+"/dsk/tmp/kmer_count.txt"
+
+    count_convert_command = dsk_ascii_script_path+' -file '+dsk_tempfile_path+'.h5 -out '+dsk_tempfile_txt_path+' -verbose 0'
+    subprocess.call('/bin/bash -c "$CONVERT"', shell=True, env={'CONVERT': count_convert_command})
+
+    #count_time = time.time() - start
+    #print("\ncounting",count_time,"seconds")
+    #start = time.time()
+
+    kmer_occurrences_dict = {}
+    # parse the file of kmer occurrences and save the lines in a dictionary
+    count_file = open(dsk_tempfile_txt_path)
+    line = count_file.readline()
+    while line != "":
+        kmer_sequence, occurrences_str = line.split(" ")
+        occurrences = int(occurrences_str)
+        kmer_occurrences_dict[kmer_sequence] = occurrences
+        # also save the occurrences for the reverse complement of the kmer since dsk use canonical representation of kmers
+        kmer_occurrences_dict[dfr.reverse_complement(kmer_sequence)] = occurrences
+        line = count_file.readline()
+    count_file.close()
+
+    #print("\nloading",time.time() - start,"seconds")
+
+    return kmer_occurrences_dict
+
diff --git a/reads_consensus.py b/reads_consensus.py
index 0a58291150bc13721f6096d2c2db1b7ea325d9fa..a4630189eb02c952884ed50dfb20ea15273d60eb 100755
--- a/reads_consensus.py
+++ b/reads_consensus.py
@@ -44,58 +44,6 @@ def pre_filter(sequence: str) -> bool:
     return False
 
 
-
-def count_kmers_dsk(input_path: str, min_occ: int) -> dict:
-    """
-    use the dsk tool to count the occurrences of each kmers in the reads
-    returns the dict of kmer occurrences filtered with a threshold of minimum occurrence
-    """
-    #start = time.time()
-    if not os.path.isfile(input_path) or os.stat(input_path).st_size == 0:
-        #print(input_path,"is an empty file")
-        return {}
-
-
-    dsk_script_path = currentdir+"/dsk/build/bin/dsk"
-    dsk_tempfile_path = currentdir+"/dsk/tmp/kmer_count"
-    dsk_fasta_input = currentdir+"/dsk/tmp/reads.fasta"
-    # dsk need a .fasta file in input, so the .fastq we have is converted in the tmp dir
-    # a fastq is supposed to work in input for dsk, but using it causes a weird behavior where only a part of the reads are processed
-
-    dfr.fastq_to_fasta(input_path, dsk_fasta_input)
-
-    kmer_count_command = dsk_script_path+' -file '+dsk_fasta_input+' -out '+dsk_tempfile_path+' -kmer-size '+ str(KMER_SIZE)+' -abundance-min '+str(min_occ)+' -verbose 0'
-    subprocess.call('/bin/bash -c "$DSK"', shell=True, env={'DSK': kmer_count_command})
-
-    # convert the result into a txt file -> lines of "kmer occurrences"
-    dsk_ascii_script_path = currentdir+"/dsk/build/bin/dsk2ascii" # -file dsk_tempfile_path -out test.txt
-    dsk_tempfile_txt_path = currentdir+"/dsk/tmp/kmer_count.txt"
-
-    count_convert_command = dsk_ascii_script_path+' -file '+dsk_tempfile_path+'.h5 -out '+dsk_tempfile_txt_path+' -verbose 0'
-    subprocess.call('/bin/bash -c "$CONVERT"', shell=True, env={'CONVERT': count_convert_command})
-
-    #count_time = time.time() - start
-    #print("\ncounting",count_time,"seconds")
-    #start = time.time()
-
-    kmer_occurrences_dict = {}
-    # parse the file of kmer occurrences and save the lines in a dictionary
-    count_file = open(dsk_tempfile_txt_path)
-    line = count_file.readline()
-    while line != "":
-        kmer_sequence, occurrences_str = line.split(" ")
-        occurrences = int(occurrences_str)
-        kmer_occurrences_dict[kmer_sequence] = occurrences
-        # also save the occurrences for the reverse complement of the kmer since dsk use canonical representation of kmers
-        kmer_occurrences_dict[dfr.reverse_complement(kmer_sequence)] = occurrences
-        line = count_file.readline()
-    count_file.close()
-
-    #print("\nloading",time.time() - start,"seconds")
-
-    return kmer_occurrences_dict
-
-
 def build_result_sequence(total_path, current_kmer, stop_kmer, kmer_occurrences_dict, overlap_kmers_dict, path_list) -> list:
 
     while current_kmer != stop_kmer:
@@ -343,7 +291,7 @@ def kmer_consensus(input_path: str, output_path: str, start_sequence: str, stop_
    #start = time.time()
 
     # count the occurrences of kmers in the reads
-    kmer_occurrences_dict = count_kmers_dsk(input_path, min_occ)
+    kmer_occurrences_dict = dsk.count_kmers(input_path, KMER_SIZE, min_occ)
 
     #count_time = time.time() - start
     #start = time.time()
diff --git a/reads_consensus_class.py b/reads_consensus_class.py
index df93149c83dc1a666833572425a6d2da667faeef..d32702b58b3a18a6c15bbdc62eac02df51a4af7c 100755
--- a/reads_consensus_class.py
+++ b/reads_consensus_class.py
@@ -8,6 +8,7 @@
 import time
 import argparse
 import graphviz
+import kmer_counting_dsk as dsk
 
 currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
 sys.path.insert(0, os.path.dirname(currentdir)+"/synthesis_modules")
@@ -390,55 +391,6 @@ def pre_filter(sequence: str) -> bool:
     return False
 
 
-def count_kmers_dsk(input_path: str, kmer_size: int, min_occ: int) -> dict:
-    """
-    use the dsk tool to count the occurrences of each kmers in the reads
-    returns the dict of kmer occurrences filtered with a threshold of minimum occurrence
-    """
-    #start = time.time()
-    if not os.path.isfile(input_path) or os.stat(input_path).st_size == 0:
-        print(input_path,"doesn't exist or is an empty file")
-        return {}
-
-    dsk_script_path = currentdir+"/dsk/build/bin/dsk"
-    dsk_tempfile_path = currentdir+"/dsk/tmp/kmer_count"
-    dsk_fasta_input = currentdir+"/dsk/tmp/reads.fasta"
-    # dsk need a .fasta file in input, so the .fastq we have is converted in the tmp dir
-    # a fastq is supposed to work in input for dsk, but using it causes a weird behavior where only a part of the reads are processed
-
-    dfr.fastq_to_fasta(input_path, dsk_fasta_input)
-
-    kmer_count_command = dsk_script_path+' -file '+dsk_fasta_input+' -out '+dsk_tempfile_path+' -kmer-size '+ str(kmer_size)+' -abundance-min '+str(min_occ)+' -verbose 0'
-    subprocess.call('/bin/bash -c "$DSK"', shell=True, env={'DSK': kmer_count_command})
-
-    # convert the result into a txt file -> lines of "kmer occurrences"
-    dsk_ascii_script_path = currentdir+"/dsk/build/bin/dsk2ascii" # -file dsk_tempfile_path -out test.txt
-    dsk_tempfile_txt_path = currentdir+"/dsk/tmp/kmer_count.txt"
-
-    count_convert_command = dsk_ascii_script_path+' -file '+dsk_tempfile_path+'.h5 -out '+dsk_tempfile_txt_path+' -verbose 0'
-    subprocess.call('/bin/bash -c "$CONVERT"', shell=True, env={'CONVERT': count_convert_command})
-
-    #count_time = time.time() - start
-    #print("\ncounting",count_time,"seconds")
-    #start = time.time()
-
-    kmer_occurrences_dict = {}
-    # parse the file of kmer occurrences and save the lines in a dictionary
-    count_file = open(dsk_tempfile_txt_path)
-    line = count_file.readline()
-    while line != "":
-        kmer_sequence, occurrences_str = line.split(" ")
-        occurrences = int(occurrences_str)
-        kmer_occurrences_dict[kmer_sequence] = occurrences
-        # also save the occurrences for the reverse complement of the kmer since dsk use canonical representation of kmers
-        kmer_occurrences_dict[dfr.reverse_complement(kmer_sequence)] = occurrences
-        line = count_file.readline()
-    count_file.close()
-
-    #print("\nloading",time.time() - start,"seconds")
-
-    return kmer_occurrences_dict
-
 
 def build_with_known_extremities(start_primer: str, stop_primer: str, kmer_occurrences_dict: dict) -> dict:
     """
@@ -501,7 +453,7 @@ def kmer_consensus(input_path: str, output_path: str, start_primer: str, stop_pr
     #start = time.time()
 
     # count the occurrences of kmers in the reads
-    kmer_occurrences_dict = count_kmers_dsk(input_path, kmer_size, min_occ)
+    kmer_occurrences_dict = dsk.count_kmers(input_path, kmer_size, min_occ)
 
     if kmer_size < len(start_primer):
         start_primer = start_primer[:KMER_SIZE]