diff --git a/partitioning.py b/partitioning.py new file mode 100755 index 0000000000000000000000000000000000000000..01bcf53d1bbb89fe19a6c9f5e0e68c09b6e34a5e --- /dev/null +++ b/partitioning.py @@ -0,0 +1,154 @@ +#!/usr/bin/python3 + +import os +import sys +import inspect +import time +import random + +import read_matrix as rm + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +sys.path.insert(0, os.path.dirname(currentdir)+"/synthesis_modules") +import dna_file_reader as dfr +import synthesis_simulation as ss + + + +# read fastq +# get minimizers +# hash dict key: minimizer, value: list of reads +# sort to file: line X : list of reads with common minimizer with read X + + +def minimizer_to_number(minimizer: str): + """ + convert a minimizer to the associated number + used to link to an index in the list of minimizer + """ + base_to_num_dict = {"A":"00", "C":"01", "G": "10", "T":"11"} + number_bin = "" + for base in minimizer: + number_bin += base_to_num_dict[base] + number = int(number_bin, 2) + + return number + + +def get_minimizers(reads_path: str): + """ + get a dict of minimizers from the read file + """ + #TODO minimizers that can also be found in primers should not be used + + window_size = 10 # window to look for the minimizer + minimizer_size = 6 # length of minimizer kmer + + minimizers_number = int(4**6) # 4 power lengh of minimizers + minimizer_list = [[] for _ in range(minimizers_number)] # each minimizer possible is associated to an index in the list, + # list at index i = list of reads that contains minimizer i + + reads_dict = dfr.read_fastq(reads_path) + + for read_id, read_seq in enumerate(reads_dict.values()): + sequence_size = len(read_seq) + read_seq_rv = dfr.reverse_complement(read_seq) + last_minimizer_added = "Z"*minimizer_size + + for i in range(sequence_size-window_size+1): + sub_fw = read_seq[i:i+window_size] + sub_rv = read_seq_rv[sequence_size-window_size-i:sequence_size-i] + + minimizer = "Z"*minimizer_size # default to erase with first real minimizer found + + for j in range(window_size-minimizer_size+1): # this is far from an optimal search + sub_minimizer = sub_fw[j:j+minimizer_size] + if sub_minimizer < minimizer: + minimizer = sub_minimizer + + sub_minimizer = sub_rv[j:j+minimizer_size] + if sub_minimizer < minimizer: + minimizer = sub_minimizer + + if minimizer != last_minimizer_added: # avoid duplicates + minimizer_index = minimizer_to_number(minimizer) + minimizer_list[minimizer_index].append(read_id) + last_minimizer_added = minimizer + + return minimizer_list, len(reads_dict) + + +def minlist_to_graph(reads_number, minimizer_list, graph_file_path) -> None: + + graph_lines = [[] for _ in range(reads_number)] # each line corresponds to a reads, content is other reads that shares at least 1 minimizer + + for neighbours_list in minimizer_list: + if len(neighbours_list) > 1: + for i, read_id in enumerate(neighbours_list): + read_id_neighbours = neighbours_list[:i]+neighbours_list[i+1:] + for id_neighbour in read_id_neighbours: + neighbour_name = str(id_neighbour+1) # metis want nodes names to start at 1 + #if neighbour_name not in graph_lines[read_id]: + graph_lines[read_id].append(neighbour_name) + + + edge_number = sum(len(l) for l in graph_lines) //2 + + with open(graph_file_path, "w") as output_graph_file: + output_graph_file.write(str(reads_number)+" "+str(edge_number)) + for neighbours_list_by_id in graph_lines: + output_graph_file.write("\n" +" ".join(neighbours_list_by_id)) + + + +def metis_output_to_soluce(metis_soluce_path, output_path): + + soluce_dict = {} + + with open(metis_soluce_path, "r") as metis_soluce: + line = metis_soluce.readline().replace("\n", "") + line_number = 0 + while line != "": + soluce_dict[line] = soluce_dict.get(line, []) + [line_number] + line = metis_soluce.readline().replace("\n", "") + line_number += 1 + + with open(output_path, "w") as output_soluce: + for i in range(len(soluce_dict)): + line = ", ".join(["r"+str(k) for k in soluce_dict[str(i)]]) + output_soluce.write(line+"\n") + + + #print(soluce_dict) + + +def graph_generation(reads_dir) -> None: + """ + generate a matrix of simulated reads + column corresponds to a read + line to a minimizer + cell x y = 1 if the read y contains the minimizer x, else 0 + """ + + reads_path = reads_dir +"/shuffled_reads.fastq" + graph_file_path = reads_dir +"/reads_graph.graph" + + minimizer_list, reads_number = get_minimizers(reads_path) + minlist_to_graph(reads_number, minimizer_list, graph_file_path) + + + +if __name__ == "__main__": + + + print("generate graph...") + + dir_path = "matrix_tests/matrix_10k_2/" + + #graph_generation(dir_path) + # cmd : gpmetis matrix_tests/matrix_10k_2/reads_graph.graph 226 -ufactor 300 + + metis_output_to_soluce(dir_path+"reads_graph.graph.part.30", dir_path+"/metis_soluce.txt") + + print("\tcompleted !") +