diff --git a/post_processing.py b/post_processing.py
index 72339c292e99e38c381d1eaed4469ccfe8a438a0..6703f71fe98a07a0c8a66e8072e89270d47e739b 100755
--- a/post_processing.py
+++ b/post_processing.py
@@ -12,12 +12,6 @@
 do the opposite of pre_processing.py
 take dna sequences and convert them into files
 """
-def compressed_name(file_path):
-    return file_path+".gz"
-
-def uncompressed_name(file_path):
-    return file_path.replace(".gz", "")
-
 
 def convert_to_binary(input_dir_path, compressed_dir_path):
     """
@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path):
 
     for filename in os.listdir(input_dir_path):
         file_path = os.path.join(input_dir_path, filename)
-        result_file_path = os.path.join(compressed_dir_path, compressed_name(filename))
+        result_file_path = os.path.join(compressed_dir_path, pre_processing.get_compressed_name(filename))
 
         # checking if it is a file
         if os.path.isfile(file_path):
@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path):
     """
 
     for filename in os.listdir(compressed_dir_path):
         file_path = os.path.join(compressed_dir_path, filename)
-        result_file_path = os.path.join(uncompressed_dir_path, uncompressed_name(filename))
+        uncompressed_file_path = os.path.join(uncompressed_dir_path, pre_processing.get_uncompressed_name(filename))
 
         # checking if it is a file
         if os.path.isfile(file_path):
-            pre_processing.unzip_file(file_path)
-            # move the unzipped file
-            os.replace(uncompressed_name(file_path), result_file_path)
+            pre_processing.unzip_file(file_path, uncompressed_file_path)
 
         elif os.path.isdir(file_path):
             print("error post processing (uncompress_files) : directory found in compressed_dir_path", filename)
diff --git a/pre_processing.py b/pre_processing.py
index dfbc2640c08912c832bff01b05eb545cbcab995a..53c645273972aace9049e95ee43769e59351ac30 100755
--- a/pre_processing.py
+++ b/pre_processing.py
@@ -17,28 +17,72 @@
 splitting files that are too large for 1 assembly
 """
 
-def zip_file(file_path, output_path):
+def zip_file(file_path, output_path, compression_type="gzip"):
     """
-    split the file with gzip and write it at the output path
+    compress the file and write it at the output path
     """
-    compression_command = "gzip -c9 "+ file_path + " > "+output_path+".gz"
-    subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
+
+    if compression_type == "gzip":
+        compression_command = "gzip -c9 "+ file_path + " > "+output_path
+        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
+        return
+
+    if compression_type == "cmix":
+        if not os.path.isfile(output_path):
+            compression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -c "+ file_path + " "+output_path
+            subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
+        else:
+            print("already done",output_path)
+        return
+    # type not supported
+    print("compression error, unknown format:",compression_type)
+    exit(0)
 
-def unzip_file(file_path):
+def unzip_file(file_path, output_path, compression_type="gzip"):
     """
-    unzip the file and write it just where it is
+    uncompress the file and write it at the output path
     """
-    decompression_command = "gzip -d "+ file_path
-    subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
-
-
-def compressed_name(file_path):
-    return file_path+".gz"
-
-def uncompressed_name(file_path):
-    return file_path.replace(".gz", "")
+
+    if compression_type == "gzip":
+        decompression_command = "gzip -d "+ file_path
+        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
+        # move the unzipped file to the defined output path
+        os.replace(get_uncompressed_name(file_path, "gzip"), output_path)
+        return
+
+    if compression_type == "cmix":
+        decompression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -d "+ file_path + " "+output_path
+        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
+        return
+    # type not supported
+    print("decompression error, unknown format:",compression_type)
+    exit(0)
+
+def get_compressed_name(file_path, compression_type="gzip"):
+
+    if compression_type == "gzip":
+        return file_path+".gz"
+
+    if compression_type == "cmix":
+        return file_path+".cx"
+    # type not supported
+    print("get_compressed_name error, unknown format:",compression_type)
+    exit(0)
+
+
+def get_uncompressed_name(file_path, compression_type="gzip"):
+
+    if compression_type == "gzip":
+        return file_path.replace(".gz", "")
+
+    if compression_type == "cmix":
+        return file_path.replace(".cx", "")
+    # type not supported
+    print("get_uncompressed_name error, unknown format:",compression_type)
+    exit(0)
+
 
 def insert_path_in_files(input_dir_path: str, rearanged_files_dir_path: str) -> None:
     """
@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
 
         # compress the file
         file_path = os.path.join(rearanged_files_dir_path, filename)
-        compressed_file_path = os.path.join(compressed_dir_path, filename)
+        comp_file_path = get_compressed_name(os.path.join(compressed_dir_path, filename))
 
         # checking if it is a file
         if os.path.isfile(file_path):
-            zip_file(file_path, compressed_file_path)
+            zip_file(file_path, comp_file_path)
 
         elif os.path.isdir(file_path):
            print("error pre processing (compress_all) : directory found in rearanged dir path", filename)
            exit(0)
 
         # get binary size of the compressed file
-        binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_file_path)))
+        binary_len = len(file_to_dna.convert_file_to_bits(comp_file_path))
 
         if binary_len <= max_binary_length: # if acceptable length, it's perfect
             files_compressed_size[filename] = binary_len # save the compressed size for this file
@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
 
         else: # file too large, nedd to split it
-            os.remove(compressed_name(compressed_file_path)) # delete the compressed file
+            os.remove(comp_file_path) # delete the compressed file
 
             # read the original file as bytes
             with open(file_path, "rb") as input_file:
@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
 
             with open(split_file_path, "wb") as f: # write the bytes content
                 f.write(split_file_bytes_content)
+            compressed_subfile_path = get_compressed_name(get_uncompressed_name(comp_file_path) + split_file_footer)
             # compress the split_file to the compressed directory
-            zip_file(split_file_path, compressed_file_path + split_file_footer)
+            zip_file(split_file_path, compressed_subfile_path)
 
-            compressed_subfile_path = compressed_name(compressed_file_path + split_file_footer)
             # check the size of the subfile
             binary_len = len(file_to_dna.convert_file_to_bits(compressed_subfile_path))
@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
             # if the sub file is recreated, the new size will overwrite this one
 
     def merge_short_files(files_compressed_size: dict):
+        """
+        each molecule can store a maximum fixed number of bits
+        since it's more efficient to have a few long molecules than a lot of short molecules,
+        small files can be merged into the same molecule
+        """
         #sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest
 
-        files_compressed_size_bis = {}
+        #files_compressed_size_bis = {}
 
-        merged_files_paths = []
+        #merged_files_paths = []
 
-        new_merge = False # true if at least one new merge has been made
+        new_merge = False # set to true if at least one new merge has been made
 
        for i, filename in enumerate(list(files_compressed_size.keys())[:-1]):
-            unmerged = True # set to false if a merge is made
-            file_compressed_size = files_compressed_size[filename]
+            file_compressed_size = files_compressed_size[filename] # get size of the compressed file
 
-            if file_compressed_size >= max_binary_length: # impossible to merge because too large
-                continue
+            if file_compressed_size is None or file_compressed_size >= max_binary_length: # impossible to merge because too large, or already used in a merge (set to None)
+                continue # skip this file
 
             for filename_2 in list(files_compressed_size.keys())[i+1:]:
                 file_compressed_size_2 = files_compressed_size[filename_2]
-                if file_compressed_size + file_compressed_size_2 <= max_binary_length:
+                if file_compressed_size_2 is not None and file_compressed_size + file_compressed_size_2 <= max_binary_length:
+                    # the sum of the 2 compressed sizes is lower than what one molecule can store,
+                    # so the original files will be merged and recompressed;
+                    # the compression of the merged file is expected to be smaller than the sum of the compressions of each file
 
-                    # merge the 2 files
+                    # get the binary content of each file
                     with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file:
                         bytes_content = b"".join(input_file.readlines())
                     with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file:
@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
 
                     # remove the "merged_" from the name of already merged files for visibility
                     merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","")
                     merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name)
-                    with open(merged_file_path, "wb") as f: # write the bytes content
+                    compressed_merged_file_path = get_compressed_name(os.path.join(compressed_dir_path, merged_file_name))
+                    print("new merge :",filename,"and",filename_2)
+
+                    with open(merged_file_path, "wb") as f: # write the concatenated bytes content
                         f.write(bytes_content + bytes_content_2)
 
-                    compressed_merged_file_path = os.path.join(compressed_dir_path, merged_file_name)
-
-                    # add the merged file path to the bis dict with it's compressed size
+                    # compress the newly created merged file
                     zip_file(merged_file_path, compressed_merged_file_path)
-                    merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_merged_file_path)))
+                    # check its size just in case, but it should fit in a molecule
+                    merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_merged_file_path))
 
                     if merged_binary_len >= max_binary_length:
                         print("error merging result too large", compressed_merged_file_path)
@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
 
                     # add the merged file to the dict because it can still be used for other merging if it's short enough
                     files_compressed_size[merged_file_name] = merged_binary_len
 
-                    # remove the 2 compressed files of the 2 files
-                    os.remove(os.path.join(compressed_dir_path, compressed_name(filename)))
-                    os.remove(os.path.join(compressed_dir_path, compressed_name(filename_2)))
+                    # remove the 2 old compressed files of the 2 merged files
+                    os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename)))
+                    os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename_2)))
+
+                    # set the compressed size of the 2 files to None to prevent them from being reused for merging
+                    files_compressed_size[filename] = None
+                    files_compressed_size[filename_2] = None
 
-                    # set the compressed size of the 2 files to a too high number to avoid them to be reused for merging
-                    files_compressed_size[filename] = 2*max_binary_length
-                    files_compressed_size[filename_2] = 2*max_binary_length
+                    new_merge = True # keep in memory that at least one new merge has been made in this loop
 
-                    new_merge = True
-                    break
+                    break # leave the second loop, but other merges can still be made as the first loop continues
 
         # continue to try to create other merging if at least one merge has been made
-        # otherwise, the loop can end since it no longer find possible merges
         if new_merge:
             print("continue merging...")
+            print(files_compressed_size)
             merge_short_files(files_compressed_size)
-
+        # otherwise, the loop can end since it no longer finds possible merges
+
+    print(files_compressed_size)
     merge_short_files(files_compressed_size)
@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path):
 
         # checking if it is a file
         if os.path.isfile(file_path):
-            output_file_path = os.path.join(payload_fragments_dir_path, uncompressed_name(filename))
+            output_file_path = os.path.join(payload_fragments_dir_path, get_uncompressed_name(filename))
 
             dna_sequence = file_to_dna.encode_file(file_path, output_file_path) # convert binaries into a dna sequence and save result in the output file
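
Notes on the patch:

The helpers now form a symmetric API: get_compressed_name/get_uncompressed_name map file names, while zip_file/unzip_file both take an explicit output path plus an optional compression_type. A minimal round-trip sketch, not part of the patch, assuming pre_processing is importable, gzip is on PATH, and a hypothetical input/example.txt exists:

    import os
    import pre_processing

    os.makedirs("compressed", exist_ok=True)
    os.makedirs("restored", exist_ok=True)

    src = os.path.join("input", "example.txt")  # hypothetical input file
    compressed = os.path.join("compressed", pre_processing.get_compressed_name("example.txt"))  # -> compressed/example.txt.gz
    restored = os.path.join("restored", "example.txt")

    pre_processing.zip_file(src, compressed)         # gzip by default; compression_type="cmix" selects the cmix binary
    pre_processing.unzip_file(compressed, restored)  # decompresses in place, then moves the result to the output path

    with open(src, "rb") as a, open(restored, "rb") as b:
        assert a.read() == b.read()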
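For files whose compressed form exceeds max_binary_length, the patch now deletes the oversized compressed file, reads the original bytes, writes split parts, and compresses each part under a get_compressed_name(... + split_file_footer) name. The actual part-sizing logic lies outside these hunks; the equal-parts split below is an assumption, for illustration only:

    def split_bytes(content: bytes, n_parts: int) -> list:
        """Cut raw content into n roughly equal parts, so that each part,
        once compressed separately, has a chance to fit in one molecule."""
        part_size = -(-len(content) // n_parts)  # ceiling division
        return [content[i:i + part_size] for i in range(0, len(content), part_size)]

    parts = split_bytes(b"x" * 10, 3)
    assert b"".join(parts) == b"x" * 10 and len(parts) == 3  # parts of 4, 4 and 2 bytes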
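The merging pass is the subtlest change: entries of files_compressed_size switch from being set to 2*max_binary_length to being set to None once consumed, so both loops must skip None entries (hence the is not None guard in the inner condition). A self-contained sketch of the same greedy pairing, with made-up names and sizes, and without the recursion and recompression of the real code:

    MAX_BITS = 100  # stand-in for max_binary_length

    def merge_pairs(sizes: dict) -> list:
        """Greedily pair entries whose combined compressed sizes fit in one molecule;
        consumed entries are set to None so they cannot be merged twice."""
        merges = []
        names = list(sizes.keys())
        for i, a in enumerate(names[:-1]):
            if sizes[a] is None or sizes[a] >= MAX_BITS:
                continue  # already consumed, or too large to pair
            for b in names[i + 1:]:
                if sizes[b] is not None and sizes[a] + sizes[b] <= MAX_BITS:
                    merges.append((a, b))
                    sizes[a] = sizes[b] = None  # mark both entries as consumed
                    break
        return merges

    print(merge_pairs({"f1": 40, "f2": 70, "f3": 50, "f4": 30}))  # [('f1', 'f3'), ('f2', 'f4')]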
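One caveat on the renamed name helpers: file_path.replace(".gz", "") removes the extension wherever it appears in the path, so a name like "notes.gz.backup" would be mangled. A stricter variant, a suggestion only and not part of the patch, could strip a trailing suffix with str.removesuffix (Python 3.9+):

    def get_uncompressed_name_strict(file_path: str, compression_type: str = "gzip") -> str:
        # only strips the extension when it is actually at the end of the name
        suffix = {"gzip": ".gz", "cmix": ".cx"}[compression_type]
        return file_path.removesuffix(suffix)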