Mentions légales du service

Skip to content
Snippets Groups Projects
Commit adaa1986 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

refactorings, comments, generalise zipping methods, improve merging small files

parent be62c47f
Branches
No related tags found
No related merge requests found
...@@ -12,12 +12,6 @@ do the opposite of pre_processing.py ...@@ -12,12 +12,6 @@ do the opposite of pre_processing.py
take dna sequences and convert them into files take dna sequences and convert them into files
""" """
def compressed_name(file_path):
    """Return the name of the gzip-compressed version of *file_path* (adds ".gz")."""
    return f"{file_path}.gz"
def uncompressed_name(file_path):
    """
    Return *file_path* with a trailing ".gz" suffix removed.

    Strips only a trailing suffix: the original str.replace(".gz", "") also
    removed ".gz" from the middle of a name (e.g. "data.gz.bak" became
    "data.bak"), corrupting such paths.
    """
    if file_path.endswith(".gz"):
        return file_path[:-len(".gz")]
    return file_path
def convert_to_binary(input_dir_path, compressed_dir_path): def convert_to_binary(input_dir_path, compressed_dir_path):
""" """
...@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path): ...@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path):
for filename in os.listdir(input_dir_path): for filename in os.listdir(input_dir_path):
file_path = os.path.join(input_dir_path, filename) file_path = os.path.join(input_dir_path, filename)
result_file_path = os.path.join(compressed_dir_path, compressed_name(filename)) result_file_path = os.path.join(compressed_dir_path, pre_processing.get_compressed_name(filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
...@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path): ...@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path):
""" """
for filename in os.listdir(compressed_dir_path): for filename in os.listdir(compressed_dir_path):
file_path = os.path.join(compressed_dir_path, filename) file_path = os.path.join(compressed_dir_path, filename)
result_file_path = os.path.join(uncompressed_dir_path, uncompressed_name(filename)) uncompressed_file_path = os.path.join(uncompressed_dir_path, pre_processing.get_uncompressed_name(filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
pre_processing.unzip_file(file_path) pre_processing.unzip_file(file_path, uncompressed_file_path)
# move the unzipped file
os.replace(uncompressed_name(file_path), result_file_path)
elif os.path.isdir(file_path): elif os.path.isdir(file_path):
print("error post processing (uncompress_files) : directory found in compressed_dir_path", filename) print("error post processing (uncompress_files) : directory found in compressed_dir_path", filename)
......
...@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly ...@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly
""" """
def zip_file(file_path, output_path, compression_type="gzip"):
    """
    Compress the file at file_path and write the result at output_path.

    :param file_path: path of the file to compress
    :param output_path: path where the compressed file is written
    :param compression_type: "gzip" (default) or "cmix"; any other value is fatal
    """
    if compression_type == "gzip":
        # NOTE(review): the path is interpolated into a shell string; names with
        # spaces or shell metacharacters will break — confirm inputs are safe
        compression_command = "gzip -c9 " + file_path + " > " + output_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
        return
    if compression_type == "cmix":
        # cmix is very slow, so skip files whose output already exists
        if not os.path.isfile(output_path):
            compression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -c " + file_path + " " + output_path
            subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
        else:
            print("already done", output_path)
        return
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("compression error, unknown format:", compression_type)
    exit(1)
def unzip_file(file_path, output_path, compression_type="gzip"):
    """
    Uncompress the file at file_path and write the result at output_path.

    :param file_path: path of the compressed file (consumed by gzip -d)
    :param output_path: path where the uncompressed file is written
    :param compression_type: "gzip" (default) or "cmix"; any other value is fatal
    """
    if compression_type == "gzip":
        # NOTE(review): the path is interpolated into a shell string; names with
        # spaces or shell metacharacters will break — confirm inputs are safe
        decompression_command = "gzip -d " + file_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
        # gzip -d writes next to the source file; move the result to the requested output path
        os.replace(get_uncompressed_name(file_path, "gzip"), output_path)
        return
    if compression_type == "cmix":
        decompression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -d " + file_path + " " + output_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
        return
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("decompression error, unknown format:", compression_type)
    exit(1)
def get_compressed_name(file_path, compression_type="gzip"):
    """
    Return the file name produced by compressing file_path.

    "gzip" appends ".gz", "cmix" appends ".cx"; any other type is fatal.
    """
    if compression_type == "gzip":
        return file_path + ".gz"
    if compression_type == "cmix":
        return file_path + ".cx"
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("get_compressed_name error, unknown format:", compression_type)
    exit(1)
def get_uncompressed_name(file_path, compression_type="gzip"):
    """
    Return the file name with the compression suffix removed.

    "gzip" strips a trailing ".gz", "cmix" a trailing ".cx"; any other type is fatal.
    Strips only a trailing suffix: the original str.replace also removed the
    extension from the middle of a name (e.g. "a.gz.part1" became "a.part1").
    """
    if compression_type == "gzip":
        suffix = ".gz"
    elif compression_type == "cmix":
        suffix = ".cx"
    else:
        # unsupported compression type: abort with a failure code
        # (was exit(0), which wrongly signalled success to the caller)
        print("get_uncompressed_name error, unknown format:", compression_type)
        exit(1)
    if file_path.endswith(suffix):
        return file_path[:-len(suffix)]
    return file_path
def insert_path_in_files(input_dir_path: str, rearanged_files_dir_path: str) -> None: def insert_path_in_files(input_dir_path: str, rearanged_files_dir_path: str) -> None:
""" """
...@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# compress the file # compress the file
file_path = os.path.join(rearanged_files_dir_path, filename) file_path = os.path.join(rearanged_files_dir_path, filename)
compressed_file_path = os.path.join(compressed_dir_path, filename) comp_file_path = get_compressed_name(os.path.join(compressed_dir_path, filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
zip_file(file_path, compressed_file_path) zip_file(file_path, comp_file_path)
elif os.path.isdir(file_path): elif os.path.isdir(file_path):
print("error pre processing (compress_all) : directory found in rearanged dir path", filename) print("error pre processing (compress_all) : directory found in rearanged dir path", filename)
exit(0) exit(0)
# get binary size of the compressed file # get binary size of the compressed file
binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_file_path))) binary_len = len(file_to_dna.convert_file_to_bits(comp_file_path))
if binary_len <= max_binary_length: # if acceptable length, it's perfect if binary_len <= max_binary_length: # if acceptable length, it's perfect
files_compressed_size[filename] = binary_len # save the compressed size for this file files_compressed_size[filename] = binary_len # save the compressed size for this file
...@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
else: else:
# file too large, need to split it # file too large, need to split it
os.remove(compressed_name(compressed_file_path)) # delete the compressed file os.remove(comp_file_path) # delete the compressed file
# read the original file as bytes # read the original file as bytes
with open(file_path, "rb") as input_file: with open(file_path, "rb") as input_file:
...@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
with open(split_file_path, "wb") as f: # write the bytes content with open(split_file_path, "wb") as f: # write the bytes content
f.write(split_file_bytes_content) f.write(split_file_bytes_content)
compressed_subfile_path = get_compressed_name(get_uncompressed_name(comp_file_path) + split_file_footer)
# compress the split_file to the compressed directory # compress the split_file to the compressed directory
zip_file(split_file_path, compressed_file_path + split_file_footer) zip_file(split_file_path, compressed_subfile_path)
compressed_subfile_path = compressed_name(compressed_file_path + split_file_footer)
# check the size of the subfile # check the size of the subfile
binary_len = len(file_to_dna.convert_file_to_bits(compressed_subfile_path)) binary_len = len(file_to_dna.convert_file_to_bits(compressed_subfile_path))
...@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# if the sub file is recreated, the new size will overwrite this one # if the sub file is recreated, the new size will overwrite this one
def merge_short_files(files_compressed_size: dict): def merge_short_files(files_compressed_size: dict):
"""
each molecule can store a maximum fixed number of bits
since it's more efficient to have a few long molecule than a lot of short molecule,
small files can be merged in one same molecule
"""
#sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest #sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest
files_compressed_size_bis = {} #files_compressed_size_bis = {}
merged_files_paths = [] #merged_files_paths = []
new_merge = False # true if at least one new merge has been made new_merge = False # set to true if at least one new merge has been made
for i, filename in enumerate(list(files_compressed_size.keys())[:-1]): for i, filename in enumerate(list(files_compressed_size.keys())[:-1]):
unmerged = True # set to false if a merge is made file_compressed_size = files_compressed_size[filename] # get size of the compressed file
file_compressed_size = files_compressed_size[filename]
if file_compressed_size >= max_binary_length: # impossible to merge because too large if file_compressed_size is None or file_compressed_size >= max_binary_length: # impossible to merge because too large, or has already be used in a merge (set to None)
continue continue # skip this file
for filename_2 in list(files_compressed_size.keys())[i+1:]: for filename_2 in list(files_compressed_size.keys())[i+1:]:
file_compressed_size_2 = files_compressed_size[filename_2] file_compressed_size_2 = files_compressed_size[filename_2]
if file_compressed_size + file_compressed_size_2 <= max_binary_length: if file_compressed_size_2 is None or file_compressed_size + file_compressed_size_2 <= max_binary_length:
# the sum of the 2 compressed files is lower than what can be stored,
# so the original files will be merged and recompressed,
# the compression of a merging is supposed to be smaller than the sum of compressions of each file
# merge the 2 files # get the binary content of each file
with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file: with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file:
bytes_content = b"".join(input_file.readlines()) bytes_content = b"".join(input_file.readlines())
with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file: with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file:
...@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# remove the "merged_" from the name of already merged files for visibility # remove the "merged_" from the name of already merged files for visibility
merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","") merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","")
merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name) merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name)
with open(merged_file_path, "wb") as f: # write the bytes content compressed_merged_file_path = get_compressed_name(os.path.join(compressed_dir_path, merged_file_name))
print("new merge :",filename,"and",filename_2)
with open(merged_file_path, "wb") as f: # write the sum of bytes content
f.write(bytes_content + bytes_content_2) f.write(bytes_content + bytes_content_2)
compressed_merged_file_path = os.path.join(compressed_dir_path, merged_file_name) # compress the merged file created
# add the merged file path to the bis dict with it's compressed size
zip_file(merged_file_path, compressed_merged_file_path) zip_file(merged_file_path, compressed_merged_file_path)
merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_merged_file_path))) # test its size just in case, but it should fit in a molecule
merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_merged_file_path))
if merged_binary_len >= max_binary_length: if merged_binary_len >= max_binary_length:
print("error merging result too large", compressed_merged_file_path) print("error merging result too large", compressed_merged_file_path)
...@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# add the merged file to the dict because it can still be used for other merging if it's short enough # add the merged file to the dict because it can still be used for other merging if it's short enough
files_compressed_size[merged_file_name] = merged_binary_len files_compressed_size[merged_file_name] = merged_binary_len
# remove the 2 compressed files of the 2 files # remove the 2 old compressed files of the 2 files
os.remove(os.path.join(compressed_dir_path, compressed_name(filename))) os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename)))
os.remove(os.path.join(compressed_dir_path, compressed_name(filename_2))) os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename_2)))
# set the compressed size of the 2 files to None to avoid them to be reused for merging
files_compressed_size[filename] = None
files_compressed_size[filename_2] = None
# set the compressed size of the 2 files to a too high number to avoid them to be reused for merging new_merge = True # keep in memory that at least one new merge has been made in this loop
files_compressed_size[filename] = 2*max_binary_length
files_compressed_size[filename_2] = 2*max_binary_length
new_merge = True break # leave the second loop, but others merges can still be done in the continuation of the first loop
break
# continue to try to create other merging if at least one merge has been made # continue to try to create other merging if at least one merge has been made
# otherwise, the loop can end since it no longer find possible merges
if new_merge: if new_merge:
print("continue merging...") print("continue merging...")
print(files_compressed_size)
merge_short_files(files_compressed_size) merge_short_files(files_compressed_size)
# otherwise, the loop can end since it no longer find possible merges
print(files_compressed_size) print(files_compressed_size)
merge_short_files(files_compressed_size) merge_short_files(files_compressed_size)
...@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path): ...@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path):
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
output_file_path = os.path.join(payload_fragments_dir_path, uncompressed_name(filename)) output_file_path = os.path.join(payload_fragments_dir_path, get_uncompressed_name(filename))
dna_sequence = file_to_dna.encode_file(file_path, output_file_path) # convert binaries into a dna sequence and save result in the output file dna_sequence = file_to_dna.encode_file(file_path, output_file_path) # convert binaries into a dna sequence and save result in the output file
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment