Mentions légales du service

Skip to content
Snippets Groups Projects
Commit adaa1986 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

refactorings, comments, generalise zipping methods, improve merging small files

parent be62c47f
Branches
No related tags found
No related merge requests found
...@@ -12,12 +12,6 @@ do the opposite of pre_processing.py ...@@ -12,12 +12,6 @@ do the opposite of pre_processing.py
take dna sequences and convert them into files take dna sequences and convert them into files
""" """
def compressed_name(file_path):
    """Return the name of the gzip-compressed version of *file_path* (adds ".gz")."""
    return f"{file_path}.gz"
def uncompressed_name(file_path):
    """
    Return *file_path* with a trailing ".gz" suffix removed.

    Strips only a trailing suffix: the original str.replace(".gz", "") also
    removed ".gz" from the middle of a name (e.g. "data.gz.bak" became
    "data.bak"), corrupting such paths.
    """
    if file_path.endswith(".gz"):
        return file_path[:-len(".gz")]
    return file_path
def convert_to_binary(input_dir_path, compressed_dir_path): def convert_to_binary(input_dir_path, compressed_dir_path):
""" """
...@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path): ...@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path):
for filename in os.listdir(input_dir_path): for filename in os.listdir(input_dir_path):
file_path = os.path.join(input_dir_path, filename) file_path = os.path.join(input_dir_path, filename)
result_file_path = os.path.join(compressed_dir_path, compressed_name(filename)) result_file_path = os.path.join(compressed_dir_path, pre_processing.get_compressed_name(filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
...@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path): ...@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path):
""" """
for filename in os.listdir(compressed_dir_path): for filename in os.listdir(compressed_dir_path):
file_path = os.path.join(compressed_dir_path, filename) file_path = os.path.join(compressed_dir_path, filename)
result_file_path = os.path.join(uncompressed_dir_path, uncompressed_name(filename)) uncompressed_file_path = os.path.join(uncompressed_dir_path, pre_processing.get_uncompressed_name(filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
pre_processing.unzip_file(file_path) pre_processing.unzip_file(file_path, uncompressed_file_path)
# move the unzipped file
os.replace(uncompressed_name(file_path), result_file_path)
elif os.path.isdir(file_path): elif os.path.isdir(file_path):
print("error post processing (uncompress_files) : directory found in compressed_dir_path", filename) print("error post processing (uncompress_files) : directory found in compressed_dir_path", filename)
......
...@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly ...@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly
""" """
def zip_file(file_path, output_path, compression_type="gzip"):
    """
    Compress the file at file_path and write the result at output_path.

    :param file_path: path of the file to compress
    :param output_path: path where the compressed file is written
    :param compression_type: "gzip" (default) or "cmix"; any other value is fatal
    """
    if compression_type == "gzip":
        # NOTE(review): the path is interpolated into a shell string; names with
        # spaces or shell metacharacters will break — confirm inputs are safe
        compression_command = "gzip -c9 " + file_path + " > " + output_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
        return
    if compression_type == "cmix":
        # cmix is very slow, so skip files whose output already exists
        if not os.path.isfile(output_path):
            compression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -c " + file_path + " " + output_path
            subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': compression_command})
        else:
            print("already done", output_path)
        return
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("compression error, unknown format:", compression_type)
    exit(1)
def unzip_file(file_path, output_path, compression_type="gzip"):
    """
    Uncompress the file at file_path and write the result at output_path.

    :param file_path: path of the compressed file (consumed by gzip -d)
    :param output_path: path where the uncompressed file is written
    :param compression_type: "gzip" (default) or "cmix"; any other value is fatal
    """
    if compression_type == "gzip":
        # NOTE(review): the path is interpolated into a shell string; names with
        # spaces or shell metacharacters will break — confirm inputs are safe
        decompression_command = "gzip -d " + file_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
        # gzip -d writes next to the source file; move the result to the requested output path
        os.replace(get_uncompressed_name(file_path, "gzip"), output_path)
        return
    if compression_type == "cmix":
        decompression_command = "/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -d " + file_path + " " + output_path
        subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
        return
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("decompression error, unknown format:", compression_type)
    exit(1)
def get_compressed_name(file_path, compression_type="gzip"):
    """
    Return the file name produced by compressing file_path.

    "gzip" appends ".gz", "cmix" appends ".cx"; any other type is fatal.
    """
    if compression_type == "gzip":
        return file_path + ".gz"
    if compression_type == "cmix":
        return file_path + ".cx"
    # unsupported compression type: abort with a failure code
    # (was exit(0), which wrongly signalled success to the caller)
    print("get_compressed_name error, unknown format:", compression_type)
    exit(1)
def get_uncompressed_name(file_path, compression_type="gzip"):
    """
    Return the file name with the compression suffix removed.

    "gzip" strips a trailing ".gz", "cmix" a trailing ".cx"; any other type is fatal.
    Strips only a trailing suffix: the original str.replace also removed the
    extension from the middle of a name (e.g. "a.gz.part1" became "a.part1").
    """
    if compression_type == "gzip":
        suffix = ".gz"
    elif compression_type == "cmix":
        suffix = ".cx"
    else:
        # unsupported compression type: abort with a failure code
        # (was exit(0), which wrongly signalled success to the caller)
        print("get_uncompressed_name error, unknown format:", compression_type)
        exit(1)
    if file_path.endswith(suffix):
        return file_path[:-len(suffix)]
    return file_path
def insert_path_in_files(input_dir_path: str, rearanged_files_dir_path: str) -> None: def insert_path_in_files(input_dir_path: str, rearanged_files_dir_path: str) -> None:
""" """
...@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# compress the file # compress the file
file_path = os.path.join(rearanged_files_dir_path, filename) file_path = os.path.join(rearanged_files_dir_path, filename)
compressed_file_path = os.path.join(compressed_dir_path, filename) comp_file_path = get_compressed_name(os.path.join(compressed_dir_path, filename))
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
zip_file(file_path, compressed_file_path) zip_file(file_path, comp_file_path)
elif os.path.isdir(file_path): elif os.path.isdir(file_path):
print("error pre processing (compress_all) : directory found in rearanged dir path", filename) print("error pre processing (compress_all) : directory found in rearanged dir path", filename)
exit(0) exit(0)
# get binary size of the compressed file # get binary size of the compressed file
binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_file_path))) binary_len = len(file_to_dna.convert_file_to_bits(comp_file_path))
if binary_len <= max_binary_length: # if acceptable length, it's perfect if binary_len <= max_binary_length: # if acceptable length, it's perfect
files_compressed_size[filename] = binary_len # save the compressed size for this file files_compressed_size[filename] = binary_len # save the compressed size for this file
...@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
else: else:
# file too large, need to split it # file too large, need to split it
os.remove(compressed_name(compressed_file_path)) # delete the compressed file os.remove(comp_file_path) # delete the compressed file
# read the original file as bytes # read the original file as bytes
with open(file_path, "rb") as input_file: with open(file_path, "rb") as input_file:
...@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
with open(split_file_path, "wb") as f: # write the bytes content with open(split_file_path, "wb") as f: # write the bytes content
f.write(split_file_bytes_content) f.write(split_file_bytes_content)
compressed_subfile_path = get_compressed_name(get_uncompressed_name(comp_file_path) + split_file_footer)
# compress the split_file to the compressed directory # compress the split_file to the compressed directory
zip_file(split_file_path, compressed_file_path + split_file_footer) zip_file(split_file_path, compressed_subfile_path)
compressed_subfile_path = compressed_name(compressed_file_path + split_file_footer)
# check the size of the subfile # check the size of the subfile
binary_len = len(file_to_dna.convert_file_to_bits(compressed_subfile_path)) binary_len = len(file_to_dna.convert_file_to_bits(compressed_subfile_path))
...@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# if the sub file is recreated, the new size will overwrite this one # if the sub file is recreated, the new size will overwrite this one
def merge_short_files(files_compressed_size: dict): def merge_short_files(files_compressed_size: dict):
"""
each molecule can store a maximum fixed number of bits
since it's more efficient to have a few long molecule than a lot of short molecule,
small files can be merged in one same molecule
"""
#sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest #sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest
files_compressed_size_bis = {} #files_compressed_size_bis = {}
merged_files_paths = [] #merged_files_paths = []
new_merge = False # true if at least one new merge has been made new_merge = False # set to true if at least one new merge has been made
for i, filename in enumerate(list(files_compressed_size.keys())[:-1]): for i, filename in enumerate(list(files_compressed_size.keys())[:-1]):
unmerged = True # set to false if a merge is made file_compressed_size = files_compressed_size[filename] # get size of the compressed file
file_compressed_size = files_compressed_size[filename]
if file_compressed_size >= max_binary_length: # impossible to merge because too large if file_compressed_size is None or file_compressed_size >= max_binary_length: # impossible to merge because too large, or has already be used in a merge (set to None)
continue continue # skip this file
for filename_2 in list(files_compressed_size.keys())[i+1:]: for filename_2 in list(files_compressed_size.keys())[i+1:]:
file_compressed_size_2 = files_compressed_size[filename_2] file_compressed_size_2 = files_compressed_size[filename_2]
if file_compressed_size + file_compressed_size_2 <= max_binary_length: if file_compressed_size_2 is None or file_compressed_size + file_compressed_size_2 <= max_binary_length:
# the sum of the 2 compressed files is lower than what can be stored,
# so the original files will be merged and recompressed,
# the compression of a merging is supposed to be smaller than the sum of compressions of each file
# merge the 2 files # get the binary content of each file
with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file: with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file:
bytes_content = b"".join(input_file.readlines()) bytes_content = b"".join(input_file.readlines())
with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file: with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file:
...@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# remove the "merged_" from the name of already merged files for visibility # remove the "merged_" from the name of already merged files for visibility
merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","") merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","")
merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name) merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name)
with open(merged_file_path, "wb") as f: # write the bytes content compressed_merged_file_path = get_compressed_name(os.path.join(compressed_dir_path, merged_file_name))
print("new merge :",filename,"and",filename_2)
with open(merged_file_path, "wb") as f: # write the sum of bytes content
f.write(bytes_content + bytes_content_2) f.write(bytes_content + bytes_content_2)
compressed_merged_file_path = os.path.join(compressed_dir_path, merged_file_name) # compress the merged file created
# add the merged file path to the bis dict with it's compressed size
zip_file(merged_file_path, compressed_merged_file_path) zip_file(merged_file_path, compressed_merged_file_path)
merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_name(compressed_merged_file_path))) # test its size just in case, but it should fit in a molecule
merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_merged_file_path))
if merged_binary_len >= max_binary_length: if merged_binary_len >= max_binary_length:
print("error merging result too large", compressed_merged_file_path) print("error merging result too large", compressed_merged_file_path)
...@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str) ...@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# add the merged file to the dict because it can still be used for other merging if it's short enough # add the merged file to the dict because it can still be used for other merging if it's short enough
files_compressed_size[merged_file_name] = merged_binary_len files_compressed_size[merged_file_name] = merged_binary_len
# remove the 2 compressed files of the 2 files # remove the 2 old compressed files of the 2 files
os.remove(os.path.join(compressed_dir_path, compressed_name(filename))) os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename)))
os.remove(os.path.join(compressed_dir_path, compressed_name(filename_2))) os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename_2)))
# set the compressed size of the 2 files to None to avoid them to be reused for merging
files_compressed_size[filename] = None
files_compressed_size[filename_2] = None
# set the compressed size of the 2 files to a too high number to avoid them to be reused for merging new_merge = True # keep in memory that at least one new merge has been made in this loop
files_compressed_size[filename] = 2*max_binary_length
files_compressed_size[filename_2] = 2*max_binary_length
new_merge = True break # leave the second loop, but others merges can still be done in the continuation of the first loop
break
# continue to try to create other merging if at least one merge has been made # continue to try to create other merging if at least one merge has been made
# otherwise, the loop can end since it no longer find possible merges
if new_merge: if new_merge:
print("continue merging...") print("continue merging...")
print(files_compressed_size)
merge_short_files(files_compressed_size) merge_short_files(files_compressed_size)
# otherwise, the loop can end since it no longer find possible merges
print(files_compressed_size) print(files_compressed_size)
merge_short_files(files_compressed_size) merge_short_files(files_compressed_size)
...@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path): ...@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path):
# checking if it is a file # checking if it is a file
if os.path.isfile(file_path): if os.path.isfile(file_path):
output_file_path = os.path.join(payload_fragments_dir_path, uncompressed_name(filename)) output_file_path = os.path.join(payload_fragments_dir_path, get_uncompressed_name(filename))
dna_sequence = file_to_dna.encode_file(file_path, output_file_path) # convert binaries into a dna sequence and save result in the output file dna_sequence = file_to_dna.encode_file(file_path, output_file_path) # convert binaries into a dna sequence and save result in the output file
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment