diff --git a/pre_processing.py b/pre_processing.py
index 53c645273972aace9049e95ee43769e59351ac30..6e774e51b9088725f57d54314fb00b248fa46326 100755
--- a/pre_processing.py
+++ b/pre_processing.py
@@ -16,8 +16,10 @@ zipping all files
 splitting files that are too large for 1 assembly
 """
 
+compression_type = "gzip" # set the compression method to use
 
-def zip_file(file_path, output_path, compression_type="gzip"):
+
+def zip_file(file_path, output_path):
     """
     compress the file and write it at the output path
     """
@@ -39,7 +41,7 @@ def zip_file(file_path, output_path, compression_type="gzip"):
         exit(0)
 
 
-def unzip_file(file_path, output_path, compression_type="gzip"):
+def unzip_file(file_path, output_path):
     """
     uncompress the file and write it just where it is
     """
@@ -48,7 +50,7 @@ def unzip_file(file_path, output_path, compression_type="gzip"):
         decompression_command = "gzip -d "+ file_path
         subprocess.run('/bin/bash -c "$COMMAND"', shell=True, env={'COMMAND': decompression_command})
         # move the unzipped file to the defined output path
-        os.replace(get_uncompressed_name(file_path, "gzip"), output_path)
+        os.replace(get_uncompressed_name(file_path), output_path)
         return
 
     if compression_type == "cmix":
@@ -60,7 +62,7 @@ def unzip_file(file_path, output_path, compression_type="gzip"):
         exit(0)
 
 
-def get_compressed_name(file_path, compression_type="gzip"):
+def get_compressed_name(file_path):
 
     if compression_type == "gzip":
         return file_path+".gz"
@@ -72,7 +74,7 @@ def get_compressed_name(file_path, compression_type="gzip"):
         exit(0)
 
 
-def get_uncompressed_name(file_path, compression_type="gzip"):
+def get_uncompressed_name(file_path):
 
     if compression_type == "gzip":
         return file_path.replace(".gz", "")
@@ -227,56 +229,58 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
     for i, filename in enumerate(list(files_compressed_size.keys())[:-1]):
         file_compressed_size = files_compressed_size[filename] # get size of the compressed file
-        if file_compressed_size is None or file_compressed_size >= max_binary_length: # impossible to merge because too large, or has already be used in a merge (set to None)
+        if file_compressed_size is None or file_compressed_size > max_binary_length: # impossible to merge because too large, or has already been used in a merge (set to None)
             continue # skip this file
 
         for filename_2 in list(files_compressed_size.keys())[i+1:]:
             file_compressed_size_2 = files_compressed_size[filename_2]
-            if file_compressed_size_2 is None or file_compressed_size + file_compressed_size_2 <= max_binary_length:
-                # the sum of the 2 compressed files is lower than what can be stored,
-                # so the original files will be merged and recompressed,
-                # the compression of a merging is supposed to be smaller than the sum of compressions of each file
-
-                # get the binary content of each file
-                with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file:
-                    bytes_content = b"".join(input_file.readlines())
-                with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file:
-                    bytes_content_2 = b"".join(input_file.readlines())
-
-                # remove the "merged_" from the name of already merged files for visibility
-                merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","")
-                merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name)
-                compressed_merged_file_path = get_compressed_name(os.path.join(compressed_dir_path, merged_file_name))
-                print("new merge :",filename,"and",filename_2)
-
-                with open(merged_file_path, "wb") as f: # write the sum of bytes content
"wb") as f: # write the sum of bytes content - f.write(bytes_content + bytes_content_2) - - # compress the merged file created - zip_file(merged_file_path, compressed_merged_file_path) - - # test its size just in case, but it should fit in a molecule - merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_merged_file_path)) - - if merged_binary_len >= max_binary_length: - print("error merging result too large", compressed_merged_file_path) - exit(0) - - # add the merged file to the dict because it can still be used for other merging if it's short enough - files_compressed_size[merged_file_name] = merged_binary_len - - # remove the 2 old compressed files of the 2 files - os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename))) - os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename_2))) - - # set the compressed size of the 2 files to None to avoid them to be reused for merging - files_compressed_size[filename] = None - files_compressed_size[filename_2] = None - - new_merge = True # keep in memory that at least one new merge has been made in this loop + if file_compressed_size_2 is None or file_compressed_size + file_compressed_size_2 > max_binary_length: + continue # skip this file + + # the sum of the 2 compressed files is lower than what can be stored, + # so the original files will be merged and recompressed, + # the compression of a merging is supposed to be smaller than the sum of compressions of each file + + # get the binary content of each file + with open(os.path.join(rearanged_files_dir_path, filename), "rb") as input_file: + bytes_content = b"".join(input_file.readlines()) + with open(os.path.join(rearanged_files_dir_path, filename_2), "rb") as input_file: + bytes_content_2 = b"".join(input_file.readlines()) - break # leave the second loop, but others merges can still be done in the continuation of the first loop + # remove the "merged_" from the name of already merged files for visibility + merged_file_name = "merged_" + filename.replace("merged_","") + "+" + filename_2.replace("merged_","") + merged_file_path = os.path.join(rearanged_files_dir_path, merged_file_name) + compressed_merged_file_path = get_compressed_name(os.path.join(compressed_dir_path, merged_file_name)) + print("new merge :",filename,"and",filename_2) + + with open(merged_file_path, "wb") as f: # write the sum of bytes content + f.write(bytes_content + bytes_content_2) + + # compress the merged file created + zip_file(merged_file_path, compressed_merged_file_path) + + # test its size just in case, but it should fit in a molecule + merged_binary_len = len(file_to_dna.convert_file_to_bits(compressed_merged_file_path)) + + if merged_binary_len >= max_binary_length: + print("error merging result too large", compressed_merged_file_path) + exit(0) + + # add the merged file to the dict because it can still be used for other merging if it's short enough + files_compressed_size[merged_file_name] = merged_binary_len + + # remove the 2 old compressed files of the 2 files + os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename))) + os.remove(os.path.join(compressed_dir_path, get_compressed_name(filename_2))) + + # set the compressed size of the 2 files to None to avoid them to be reused for merging + files_compressed_size[filename] = None + files_compressed_size[filename_2] = None + + new_merge = True # keep in memory that at least one new merge has been made in this loop + + break # leave the second loop, but others merges can still be done in the 
 
     # continue to try to create other merging if at least one merge has been made
     if new_merge:
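
Note on the rewritten hunk above: the nested if becomes an early-continue guard, and the loop implements a greedy pairwise merge. Any two compressed files whose combined size still fits in one assembly are concatenated, recompressed, and retired from the pool (their sizes set to None), and the whole scan repeats as long as at least one merge succeeded. Below is a minimal, self-contained sketch of that control flow, not the project's code: greedy_merge, sizes and cap are hypothetical names (cap stands in for max_binary_length), and the sum of the two compressed sizes is used as a pessimistic stand-in for actually recompressing the concatenation.

def greedy_merge(sizes: dict, cap: int) -> dict:
    """sizes maps a filename to its compressed size in bits, or None once consumed."""
    new_merge = True
    while new_merge:
        new_merge = False
        for i, name in enumerate(list(sizes.keys())[:-1]):
            size = sizes[name]
            if size is None or size > cap:
                continue  # already consumed by a merge, or too large on its own
            for name_2 in list(sizes.keys())[i+1:]:
                size_2 = sizes[name_2]
                if size_2 is None or size + size_2 > cap:
                    continue  # this pair does not fit under the cap
                # name merged files the same way the patch does
                merged_name = "merged_" + name.replace("merged_", "") + "+" + name_2.replace("merged_", "")
                sizes[merged_name] = size + size_2  # pessimistic stand-in for recompression
                sizes[name] = sizes[name_2] = None  # mark both inputs as consumed
                new_merge = True
                break  # leave the inner loop; the outer loop keeps scanning
    return sizes

print(greedy_merge({"a": 40, "b": 30, "c": 50, "d": 90}, cap=100))
# -> {'a': None, 'b': None, 'c': 50, 'd': 90, 'merged_a+b': 70}

The sketch uses > rather than >= in both guards, matching the patch's change to the outer guard: a file whose compressed size exactly equals the cap still fits in one assembly. Note that the patch's post-merge size check still uses merged_binary_len >= max_binary_length, so a merge that lands exactly on the cap is rejected there.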