diff --git a/file_to_dna.py b/file_to_dna.py index baab5a37094805ea64a40bb2af1e09bc1d002248..30cc83496202dea4b42b6edee23a07c3f7f6cc6e 100755 --- a/file_to_dna.py +++ b/file_to_dna.py @@ -112,9 +112,13 @@ def encode_file(input_path: str, output_path: str) -> None: objective is to fit the payload in the lowest number of blocks, but all blocks must be the same size after addition of non coding stuff (primer/buffer/overhang/bsaI) """ + #TODO binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content #binary_string = input_path #TODO REMOVE + if (len(binary_string) + CHECK_SUM_SIZE) % 8 in [1, 4, 6]: # add a non coding 0 when size of incompatible length for dna conversion + binary_string = "0" + binary_string + dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna #print("payload size",str(dna_payload_size)) @@ -132,14 +136,11 @@ def encode_file(input_path: str, output_path: str) -> None: # estimate the total dna sequence length after addition of non coding parts total_dna_size = dna_payload_size + dna_for_assembly_size #print("total_dna_size",str(total_dna_size)) - - # possibility to add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case) - add_base_at_end = False - + # round the number of base per blocks, make sure it is above the minimal block size final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size) - # case when some bits need to be added to increase the number of bases + # case when some bits needs to be added to increase the number of bases if final_assembly_size != total_dna_size: # calculate the number of bases to add to the payload to get a round number of equal length blocks @@ -153,12 +154,6 @@ def encode_file(input_path: str, output_path: str) -> None: binary_string = math.ceil(filler_length) * "0" + binary_string #print("updated binary size", str(len(binary_string))) - # rare case where adding 0 can not solve the problem of round blocks - if dna_payload_needed_size % 5 == 3: - # need a non coding base at the end - #print("added non coding base") - add_base_at_end = True - # apply a filter to the binary string -> shuffle the data to avoid long rows of 0 or 1, and avoid rows repetitions binary_string = binary_string[::-1] # reverse the binary string, because 2 files can have the same start with ziping methods @@ -172,7 +167,7 @@ def encode_file(input_path: str, output_path: str) -> None: # convert binaries into dna sequence sequence = bdc.binary_to_dna_abaab(filtered_binary_string) - if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one + #if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one total_sequence_size = len(sequence)+ dna_for_assembly_size @@ -189,7 +184,6 @@ def encode_file(input_path: str, output_path: str) -> None: # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1 sub_sequences_dict = {} - if block_number == 1: sub_sequences_dict["1"] = sequence elif block_number == 2: @@ -226,7 +220,7 @@ def decode_file(input_path: str, output_path: str) -> None: sub_sequences_dict = dfr.read_fasta(input_path) sequence = "".join(sub_sequences_dict.values()) - #sequence = input_path + #sequence = input_path #TODO REMOVE # convert the dna sequence into a binary string binary_from_dna_string = bdc.dna_to_binary_abaab(sequence) @@ -283,8 +277,8 @@ if __name__ == '__main__': continue print("i=",str(i)) - seq = encode_file("", binary) - binary_result = decode_file(seq, "toto") + seq = encode_file(binary, "") + binary_result = decode_file(seq, "") if binary != binary_result: #print(binary)