diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index 1510a0f0fc8a6f9587fac9f050918849f38b00e4..b2a9b726b2d50abe6149cfe9916091d1164e8942 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -206,7 +206,7 @@ def size_dna_from_bit_len_abaab(bit_length): return base_length + 2 if rest == 6: return base_length + 4 - print("error size_of_dna_from_bit_len : size not multiple of 2 :",str(bit_length)) + print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length)) exit(1) diff --git a/file_to_dna.py b/file_to_dna.py index b6a041d07bddc75a4b1ff613e758967d98dfd574..baab5a37094805ea64a40bb2af1e09bc1d002248 100755 --- a/file_to_dna.py +++ b/file_to_dna.py @@ -13,6 +13,7 @@ import dna_file_reader as dfr """ Can be used to convert any type of file use the byte encoding of the file to convert to a dna sequence +the file need to have a round number of octets a single substitution error in the sequences can lead to a corrupted file (except for .txt files) """ @@ -30,7 +31,6 @@ intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-pa max_total_block_size = 300 # maximum allowed size for a complete block min_total_block_size = 300 # minimum allowed size for a complete block -force_size_to_max = True # set to false if blocks can have any size between min and max, else force the size of all blocks to be the max size n_block_max = 10 # maximum assembled number of blocks @@ -113,6 +113,7 @@ def encode_file(input_path: str, output_path: str) -> None: """ binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content + #binary_string = input_path #TODO REMOVE dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna #print("payload size",str(dna_payload_size)) @@ -125,42 +126,31 @@ def encode_file(input_path: str, output_path: str) -> None: print("error file to dna : block number too high :",str(block_number),"for",input_path) exit(1) - # size of non payload dna part that is contained in the blocks - dna_non_payload_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size + # size of non payload dna part that is contained in the blocks and used for the assembly + dna_for_assembly_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size # estimate the total dna sequence length after addition of non coding parts - total_dna_size = dna_payload_size + dna_non_payload_size + total_dna_size = dna_payload_size + dna_for_assembly_size #print("total_dna_size",str(total_dna_size)) - if block_number == 1 and total_dna_size < min_total_block_size: - #TODO fill the block - print("error file to dna : 1 block too short", str(total_dna_size)) - exit(1) - pass - - # add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case) + # possibility to add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case) add_base_at_end = False - - if force_size_to_max or total_dna_size % block_number > 0: # need to add non payload bits to adjust the size of blocks + # round the number of base per blocks, make sure it is above the minimal block size + final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size) - # all blocks must be the maximum size - if force_size_to_max: # find the next length the total sequence needs to be to have a division in blocks of maximum size - final_assembly_size = block_number * max_total_block_size - else: # all blocks still need to have the same round size - # find the next length the total sequence needs to be to have equal division in blocks - final_assembly_size = block_number * math.ceil(total_dna_size / block_number) - #print("final_assembly_size",str(final_assembly_size)) - - # calculate the number of bits to add to get a round number of equal length blocks - dna_payload_needed_size = final_assembly_size - dna_non_payload_size + # case when some bits need to be added to increase the number of bases + if final_assembly_size != total_dna_size: + + # calculate the number of bases to add to the payload to get a round number of equal length blocks + dna_payload_needed_size = final_assembly_size - dna_for_assembly_size #print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) ) + # get number of bits to add filler_length = bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE #print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits") # fill with '0' at the beginning of the binary # not the end because some zip can end with octets of 0, which makes difficult to remove only the non coding '0' binary_string = math.ceil(filler_length) * "0" + binary_string - #print("updated binary size", str(len(binary_string))) # rare case where adding 0 can not solve the problem of round blocks @@ -184,7 +174,7 @@ def encode_file(input_path: str, output_path: str) -> None: if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one - total_sequence_size = len(sequence)+ dna_non_payload_size + total_sequence_size = len(sequence)+ dna_for_assembly_size # test for errors that should never occur (I hope ...) # round number of blocks, no blocks too large, no blocks to small @@ -193,10 +183,8 @@ def encode_file(input_path: str, output_path: str) -> None: print("\tseq payload size",str(len(sequence))) print("\ttotal estimated seq size",total_sequence_size) print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number)) - print("TODO remove me") - #TODO - return exit(1) + #print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number)) # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1 sub_sequences_dict = {} @@ -223,9 +211,11 @@ def encode_file(input_path: str, output_path: str) -> None: # add number and blocks size to path, add type output_path = output_path + "_" + str(block_number) + "x" + str(total_sequence_size//block_number) + ".fasta" + #return sequence #TODO REMOVE + dfr.save_dict_to_fasta(sub_sequences_dict, output_path) - + def decode_file(input_path: str, output_path: str) -> None: """ @@ -236,7 +226,8 @@ def decode_file(input_path: str, output_path: str) -> None: sub_sequences_dict = dfr.read_fasta(input_path) sequence = "".join(sub_sequences_dict.values()) - + #sequence = input_path + # convert the dna sequence into a binary string binary_from_dna_string = bdc.dna_to_binary_abaab(sequence) @@ -266,6 +257,8 @@ def decode_file(input_path: str, output_path: str) -> None: while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL binary_string = binary_string[8:] + #return binary_string #TODO REMOVE + # convert binaries into bytes n = int(binary_string, 2) bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big') @@ -281,18 +274,23 @@ if __name__ == '__main__': #binary_string = sys.argv[1] #seq = encode_file(doc_path) - print(get_max_binary_len()) - exit(0) - for i in range(0,300000): - """binary = str(bin(i))[2:] + #print(get_max_binary_len()) + #exit(0) - print(compute_check_sum(binary), calculate_check_sum2(binary), compute_check_sum(binary) == calculate_check_sum2(binary)) - if not compute_check_sum(binary) == calculate_check_sum2(binary): - exit(1)""" - #print("") + for i in range(400,10000, 8): + binary = i*"1" + if len(binary) % 2 != 0: + continue + print("i=",str(i)) - seq = encode_file("toto", i) - decode_file(seq, "toto") + seq = encode_file("", binary) + binary_result = decode_file(seq, "toto") + + if binary != binary_result: + #print(binary) + #print(seq) + #print(binary_result) + exit(0) #encode_file("", "test") #seq = binary_to_dna_abaab(binary_string) #print(seq)