Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 5712f04e authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

refactoring, correct comments, greatly simplify binary division and size adjustments

parent d75d2e9f
No related branches found
No related tags found
No related merge requests found
...@@ -206,7 +206,7 @@ def size_dna_from_bit_len_abaab(bit_length): ...@@ -206,7 +206,7 @@ def size_dna_from_bit_len_abaab(bit_length):
return base_length + 2 return base_length + 2
if rest == 6: if rest == 6:
return base_length + 4 return base_length + 4
print("error size_of_dna_from_bit_len : size not multiple of 2 :",str(bit_length)) print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length))
exit(1) exit(1)
......
...@@ -13,6 +13,7 @@ import dna_file_reader as dfr ...@@ -13,6 +13,7 @@ import dna_file_reader as dfr
""" """
Can be used to convert any type of file Can be used to convert any type of file
use the byte encoding of the file to convert to a dna sequence use the byte encoding of the file to convert to a dna sequence
the file need to have a round number of octets
a single substitution error in the sequences can lead to a corrupted file (except for .txt files) a single substitution error in the sequences can lead to a corrupted file (except for .txt files)
""" """
...@@ -30,7 +31,6 @@ intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-pa ...@@ -30,7 +31,6 @@ intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-pa
max_total_block_size = 300 # maximum allowed size for a complete block max_total_block_size = 300 # maximum allowed size for a complete block
min_total_block_size = 300 # minimum allowed size for a complete block min_total_block_size = 300 # minimum allowed size for a complete block
force_size_to_max = True # set to false if blocks can have any size between min and max, else force the size of all blocks to be the max size
n_block_max = 10 # maximum assembled number of blocks n_block_max = 10 # maximum assembled number of blocks
...@@ -113,6 +113,7 @@ def encode_file(input_path: str, output_path: str) -> None: ...@@ -113,6 +113,7 @@ def encode_file(input_path: str, output_path: str) -> None:
""" """
binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
#binary_string = input_path #TODO REMOVE
dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
#print("payload size",str(dna_payload_size)) #print("payload size",str(dna_payload_size))
...@@ -125,42 +126,31 @@ def encode_file(input_path: str, output_path: str) -> None: ...@@ -125,42 +126,31 @@ def encode_file(input_path: str, output_path: str) -> None:
print("error file to dna : block number too high :",str(block_number),"for",input_path) print("error file to dna : block number too high :",str(block_number),"for",input_path)
exit(1) exit(1)
# size of non payload dna part that is contained in the blocks # size of non payload dna part that is contained in the blocks and used for the assembly
dna_non_payload_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size dna_for_assembly_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size
# estimate the total dna sequence length after addition of non coding parts # estimate the total dna sequence length after addition of non coding parts
total_dna_size = dna_payload_size + dna_non_payload_size total_dna_size = dna_payload_size + dna_for_assembly_size
#print("total_dna_size",str(total_dna_size)) #print("total_dna_size",str(total_dna_size))
if block_number == 1 and total_dna_size < min_total_block_size: # possibility to add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
#TODO fill the block
print("error file to dna : 1 block too short", str(total_dna_size))
exit(1)
pass
# add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
add_base_at_end = False add_base_at_end = False
# round the number of base per blocks, make sure it is above the minimal block size
if force_size_to_max or total_dna_size % block_number > 0: # need to add non payload bits to adjust the size of blocks final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size)
# all blocks must be the maximum size # case when some bits need to be added to increase the number of bases
if force_size_to_max: # find the next length the total sequence needs to be to have a division in blocks of maximum size if final_assembly_size != total_dna_size:
final_assembly_size = block_number * max_total_block_size
else: # all blocks still need to have the same round size # calculate the number of bases to add to the payload to get a round number of equal length blocks
# find the next length the total sequence needs to be to have equal division in blocks dna_payload_needed_size = final_assembly_size - dna_for_assembly_size
final_assembly_size = block_number * math.ceil(total_dna_size / block_number)
#print("final_assembly_size",str(final_assembly_size))
# calculate the number of bits to add to get a round number of equal length blocks
dna_payload_needed_size = final_assembly_size - dna_non_payload_size
#print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) ) #print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) )
# get number of bits to add
filler_length = bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE filler_length = bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE
#print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits") #print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits")
# fill with '0' at the beginning of the binary # not the end because some zip can end with octets of 0, which makes difficult to remove only the non coding '0' # fill with '0' at the beginning of the binary # not the end because some zip can end with octets of 0, which makes difficult to remove only the non coding '0'
binary_string = math.ceil(filler_length) * "0" + binary_string binary_string = math.ceil(filler_length) * "0" + binary_string
#print("updated binary size", str(len(binary_string))) #print("updated binary size", str(len(binary_string)))
# rare case where adding 0 can not solve the problem of round blocks # rare case where adding 0 can not solve the problem of round blocks
...@@ -184,7 +174,7 @@ def encode_file(input_path: str, output_path: str) -> None: ...@@ -184,7 +174,7 @@ def encode_file(input_path: str, output_path: str) -> None:
if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
total_sequence_size = len(sequence)+ dna_non_payload_size total_sequence_size = len(sequence)+ dna_for_assembly_size
# test for errors that should never occur (I hope ...) # test for errors that should never occur (I hope ...)
# round number of blocks, no blocks too large, no blocks to small # round number of blocks, no blocks too large, no blocks to small
...@@ -193,10 +183,8 @@ def encode_file(input_path: str, output_path: str) -> None: ...@@ -193,10 +183,8 @@ def encode_file(input_path: str, output_path: str) -> None:
print("\tseq payload size",str(len(sequence))) print("\tseq payload size",str(len(sequence)))
print("\ttotal estimated seq size",total_sequence_size) print("\ttotal estimated seq size",total_sequence_size)
print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number)) print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number))
print("TODO remove me")
#TODO
return
exit(1) exit(1)
#print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number))
# split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1 # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1
sub_sequences_dict = {} sub_sequences_dict = {}
...@@ -223,9 +211,11 @@ def encode_file(input_path: str, output_path: str) -> None: ...@@ -223,9 +211,11 @@ def encode_file(input_path: str, output_path: str) -> None:
# add number and blocks size to path, add type # add number and blocks size to path, add type
output_path = output_path + "_" + str(block_number) + "x" + str(total_sequence_size//block_number) + ".fasta" output_path = output_path + "_" + str(block_number) + "x" + str(total_sequence_size//block_number) + ".fasta"
#return sequence #TODO REMOVE
dfr.save_dict_to_fasta(sub_sequences_dict, output_path) dfr.save_dict_to_fasta(sub_sequences_dict, output_path)
def decode_file(input_path: str, output_path: str) -> None: def decode_file(input_path: str, output_path: str) -> None:
""" """
...@@ -236,7 +226,8 @@ def decode_file(input_path: str, output_path: str) -> None: ...@@ -236,7 +226,8 @@ def decode_file(input_path: str, output_path: str) -> None:
sub_sequences_dict = dfr.read_fasta(input_path) sub_sequences_dict = dfr.read_fasta(input_path)
sequence = "".join(sub_sequences_dict.values()) sequence = "".join(sub_sequences_dict.values())
#sequence = input_path
# convert the dna sequence into a binary string # convert the dna sequence into a binary string
binary_from_dna_string = bdc.dna_to_binary_abaab(sequence) binary_from_dna_string = bdc.dna_to_binary_abaab(sequence)
...@@ -266,6 +257,8 @@ def decode_file(input_path: str, output_path: str) -> None: ...@@ -266,6 +257,8 @@ def decode_file(input_path: str, output_path: str) -> None:
while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL
binary_string = binary_string[8:] binary_string = binary_string[8:]
#return binary_string #TODO REMOVE
# convert binaries into bytes # convert binaries into bytes
n = int(binary_string, 2) n = int(binary_string, 2)
bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big') bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big')
...@@ -281,18 +274,23 @@ if __name__ == '__main__': ...@@ -281,18 +274,23 @@ if __name__ == '__main__':
#binary_string = sys.argv[1] #binary_string = sys.argv[1]
#seq = encode_file(doc_path) #seq = encode_file(doc_path)
print(get_max_binary_len()) #print(get_max_binary_len())
exit(0) #exit(0)
for i in range(0,300000):
"""binary = str(bin(i))[2:]
print(compute_check_sum(binary), calculate_check_sum2(binary), compute_check_sum(binary) == calculate_check_sum2(binary)) for i in range(400,10000, 8):
if not compute_check_sum(binary) == calculate_check_sum2(binary): binary = i*"1"
exit(1)""" if len(binary) % 2 != 0:
#print("") continue
print("i=",str(i)) print("i=",str(i))
seq = encode_file("toto", i) seq = encode_file("", binary)
decode_file(seq, "toto") binary_result = decode_file(seq, "toto")
if binary != binary_result:
#print(binary)
#print(seq)
#print(binary_result)
exit(0)
#encode_file("", "test") #encode_file("", "test")
#seq = binary_to_dna_abaab(binary_string) #seq = binary_to_dna_abaab(binary_string)
#print(seq) #print(seq)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment