Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 502af0ff authored by BOULLE Olivier
Browse files

find hash keys

parent 850aa966
No related branches found
No related tags found
No related merge requests found
import sys import sys
import dna_file_reader as dfr import dna_file_reader as dfr
import hashing
import source_encoding
from numpy import source
def check_homopolymere(sequence, max_h): def check_homopolymere(sequence, max_h):
...@@ -64,21 +67,47 @@ def check_loop(sequence, loop_size, window_size): ...@@ -64,21 +67,47 @@ def check_loop(sequence, loop_size, window_size):
return loop_nbr return loop_nbr
def sequence_check(sequence, window_size, verbose=False):
    """Check a sequence against the homopolymer, GC-content and loop constraints.

    Args:
        sequence: the sequence to test; forwarded unchanged to
            check_homopolymere, check_GC and check_loop.
        window_size: window size forwarded to check_GC and check_loop.
        verbose: when True, print every measurement and the final verdict;
            when False (default) the function is silent.

    Returns:
        True when check_homopolymere reports 0, the GC bounds returned by
        check_GC lie strictly between 40 and 55, and check_loop reports 0;
        False otherwise.
    """
    # 3 is the homopolymer length limit handed to the helper.
    h_nbr = check_homopolymere(sequence, 3)
    if verbose:
        print("number of homopolymere larger than",3,":",h_nbr)

    min_GC_percent, max_GC_percent = check_GC(sequence, window_size)
    if verbose:
        print("GC percentage :",min_GC_percent,"% to",max_GC_percent,"%")

    # 6 is the loop size handed to the helper.
    loop_nbr = check_loop(sequence, 6, window_size)
    if verbose:
        print("number of potential loop :",loop_nbr)

    if h_nbr == 0 and min_GC_percent > 40 and max_GC_percent < 55 and loop_nbr == 0:
        if verbose:
            print("sequence is correct")
        return True
    else:
        if verbose:
            print("sequence is not correct")
        return False
def hash_until_correct(sequence, start_key=0):
    """Search for the first hash key whose filter yields a valid sequence.

    Keys are tried in increasing order starting at ``start_key``. For each
    key, a filter is built with hashing.hash_string_to_formated_number and
    applied to ``sequence`` with source_encoding.apply_filter; the result is
    tested with sequence_check using a window size of 60.

    Args:
        sequence: the sequence to be filtered.
        start_key: first integer key to try (default 0).

    Returns:
        The first key, counting up from ``start_key``, for which the filtered
        sequence passes sequence_check.

    Note:
        There is no upper bound on the search: the loop only ends when a
        passing key is found.
    """
    candidate_key = start_key
    while True:
        key_filter = hashing.hash_string_to_formated_number(str(candidate_key), len(sequence), 4)
        filtered = source_encoding.apply_filter(sequence, key_filter)
        if sequence_check(filtered, 60):
            return candidate_key
        candidate_key += 1
def find_hash_keys(sequence):
    """Find and print a valid hash key for each 200-character chunk.

    The first five consecutive 200-character slices of ``sequence`` are
    processed independently: hash_until_correct finds a key for the slice,
    the matching filter is re-applied, and the filtered slices are
    concatenated. Each key is printed as it is found, then the concatenated
    result is printed and checked with sequence_check in verbose mode
    (window size 60).

    Args:
        sequence: the sequence to split and filter.

    Returns:
        None; results are only printed.
    """
    # NOTE(review): statement nesting was reconstructed from a source whose
    # indentation was lost; the per-key print is assumed to sit inside the loop.
    tot_hash = ""
    for i in range(5):
        sub_seq = sequence[200*i:200*(i+1)]
        hash_key = hash_until_correct(sub_seq)
        # Rebuild the filter with the same length hash_until_correct used
        # (len(sub_seq)), so the filter applied here is the one that was
        # validated, even when the slice is shorter than 200.
        hash_filter = hashing.hash_string_to_formated_number(str(hash_key), len(sub_seq), 4)
        hashed_sub_seq = source_encoding.apply_filter(sub_seq, hash_filter)
        tot_hash += hashed_sub_seq
        print(hash_key)
    print(tot_hash)
    sequence_check(tot_hash, 60, True)
# =================== main ======================= # # =================== main ======================= #
if __name__ == '__main__': if __name__ == '__main__':
...@@ -89,4 +118,5 @@ if __name__ == '__main__': ...@@ -89,4 +118,5 @@ if __name__ == '__main__':
sequence_path = sys.argv[1] sequence_path = sys.argv[1]
_, sequence = dfr.read_single_sequence_fasta(sequence_path) _, sequence = dfr.read_single_sequence_fasta(sequence_path)
sequence_check(sequence)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment