Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 058f9a87 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

decode test

parent 502af0ff
Branches
No related tags found
No related merge requests found
...@@ -2,10 +2,14 @@ import sys ...@@ -2,10 +2,14 @@ import sys
import dna_file_reader as dfr import dna_file_reader as dfr
import hashing import hashing
import source_encoding import source_encoding
from numpy import source import source_decoding
def check_homopolymere(sequence, max_h): def check_homopolymere(sequence, max_h):
"""
count the number of homopolymeres in the sequence
"""
h_nbr = 0 #number of homopolymere larger than h_max found h_nbr = 0 #number of homopolymere larger than h_max found
row_size = 0 #size of the current row row_size = 0 #size of the current row
last_nucleotide = "" #previous nucleotide in the sequence last_nucleotide = "" #previous nucleotide in the sequence
...@@ -54,6 +58,9 @@ def check_GC(sequence, window_size): ...@@ -54,6 +58,9 @@ def check_GC(sequence, window_size):
def check_loop(sequence, loop_size, window_size): def check_loop(sequence, loop_size, window_size):
"""
count the number of potential loops (reverse complement of a sub sequence in a local window)
"""
loop_nbr = 0 loop_nbr = 0
if len(sequence) < 2*loop_size: #sequence too short for any loop if len(sequence) < 2*loop_size: #sequence too short for any loop
return 0 return 0
...@@ -67,7 +74,10 @@ def check_loop(sequence, loop_size, window_size): ...@@ -67,7 +74,10 @@ def check_loop(sequence, loop_size, window_size):
return loop_nbr return loop_nbr
def sequence_check(sequence, window_size, verbose=False): def sequence_check(sequence, window_size=60, verbose=False):
"""
test if the conditions for a correct sequence are met
"""
h_nbr = check_homopolymere(sequence, 3) h_nbr = check_homopolymere(sequence, 3)
if verbose: print("number of homopolymere larger than",3,":",h_nbr) if verbose: print("number of homopolymere larger than",3,":",h_nbr)
min_GC_percent, max_GC_percent = check_GC(sequence, window_size) min_GC_percent, max_GC_percent = check_GC(sequence, window_size)
...@@ -75,7 +85,7 @@ def sequence_check(sequence, window_size, verbose=False): ...@@ -75,7 +85,7 @@ def sequence_check(sequence, window_size, verbose=False):
loop_nbr = check_loop(sequence, 6, window_size) loop_nbr = check_loop(sequence, 6, window_size)
if verbose: print("number of potential loop :",loop_nbr) if verbose: print("number of potential loop :",loop_nbr)
if h_nbr == 0 and min_GC_percent > 40 and max_GC_percent < 55 and loop_nbr == 0: if h_nbr == 0 and min_GC_percent >= 40 and max_GC_percent <= 55 and loop_nbr == 0:
if verbose: print("sequence is correct") if verbose: print("sequence is correct")
return True return True
else: else:
...@@ -84,11 +94,13 @@ def sequence_check(sequence, window_size, verbose=False): ...@@ -84,11 +94,13 @@ def sequence_check(sequence, window_size, verbose=False):
def hash_until_correct(sequence, start_key=0): def hash_until_correct(sequence, start_key=0):
"""
use a hash on the sequence until it meets the conditions, return the hash key
"""
hash_key = start_key hash_key = start_key
hash_filter = hashing.hash_string_to_formated_number(str(hash_key), len(sequence), 4) hash_filter = hashing.hash_string_to_formated_number(str(hash_key), len(sequence), 4)
hashed_sequence = source_encoding.apply_filter(sequence, hash_filter) hashed_sequence = source_encoding.apply_filter(sequence, hash_filter)
while not sequence_check(hashed_sequence, 60): while not sequence_check(hashed_sequence):
hash_key += 1 hash_key += 1
hash_filter = hashing.hash_string_to_formated_number(str(hash_key), len(sequence), 4) hash_filter = hashing.hash_string_to_formated_number(str(hash_key), len(sequence), 4)
hashed_sequence = source_encoding.apply_filter(sequence, hash_filter) hashed_sequence = source_encoding.apply_filter(sequence, hash_filter)
...@@ -108,6 +120,16 @@ def find_hash_keys(sequence): ...@@ -108,6 +120,16 @@ def find_hash_keys(sequence):
print(tot_hash) print(tot_hash)
sequence_check(tot_hash, 60, True) sequence_check(tot_hash, 60, True)
def decode_poeme(sequence):
    """
    decode a sequence made of consecutive 200-nucleotide fragments, each one
    filtered with its own hash key, and print the reassembled result
    """
    fragment_size = 200
    # one key per fragment, in the order the fragments appear in the sequence
    hash_keys = [2137, 4123, 1324, 833, 3]
    decoded_parts = []
    for index, key in enumerate(hash_keys):
        start = index * fragment_size
        fragment = sequence[start:start + fragment_size]
        # rebuild the filter from the key, then remove it from the fragment
        hash_filter = hashing.hash_string_to_formated_number(str(key), fragment_size, 4)
        decoded_parts.append(source_decoding.remove_filter(fragment, hash_filter))
    decoded_seq = "".join(decoded_parts)
    # NOTE(review): assumes the result is printed once, after all fragments are decoded - confirm
    print(decoded_seq)
# =================== main ======================= # # =================== main ======================= #
if __name__ == '__main__': if __name__ == '__main__':
...@@ -119,4 +141,5 @@ if __name__ == '__main__': ...@@ -119,4 +141,5 @@ if __name__ == '__main__':
_, sequence = dfr.read_single_sequence_fasta(sequence_path) _, sequence = dfr.read_single_sequence_fasta(sequence_path)
print(sequence_check(sequence, verbose=True))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment