Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 13ba168d authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

forbidden sequences

parent 7a467d36
No related branches found
No related tags found
No related merge requests found
......@@ -79,10 +79,22 @@ def check_loop(sequence: str, loop_size: int, window_size: int) -> int:
rev_compl_sub_sequence = dfr.reverse_complement(sub_sequence)
if rev_compl_sub_sequence in sequence[index+loop_size:index+loop_size+window_size]:
loop_nbr += 1
return loop_nbr
def check_forbidden_sequences(sequence: str) -> bool:
"""
check if some forbidden sequences are in the sequence
"""
forbidden_sequences = ["GGTCTC", "GAGACC"] # don't forget the reverse complement
seq_nbr = 0
for seq in forbidden_sequences:
if seq in sequence:
seq_nbr += 1
return seq_nbr
def sequence_check(sequence: str, window_size=60, verbose=False) -> bool:
"""
test if a the conditions for a correct sequence are met, return True if all 3 constraints are valid
......@@ -91,10 +103,12 @@ def sequence_check(sequence: str, window_size=60, verbose=False) -> bool:
if verbose: print("number of homopolymere larger than",3,":",h_nbr)
min_GC_percent, max_GC_percent = check_GC(sequence, window_size)
if verbose: print("GC percentage :",min_GC_percent,"% to",max_GC_percent,"%")
loop_nbr = check_loop(sequence, 6, window_size)
loop_nbr = check_loop(sequence, 11, window_size)
if verbose: print("number of potential loop :",loop_nbr)
forbidden_sequences_nbr = check_forbidden_sequences(sequence)
if verbose: print("number of forbidden_sequences :",forbidden_sequences_nbr)
if h_nbr == 0 and min_GC_percent >= 45 and max_GC_percent <= 55 and loop_nbr == 0:
if h_nbr == 0 and min_GC_percent >= 45 and max_GC_percent <= 55 and loop_nbr == 0 and forbidden_sequences_nbr == 0:
if verbose: print("sequence is correct")
return True
else:
......@@ -167,19 +181,20 @@ if __name__ == '__main__':
print("usage : sequence_control.py sequence_path")
sys.exit(1)
_, sequence = dfr.read_single_sequence_fasta(sys.argv[1])
# _, sequence = dfr.read_single_sequence_fasta(sys.argv[1])
find_hash_keys(sequence)
decode_sequence(sequence)
#find_hash_keys(sequence)
#decode_sequence(sequence)
#source_encoding.encode_to_indexed_fragments(sequence, 472, "doc_short_indexed_fragments.fasta")
"""
input_sequences = dfr.read_fasta(sys.argv[1])
for fragment in input_sequences.values():
sequence_check(fragment, verbose=True)
"""
for fragment_name, fragment in input_sequences.items():
print("control",fragment_name)
sequence_check(fragment, window_size=472, verbose=True)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment