From cf32f11339c740db15653cd8210973633b567dd0 Mon Sep 17 00:00:00 2001 From: oboulle <olivier.boulle@inria.fr> Date: Tue, 18 Jun 2024 17:50:55 +0200 Subject: [PATCH] update of primer gen --- primers_generation_V2.py | 159 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 primers_generation_V2.py diff --git a/primers_generation_V2.py b/primers_generation_V2.py new file mode 100644 index 0000000..9cf7167 --- /dev/null +++ b/primers_generation_V2.py @@ -0,0 +1,159 @@ +#!/usr/bin/python3 + +import os +import sys +import inspect +import subprocess +import random + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +sys.path.insert(0, os.path.dirname(currentdir)+"/source_encoding") +import sequence_control + +import dna_file_reader as dfr + + +""" +manage primers generation and selection, + +use the primer_generator script +""" + +def generate_compatible_primers(primer_number: int, output_primers_path=None): + """ + use primer_generator script to generate a lot of primers that doesn't hybridize with the given sequence + """ + + #___generate a lot of primers with good properties___# + + primer_generator_dir = currentdir+"/dna-storage-primer-tools" + primers_gen_script = primer_generator_dir+"/bin/DSPgen" # path to the script + primers_param_path = primer_generator_dir+"/data/param_dnarxiv_V2.txt" # file containing the parameters for the primer_generator scripts + if output_primers_path is None: + output_primers_path = primer_generator_dir+"/data/temp/primers.fasta" # output path for generated primers + + scm_option = "1" # salt/magnesium correc7on method: 0 = Santalucia | 1 = Owczarzy | 2 = IDT + SEED = "0" + #TODO enable dtg ? + + # generate a file containing a lot of primers fullfilling the general conditions set in param_dnarxiv.txt + primers_gen_command = primers_gen_script+' '+str(primer_number)+' '+primers_param_path+' '+output_primers_path+' -scm '+scm_option+' -seed '+SEED+' -dtg 0' + print(primers_gen_command) + subprocess.call('/bin/bash -c "$PRIMERGEN"', shell=True, env={'PRIMERGEN': primers_gen_command}) + + + +def filter_primers_with_oligos(oligo_path, input_primers_path=None, output_filtered_primers_path=None): + """ + select primers without hybridation with the encoded data + """ + + primer_generator_dir = currentdir+"/dna-storage-primer-tools" + primers_filter_script = primer_generator_dir+"/bin/DSPhyb" # path to the script + primers_param_path = primer_generator_dir+"/data/param_dnarxiv_V2.txt" # file containing the parameters for the primer_generator scripts + hybridation_results = primer_generator_dir+"/data/temp/hybridations.txt" + if input_primers_path is None: + input_primers_path = primer_generator_dir+"/data/temp/primers.fasta" # input path for generated primers from generate_compatible_primers() + + if output_filtered_primers_path is None: + output_filtered_primers_path = primer_generator_dir+"/data/temp/compatible_primers.fasta" # output path for filtered primers that doesn't hybridize with the assembly + + # filter the primers that hybridize with the whole sequence of encoded data assembled with the overhangs + filter_primers_command = primers_filter_script+' '+primers_param_path+' '+input_primers_path+' '+oligo_path+' '+hybridation_results + + subprocess.call('/bin/bash -c "$PRIMERFILTER"', shell=True, env={'PRIMERFILTER': filter_primers_command}) + #print() # needed cause the printer go at the start of the last printed line + + # get the list of hybrided primers from the output file + + with open(hybridation_results, 'r') as file: + list_of_rejected_primers = [] + line = file.readline() + while line != "": + rejected_primer = int(line.split(" ")[0].replace("Primer_", "")) + if len(list_of_rejected_primers) == 0: + list_of_rejected_primers.append(rejected_primer) + elif list_of_rejected_primers[-1] != rejected_primer: + list_of_rejected_primers.append(rejected_primer) + line = file.readline() + + print(list_of_rejected_primers) + + primers_dict = dfr.read_fasta(input_primers_path) + keys_list = list(primers_dict.keys()) + + # delete the entries for rejected primers + for primer_num in list_of_rejected_primers: + del primers_dict[keys_list[primer_num]] + + checked_primers_dict = dict((name, primer) for name, primer in primers_dict.items() if sequence_control.sequence_check(primer)) # check for potential constraints in primers + + #random.shuffle(checked_primers_list) + + dfr.save_dict_to_fasta(checked_primers_dict, output_filtered_primers_path) + + +def test_hybridation(primer: str, selected_primers_list: list, max_hybridisation_value=4) -> bool: + """ + test the hybridisation of the primer with a list of other primers + """ + primer_rc = dfr.reverse_complement(primer) # get the reverse complement of the primer + + for selected_primer in selected_primers_list: + + for i in range(len(primer)-max_hybridisation_value): + # test if any part of the primer is contained in another primer from the list (= hybridisation) + if primer[i:i+max_hybridisation_value+1] in selected_primer: + return False + # test also for the reverse complement + if primer_rc[i:i+max_hybridisation_value+1] in selected_primer: + return False + # return true if no hybridisation at all + return True + + +def compare_GC(primer_A: str, primer_B: str) -> bool: + """ + primers have a special temperature in PCR depending on %GC, a couple (start;stop) needs to have a close tmp, so the same %GC is better + return true if the %GC is the same for the 2 primers + """ + start_GC = (primer_A.count("C") + primer_A.count("G"))/(primer_A.count("A")+primer_A.count("T")) # get the %GC of the first primer + stop_GC = (primer_B.count("C") + primer_B.count("G"))/(primer_B.count("A")+primer_B.count("T")) # get the %GC of the second primer + return start_GC == stop_GC + + +def select_primers(primers_list: list, number_assembly: int) -> list: + """ + select a list of primers (2 primers for each assembly) + primers needs to not hybridize with each other + """ + selected_primers_list = [] + + for index_primer, potential_start_primer in enumerate(primers_list): + # the primer must not hybridise with any previous selected primers + if test_hybridation(potential_start_primer, selected_primers_list, 4): + + for potential_stop_primer in primers_list[index_primer+1:]: # search a corresponding stop primer in the rest of list + # the 2 primers needs the same %GC, and a lower risk of hybridation + if compare_GC(potential_stop_primer, potential_start_primer) and test_hybridation(potential_stop_primer, [potential_start_primer]): + # the primer must not hybridise with any previous selected primers + if test_hybridation(potential_stop_primer, selected_primers_list, 4): + + selected_primers_list += [potential_start_primer, potential_stop_primer] # add the couple to the list + print(str(len(selected_primers_list)//2)+"/"+str(number_assembly)) + + # if all required primers have been selected + if len(selected_primers_list) == number_assembly*2: + #print(selected_primers_list) + return selected_primers_list # end the script with the result + + # the corresponding stop primer has been found + break # return to start primers search + + # not enough couple of primers found + print("error in fragment design : not enough correct primers couple found, found",str(len(selected_primers_list)),"but wanted",str(number_assembly*2)) + print("can be solved by using a longer random sequence in primers_generation to have a longer primer list ") + + return selected_primers_list + + -- GitLab