From cf32f11339c740db15653cd8210973633b567dd0 Mon Sep 17 00:00:00 2001
From: oboulle <olivier.boulle@inria.fr>
Date: Tue, 18 Jun 2024 17:50:55 +0200
Subject: [PATCH] update of primer gen

---
 primers_generation_V2.py | 159 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 primers_generation_V2.py

diff --git a/primers_generation_V2.py b/primers_generation_V2.py
new file mode 100644
index 0000000..9cf7167
--- /dev/null
+++ b/primers_generation_V2.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python3
+
+import os
+import sys
+import inspect
+import subprocess
+import random
+
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+sys.path.insert(0, os.path.dirname(currentdir)+"/source_encoding")
+import sequence_control
+
+import dna_file_reader as dfr
+
+
+"""
+manage primers generation and selection,
+
+use the primer_generator script
+"""
+
+def generate_compatible_primers(primer_number: int, output_primers_path=None):
+    """
+    use primer_generator script to generate a lot of primers that doesn't hybridize with the given sequence
+    """
+    
+    #___generate a lot of primers with good properties___#
+    
+    primer_generator_dir = currentdir+"/dna-storage-primer-tools"
+    primers_gen_script = primer_generator_dir+"/bin/DSPgen" # path to the script
+    primers_param_path = primer_generator_dir+"/data/param_dnarxiv_V2.txt" # file containing the parameters for the primer_generator scripts
+    if output_primers_path is None:
+        output_primers_path = primer_generator_dir+"/data/temp/primers.fasta" # output path for generated primers
+
+    scm_option = "1" # salt/magnesium correc7on method: 0 = Santalucia | 1 = Owczarzy | 2 = IDT
+    SEED = "0"
+    #TODO enable dtg ?
+
+    # generate a file containing a lot of primers fullfilling the general conditions set in param_dnarxiv.txt
+    primers_gen_command = primers_gen_script+' '+str(primer_number)+' '+primers_param_path+' '+output_primers_path+' -scm '+scm_option+' -seed '+SEED+' -dtg 0'
+    print(primers_gen_command)
+    subprocess.call('/bin/bash -c "$PRIMERGEN"', shell=True, env={'PRIMERGEN': primers_gen_command})
+    
+
+
+def filter_primers_with_oligos(oligo_path, input_primers_path=None, output_filtered_primers_path=None):
+    """
+    select primers without hybridation with the encoded data
+    """
+    
+    primer_generator_dir = currentdir+"/dna-storage-primer-tools"
+    primers_filter_script = primer_generator_dir+"/bin/DSPhyb" # path to the script
+    primers_param_path = primer_generator_dir+"/data/param_dnarxiv_V2.txt" # file containing the parameters for the primer_generator scripts
+    hybridation_results = primer_generator_dir+"/data/temp/hybridations.txt"
+    if input_primers_path is None:
+        input_primers_path = primer_generator_dir+"/data/temp/primers.fasta" # input path for generated primers from generate_compatible_primers()
+
+    if output_filtered_primers_path is None:
+        output_filtered_primers_path = primer_generator_dir+"/data/temp/compatible_primers.fasta" # output path for filtered primers that doesn't hybridize with the assembly
+
+    # filter the primers that hybridize with the whole sequence of encoded data assembled with the overhangs
+    filter_primers_command = primers_filter_script+' '+primers_param_path+' '+input_primers_path+' '+oligo_path+' '+hybridation_results
+
+    subprocess.call('/bin/bash -c "$PRIMERFILTER"', shell=True, env={'PRIMERFILTER': filter_primers_command})
+    #print() # needed cause the printer go at the start of the last printed line
+
+    # get the list of hybrided primers from the output file
+
+    with open(hybridation_results, 'r') as file:
+        list_of_rejected_primers = []
+        line = file.readline()
+        while line != "":
+            rejected_primer = int(line.split(" ")[0].replace("Primer_", ""))
+            if len(list_of_rejected_primers) == 0:
+                list_of_rejected_primers.append(rejected_primer)
+            elif list_of_rejected_primers[-1] != rejected_primer:
+                list_of_rejected_primers.append(rejected_primer)
+            line = file.readline()
+
+    print(list_of_rejected_primers)
+
+    primers_dict = dfr.read_fasta(input_primers_path)
+    keys_list = list(primers_dict.keys())
+
+    # delete the entries for rejected primers
+    for primer_num in list_of_rejected_primers:
+        del primers_dict[keys_list[primer_num]]
+
+    checked_primers_dict = dict((name, primer) for name, primer in primers_dict.items() if sequence_control.sequence_check(primer)) # check for potential constraints in primers
+
+    #random.shuffle(checked_primers_list)
+    
+    dfr.save_dict_to_fasta(checked_primers_dict, output_filtered_primers_path)
+        
+
+def test_hybridation(primer: str, selected_primers_list: list, max_hybridisation_value=4) -> bool:
+    """
+    test the hybridisation of the primer with a list of other primers
+    """
+    primer_rc = dfr.reverse_complement(primer) # get the reverse complement of the primer
+
+    for selected_primer in selected_primers_list:
+
+        for i in range(len(primer)-max_hybridisation_value):
+            # test if any part of the primer is contained in another primer from the list (= hybridisation)
+            if primer[i:i+max_hybridisation_value+1] in selected_primer:
+                return False
+            # test also for the reverse complement
+            if primer_rc[i:i+max_hybridisation_value+1] in selected_primer:
+                return False
+    # return true if no hybridisation at all
+    return True
+
+
+def compare_GC(primer_A: str, primer_B: str) -> bool:
+    """
+    primers have a special temperature in PCR depending on %GC, a couple (start;stop) needs to have a close tmp, so the same %GC is better
+    return true if the %GC is the same for the 2 primers
+    """
+    start_GC = (primer_A.count("C") + primer_A.count("G"))/(primer_A.count("A")+primer_A.count("T")) # get the %GC of the first primer
+    stop_GC = (primer_B.count("C") + primer_B.count("G"))/(primer_B.count("A")+primer_B.count("T")) # get the %GC of the second primer
+    return start_GC == stop_GC
+
+
+def select_primers(primers_list: list, number_assembly: int) -> list:
+    """
+    select a list of primers (2 primers for each assembly)
+    primers needs to not hybridize with each other
+    """
+    selected_primers_list = []
+       
+    for index_primer, potential_start_primer in enumerate(primers_list):
+        # the primer must not hybridise with any previous selected primers
+        if test_hybridation(potential_start_primer, selected_primers_list, 4):
+                        
+            for potential_stop_primer in primers_list[index_primer+1:]: # search a corresponding stop primer in the rest of list
+                # the 2 primers needs the same %GC, and a lower risk of hybridation
+                if compare_GC(potential_stop_primer, potential_start_primer) and test_hybridation(potential_stop_primer, [potential_start_primer]):
+                    # the primer must not hybridise with any previous selected primers
+                    if test_hybridation(potential_stop_primer, selected_primers_list, 4):
+                        
+                        selected_primers_list += [potential_start_primer, potential_stop_primer] # add the couple to the list
+                        print(str(len(selected_primers_list)//2)+"/"+str(number_assembly))   
+                        
+                        # if all required primers have been selected
+                        if len(selected_primers_list) == number_assembly*2:
+                            #print(selected_primers_list)
+                            return selected_primers_list # end the script with the result
+                        
+                        # the corresponding stop primer has been found
+                        break # return to start primers search
+    
+    # not enough couple of primers found
+    print("error in fragment design : not enough correct primers couple found, found",str(len(selected_primers_list)),"but wanted",str(number_assembly*2))
+    print("can be solved by using a longer random sequence in primers_generation to have a longer primer list ")
+    
+    return selected_primers_list
+
+
-- 
GitLab