commentaires, comptage des sequences non associées

91431ae1 · BOULLE Olivier · 03d53cb2 · 91431ae1
Commit 91431ae1 authored 4 years ago by BOULLE Olivier
--- a/demultiplexing/demultiplexing.py
+++ b/demultiplexing/demultiplexing.py
@@ -24,7 +24,7 @@ class Fastq_seq:
        self.value = list[1]
        self.description = list[2]
        self.score = list[3]
-        self.primers_points = {}
+        self.primers_points = {} # 1 point is added for a primer for each kmer the sequence contains

    def add_point_primer(self, primer_name):
        if primer_name in self.primers_points:
@@ -46,33 +46,38 @@ for file in os.listdir(primers_dir_path):
    primers_dict.update(dfr.read_fasta(primers_dir_path + "/" + file))

 sorted_seq_by_primers = {} # dictionary containing the fastq_sequences sorted by the corresponding primers
-fastq_seq_list = []
-kmer_size = 5 # size of the sub_sequences of the primers to search in the fastq_sequences
-point_threshold = 5 # threshold of the minimum number of kmer found in the fastq_sequences to link it to a primer
+kmer_size = 10 # size of the sub_sequences of the primers to search in the fastq_sequences
+point_threshold = 10 # threshold of the minimum number of kmer found in the fastq_sequences to link it to a primer
+unlinked_sequences_counter = 0

 for fq in dfr.read_fastq(input_path):
    fastq_seq = Fastq_seq(fq) # create the fastq object
-    fastq_seq_list.append(fastq_seq)

    for primer_name, primer_value in primers_dict.items():
        primer_size = len(primer_value)
        if "left" in primer_name:
-            # look at the beginning of the fastq_sequence
+            # look at the beginning of the fastq_sequence for left_part primers
            fastq_part = fastq_seq.value[:primer_size + 5]
            primer_name = primer_name.replace("left_","")
        else:
-            # look at the end of the fastq_sequence
+            # look at the end of the fastq_sequence for right_part primers
            fastq_part = fastq_seq.value[-primer_size - 5:]
            primer_name = primer_name.replace("right_","")
-
-        kmer_list = [] # kmers contained in this primer
+        # search if the subsequences of the primer are in the sequence
        for k in range(primer_size-kmer_size+1):
            kmer = primer_value[k:k+kmer_size]
            if kmer in fastq_part:
                fastq_seq.add_point_primer(primer_name)
+    # if no kmers has been found, the sequence cannot be linked to a primer
+    if fastq_seq.primers_points == {}:
+        unlinked_sequences_counter += 1
+        continue

+    # primer with the highest number of kmers found in the sequence
    primer_with_most_points = max(fastq_seq.primers_points, key=fastq_seq.primers_points.get)
-    if fastq_seq.primers_points[primer_with_most_points] > point_threshold:
+    if fastq_seq.primers_points[primer_with_most_points] < point_threshold:
+        unlinked_sequences_counter += 1
+    else:
        if primer_with_most_points in sorted_seq_by_primers:
            sorted_seq_by_primers[primer_with_most_points].append(fastq_seq)
        else:
@@ -84,8 +89,10 @@ except OSError:
    shutil.rmtree(output_dir_path)
    os.mkdir(output_dir_path)

+# write the fastq files for the linked sequences
 for primer_name, fastq_sequences_list in sorted_seq_by_primers.items():
    sequences_output = open(output_dir_path+"/"+primer_name+".fastq", "w+")
+    print(primer_name, ":",len(fastq_sequences_list), "lectures associées")
    for fastq_sequence in fastq_sequences_list:
        sequences_output.write(fastq_sequence.name+"\n")
        sequences_output.write(fastq_sequence.value+"\n")
@@ -93,4 +100,5 @@ for primer_name, fastq_sequences_list in sorted_seq_by_primers.items():
        sequences_output.write(fastq_sequence.score+"\n")
    sequences_output.close()

+print(unlinked_sequences_counter,"lectures non associées")