Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 91431ae1 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

commentaires, comptage des sequences non associées

parent 03d53cb2
No related branches found
No related tags found
No related merge requests found
......@@ -24,7 +24,7 @@ class Fastq_seq:
self.value = list[1]
self.description = list[2]
self.score = list[3]
self.primers_points = {}
self.primers_points = {} # 1 point is added for a primer for each kmer the sequence contains
def add_point_primer(self, primer_name):
if primer_name in self.primers_points:
......@@ -46,33 +46,38 @@ for file in os.listdir(primers_dir_path):
primers_dict.update(dfr.read_fasta(primers_dir_path + "/" + file))
sorted_seq_by_primers = {} # dictionary containing the fastq_sequences sorted by the corresponding primers
fastq_seq_list = []
kmer_size = 5 # size of the sub_sequences of the primers to search in the fastq_sequences
point_threshold = 5 # threshold of the minimum number of kmer found in the fastq_sequences to link it to a primer
kmer_size = 10 # size of the sub_sequences of the primers to search in the fastq_sequences
point_threshold = 10 # threshold of the minimum number of kmer found in the fastq_sequences to link it to a primer
unlinked_sequences_counter = 0
for fq in dfr.read_fastq(input_path):
fastq_seq = Fastq_seq(fq) # create the fastq object
fastq_seq_list.append(fastq_seq)
for primer_name, primer_value in primers_dict.items():
primer_size = len(primer_value)
if "left" in primer_name:
# look at the beginning of the fastq_sequence
# look at the beginning of the fastq_sequence for left_part primers
fastq_part = fastq_seq.value[:primer_size + 5]
primer_name = primer_name.replace("left_","")
else:
# look at the end of the fastq_sequence
# look at the end of the fastq_sequence for right_part primers
fastq_part = fastq_seq.value[-primer_size - 5:]
primer_name = primer_name.replace("right_","")
kmer_list = [] # kmers contained in this primer
# search if the subsequences of the primer are in the sequence
for k in range(primer_size-kmer_size+1):
kmer = primer_value[k:k+kmer_size]
if kmer in fastq_part:
fastq_seq.add_point_primer(primer_name)
# if no kmers has been found, the sequence cannot be linked to a primer
if fastq_seq.primers_points == {}:
unlinked_sequences_counter += 1
continue
# primer with the highest number of kmers found in the sequence
primer_with_most_points = max(fastq_seq.primers_points, key=fastq_seq.primers_points.get)
if fastq_seq.primers_points[primer_with_most_points] > point_threshold:
if fastq_seq.primers_points[primer_with_most_points] < point_threshold:
unlinked_sequences_counter += 1
else:
if primer_with_most_points in sorted_seq_by_primers:
sorted_seq_by_primers[primer_with_most_points].append(fastq_seq)
else:
......@@ -84,8 +89,10 @@ except OSError:
shutil.rmtree(output_dir_path)
os.mkdir(output_dir_path)
# write the fastq files for the linked sequences
for primer_name, fastq_sequences_list in sorted_seq_by_primers.items():
sequences_output = open(output_dir_path+"/"+primer_name+".fastq", "w+")
print(primer_name, ":",len(fastq_sequences_list), "lectures associées")
for fastq_sequence in fastq_sequences_list:
sequences_output.write(fastq_sequence.name+"\n")
sequences_output.write(fastq_sequence.value+"\n")
......@@ -93,4 +100,5 @@ for primer_name, fastq_sequences_list in sorted_seq_by_primers.items():
sequences_output.write(fastq_sequence.score+"\n")
sequences_output.close()
print(unlinked_sequences_counter,"lectures non associées")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment