Commit cd9fb014 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-s/3054-avoir-plus-d-informations-sur-pear' into 'dev'

Resolve "Avoir plus d'informations sur PEAR"

Closes #3054

See merge request !194
parents 9ce7a54c a7042415
Pipeline #44269 passed with stages
in 6 minutes and 5 seconds
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# ===============================
# Script by Florian Thonier
# florian@vidjil.org
# ===============================
from __future__ import division
import operator
import sys, os
import json
from operator import itemgetter
from optparse import OptionParser #@UnusedWildImport
from _collections import defaultdict
# ===============================
VERSION = "0.02"
TIMESTAMP = "2018-04-27"
# ===============================
def get_value(line):
    """Return the value part of a 'key....: value' formatted PEAR log line."""
    parts = line.split(': ')
    return parts[1]
def convert_value(line):
    """Return the ['count', 'total'] strings from a reads summary line.

    Example: 'Assembled reads ...: 2,972 / 3,000 (99.067%)'
    yields ['2,972', '3,000'].
    """
    value = line.split(': ')[1]       # text after the 'key: value' separator
    counts = value.split("(")[0]      # drop the trailing '(xx.xxx%)' part
    return counts.replace(" ", "").split("/")
def pear_converter(fileIn, fileOut):
    """Parse a PEAR merging log line by line into a structured dict and
    export it as JSON.

    Parameters:
        fileIn  -- path of the PEAR log file to read
        fileOut -- path of the JSON file to write

    Returns:
        -1 if the JSON dump fails, None on success.

    Raises:
        KeyError / ZeroDivisionError if the log lacks the read-count
        summary lines needed for the percentage computations.
    """

    def _value(line):
        # Text after the 'key....: value' separator of a log line.
        return line.split(': ')[1]

    def _number(line, index):
        # Integer at position `index` ('count' or 'total') of a reads
        # summary line such as 'Assembled reads ...: 2,972 / 3,000 (99.067%)'.
        counts = _value(line).split("(")[0].replace(" ", "").split("/")
        return int(counts[index].replace(",", ""))

    json_data = defaultdict(dict)

    # Read the log with a context manager so the handle is always closed
    # (the previous version leaked the input file descriptor).
    with open(fileIn, "r") as log:
        for line in log:
            line = line.replace("\n", "")
            ### version, settings and parameters
            if "PEAR v" in line:
                json_data["settings"]["version"] = line
            elif "Forward reads file" in line:
                json_data["settings"]["forward_file"] = _value(line)
            elif "Reverse reads file" in line:
                json_data["settings"]["reverse_file"] = _value(line)
            elif "PHRED" in line:
                json_data["settings"]["phred"] = _value(line)
            elif "Scoring method" in line:
                json_data["settings"]["scoring_methode"] = _value(line)
            elif "Minimum overlap" in line:
                json_data["settings"]["minimum_overlap"] = _value(line)
            ### bases frequencies
            elif "A:" in line:
                json_data["base_frequency"]["base_frequency_a"] = _value(line)
            elif "C:" in line:
                json_data["base_frequency"]["base_frequency_c"] = _value(line)
            elif "G:" in line:
                json_data["base_frequency"]["base_frequency_g"] = _value(line)
            elif "T:" in line:
                json_data["base_frequency"]["base_frequency_t"] = _value(line)
            elif "uncalled bases" in line:
                json_data["base_frequency"]["uncalled_base"] = line.replace(" uncalled bases", "").replace(" ", "")
            ### output files
            # NOTE: the '... file' tests must stay BEFORE the read-count
            # tests below, since e.g. 'Assembled reads file' also contains
            # the substring 'Assembled reads'.
            elif "Assembled reads file" in line:
                json_data["output_file"]["assembled_reads"] = _value(line)
            elif "Discarded reads file" in line:
                json_data["output_file"]["discarded_reads"] = _value(line)
            elif "Unassembled forward reads file" in line:
                json_data["output_file"]["unassembled_forward"] = _value(line)
            elif "Unassembled reverse reads file" in line:
                json_data["output_file"]["unassembled_reverse"] = _value(line)
            ### number of reads
            elif "Assembled reads" in line:
                json_data["reads"]["reads_assembled_number"] = _number(line, 0)
                json_data["reads"]["reads_total_number"] = _number(line, 1)
            elif "Discarded reads" in line:
                json_data["reads"]["reads_discarded_number"] = _number(line, 0)
            elif "Not assembled reads" in line:
                json_data["reads"]["reads_not_assembled_number"] = _number(line, 0)

    ### Warnings
    json_data["warning"] = []
    reads = json_data["reads"]
    total = reads["reads_total_number"]
    # assembled reads (true division: percentages are floats)
    reads["percentage_assembled"] = reads["reads_assembled_number"] * 100 / total
    percentage_not_assembled = reads["reads_not_assembled_number"] * 100 / total
    reads["percentage_not_assembled"] = percentage_not_assembled
    if percentage_not_assembled > 50.00:
        json_data["warning"].append("Very few reads assembled")
    elif percentage_not_assembled > 20.00:
        json_data["warning"].append("Few reads assembled")
    # discarded reads
    percentage_discarded = reads["reads_discarded_number"] * 100 / total
    reads["percentage_discarded"] = percentage_discarded
    if percentage_discarded > 10.00:
        json_data["warning"].append("High level of discarded reads")

    ### export as json
    # Open the output only now that there is something to write, and narrow
    # the former bare `except:` to the errors json.dump / open can raise.
    try:
        with open(fileOut, "w") as out:
            json.dump(json_data, out, sort_keys=True, indent=4, ensure_ascii=False)
    except (OSError, TypeError, ValueError):  # pragma: no cover
        print("Dump : FAILED (%s)" % (fileOut))
        return -1
    return
if __name__ == '__main__':
    ### Description ###
    usage = ("Convert the result of pear preprocess log into a json file.\n"
             "usage: %prog -i input_pear_log -o output.json")
    parser = OptionParser(usage=usage)

    ### Options ###
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                      default=True, help="make lots of noise [default]")
    parser.add_option("-i", "--input", metavar="FILE",
                      help="input log file from PEAR merging")
    parser.add_option("-o", "--output", metavar="FILE",
                      help="output file in json format.")

    ### Option parsing ###
    (options, leftover) = parser.parse_args(sys.argv)

    # Both file paths are mandatory.
    if not options.input:
        parser.error('input-log file not given')
    if not options.output:
        parser.error('output file not given')
    # Append the .json extension when the caller omitted it.
    if ".json" not in options.output:
        options.output += ".json"

    pear_converter(options.input, options.output)
=== Pre-process 2 ===
python pear.py /home/vidjil-ci/opt sequence_R1.fastq sequence_R2.fastq demo_set.fastq -r2
===============
Output log in demo_set.fastq.pre.log
____ _____ _ ____
| _ \| ____| / \ | _ \
| |_) | _| / _ \ | |_) |
| __/| |___ / ___ \| _ <
|_| |_____/_/ \_\_| \_\
PEAR v0.9.10 [May 30, 2016]
Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
Forward reads file.................: sequence_R1.fastq
Reverse reads file.................: sequence_R2.fastq
PHRED..............................: 33
Using empirical frequencies........: YES
Statistical method.................: OES
Maximum assembly length............: 999999
Minimum assembly length............: 50
p-value............................: 0.010000
Quality score threshold (trimming).: 0
Minimum read size after trimming...: 1
Maximal ratio of uncalled bases....: 1.000000
Minimum overlap....................: 10
Scoring method.....................: Scaled score
Threads............................: 1
Allocating memory..................: 200,000,000 bytes
Computing empirical frequencies....: DONE
A: 0.269874
C: 0.257653
G: 0.238808
T: 0.233665
9000 uncalled bases
Assemblying reads: 0%Assemblying reads: 100%
Assembled reads ...................: 2,972 / 3,000 (99.067%)
Discarded reads ...................: 0 / 3,000 (0.000%)
Not assembled reads ...............: 28 / 3,000 (0.933%)
Assembled reads file...............: demo_set.fastq.assembled.fastq
Discarded reads file...............: demo_set.fastq.discarded.fastq
Unassembled forward reads file.....: demo_set.fastq.unassembled.forward.fastq
Unassembled reverse reads file.....: demo_set.fastq.unassembled.reverse.fastq
=== Pre-process 2 ===
python pear.py /home/vidjil-ci/opt sequence_R1.fastq sequence_R2.fastq demo_set.fastq -r2
===============
Output log in /mnt/vda/prod/result/tmp/pre/out-039723//demo_set_R_6.fastq.pre.log
____ _____ _ ____
| _ \| ____| / \ | _ \
| |_) | _| / _ \ | |_) |
| __/| |___ / ___ \| _ <
|_| |_____/_/ \_\_| \_\
PEAR v0.9.10 [May 30, 2016]
Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
Forward reads file.................: sequence_R1.fastq
Reverse reads file.................: sequence_R2.fastq
PHRED..............................: 33
Using empirical frequencies........: YES
Statistical method.................: OES
Maximum assembly length............: 999999
Minimum assembly length............: 50
p-value............................: 0.010000
Quality score threshold (trimming).: 0
Minimum read size after trimming...: 1
Maximal ratio of uncalled bases....: 1.000000
Minimum overlap....................: 10
Scoring method.....................: Scaled score
Threads............................: 1
Allocating memory..................: 200,000,000 bytes
Computing empirical frequencies....: DONE
A: 0.269874
C: 0.257653
G: 0.238808
T: 0.233665
9000 uncalled bases
Assemblying reads: 0%Assemblying reads: 100%
Assembled reads ...................: 972 / 3,000 (32.400%)
Discarded reads ...................: 500 / 3,000 (16.666%)
Not assembled reads ...............: 1528 / 3,000 (50.933%)
Assembled reads file...............: demo_set.fastq.assembled.fastq
Discarded reads file...............: demo_set.fastq.discarded.fastq
Unassembled forward reads file.....: demo_set.fastq.unassembled.forward.fastq
Unassembled reverse reads file.....: demo_set.fastq.unassembled.reverse.fastq
!LAUNCH: python ../../pear_structured_log.py -i ../data/pear_log.log -o ../data/pear_strucured.json; cat ../data/pear_strucured.json
$ Correct number of assembled reads
1:"reads_assembled_number": 2972
1:"reads_total_number": 3000
$ Correct number of unassembled reads
1:"reads_not_assembled_number": 28
$ Correct information on input files used by pear
1:"assembled_reads": "demo_set.fastq.assembled.fastq",
1:"discarded_reads": "demo_set.fastq.discarded.fastq",
1:"unassembled_forward": "demo_set.fastq.unassembled.forward.fastq",
1:"unassembled_reverse": "demo_set.fastq.unassembled.reverse.fastq"
$ Correct parameter return
1:"forward_file": "sequence_R1.fastq",
1:"minimum_overlap": "10",
1:"phred": "33",
1:"reverse_file": "sequence_R2.fastq",
1:"scoring_methode": "Scaled score",
1:"version": "PEAR v0.9.10 \[May 30, 2016\]"
$ Correct nucleotides frequencies
1:"base_frequency_a": "0.269874",
1:"base_frequency_c": "0.257653",
1:"base_frequency_g": "0.238808",
1:"base_frequency_t": "0.233665",
\ No newline at end of file
!LAUNCH: python ../../pear_structured_log.py -i ../data/pear_log_warning.log -o ../data/pear_strucured_waring.json; cat ../data/pear_strucured_waring.json
$ Correct percentage of reads
1:"percentage_assembled": 32.4,
1:"percentage_discarded": 16.666666666666668,
1:"percentage_not_assembled": 50.93333333333333,
$ Correct number of reads
1:"reads_assembled_number": 972,
1:"reads_discarded_number": 500,
1:"reads_not_assembled_number": 1528,
1:"reads_total_number": 3000
$ Correct addition of warnings to the structured log
1:"Very few reads assembled"
1:"High level of discarded reads"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment