#!/usr/bin/env python
#-*- coding: utf-8 -*-

# ===============================
# Script by Florian Thonier
# florian@vidjil.org
# ===============================
from __future__ import division
import operator
import sys, os
import json
from operator     import itemgetter
from optparse     import OptionParser #@UnusedWildImport
from _collections import defaultdict
# ===============================
VERSION   = "0.02"
TIMESTAMP = "2018-04-27"
# ===============================


def get_value(line):
    ''' Return the value part of a "label....: value" log line.

    Only the first ": " is treated as the separator, so a value that itself
    contains ": " (for instance a file path) is returned whole.
    Raises IndexError when the line holds no ": " separator.
    '''
    return line.split(': ', 1)[1]

def convert_value(line):
    ''' Split the "x / y (z%)" value of a reads result line into ["x", "y"].

    The returned items are strings with spaces removed; thousands
    separators (",") are left untouched.
    '''
    # Keep only what precedes the parenthesised percentage.
    counts_part, _, _ = get_value(line).partition("(")
    compact = counts_part.replace(" ", "")
    return compact.split("/")


# Ordered (marker, section, key) triples for lines whose value is stored as is.
# The order matters: the first marker found in a line wins.
_PEAR_PLAIN_FIELDS = (
    ("Forward reads file", "settings",       "forward_file"),
    ("Reverse reads file", "settings",       "reverse_file"),
    ("PHRED",              "settings",       "phred"),
    ("Scoring method",     "settings",       "scoring_methode"),
    ("Minimum overlap",    "settings",       "minimum_overlap"),
    ("A:",                 "base_frequency", "base_frequency_a"),
    ("C:",                 "base_frequency", "base_frequency_c"),
    ("G:",                 "base_frequency", "base_frequency_g"),
    ("T:",                 "base_frequency", "base_frequency_t"),
)

# Output file lines; they must be tested before the read-count lines because
# "Assembled reads file" also contains "Assembled reads".
_PEAR_OUTPUT_FIELDS = (
    ("Assembled reads file",           "assembled_reads"),
    ("Discarded reads file",           "discarded_reads"),
    ("Unassembled forward reads file", "unassembled_forward"),
    ("Unassembled reverse reads file", "unassembled_reverse"),
)


def _read_count(text):
    ''' Convert a count such as "99,948" into an int '''
    return int(text.replace(",", ""))


def _percentage(count, total):
    ''' Return count as a percentage of total, or 0.0 when total is zero '''
    if not total:
        return 0.0
    # True division: the file enables it through "from __future__ import division".
    return int(count) * 100 / total


def _parse_pear_line(line, json_data):
    ''' Store into json_data the information carried by one log line, if any '''
    ### version, setting and parameters
    if "PEAR v" in line:
        json_data["settings"]["version"] = line
        return
    for marker, section, key in _PEAR_PLAIN_FIELDS:
        if marker in line:
            json_data[section][key] = get_value(line)
            return

    ### bases frequencies (uncalled bases have no ": " separator)
    if "uncalled bases" in line:
        json_data["base_frequency"]["uncalled_base"] = line.replace(" uncalled bases", "").replace("  ", "")
        return

    ### output file
    for marker, key in _PEAR_OUTPUT_FIELDS:
        if marker in line:
            json_data["output_file"][key] = get_value(line)
            return

    ### number of reads
    if "Assembled reads" in line:
        counts = convert_value(line)
        json_data["reads"]["reads_assembled_number"] = _read_count(counts[0])
        json_data["reads"]["reads_total_number"]     = _read_count(counts[1])
    elif "Discarded reads" in line:
        json_data["reads"]["reads_discarded_number"] = _read_count(convert_value(line)[0])
    elif "Not assembled reads" in line:
        json_data["reads"]["reads_not_assembled_number"] = _read_count(convert_value(line)[0])


def _add_read_statistics(json_data):
    ''' Add the read percentages and the related warnings to json_data '''
    json_data["warning"] = []
    reads = json_data["reads"]
    total = reads["reads_total_number"]

    # assembled reads
    reads["percentage_assembled"] = _percentage(reads["reads_assembled_number"], total)
    percentage_not_assembled = _percentage(reads["reads_not_assembled_number"], total)
    reads["percentage_not_assembled"] = percentage_not_assembled
    if percentage_not_assembled > 50.00:
        json_data["warning"].append("Very few reads assembled")
    elif percentage_not_assembled > 20.00:
        json_data["warning"].append("Few reads assembled")

    # discarded reads
    percentage_discarded = _percentage(reads["reads_discarded_number"], total)
    reads["percentage_discarded"] = percentage_discarded
    if percentage_discarded > 10.00:
        json_data["warning"].append("High level of discarded reads")


def pear_converter(fileIn, fileOut):
    ''' Read line by line the log file, store data into a dict, and export it as json

    fileIn  -- path of the PEAR log to read
    fileOut -- path of the json file to write

    Returns None on success and -1 (after printing a message) when the json
    dump fails. Errors raised while opening either file propagate, and a
    KeyError is raised when the log lacks one of the read-count lines.
    The output file is only opened once the log has been fully parsed, so a
    parsing failure does not leave an empty output file behind.
    '''
    json_data = defaultdict(dict)

    with open(fileIn, "r") as fi:
        for line in fi:
            # Drop the line ending, including a "\r" left by Windows files.
            _parse_pear_line(line.rstrip("\r\n"), json_data)

    ### Warnings
    _add_read_statistics(json_data)

    ### export as json
    fo = open(fileOut, "w")
    try:
        # "with" closes the file even when the dump fails; a failure while
        # closing is reported the same way as a failure while dumping.
        with fo:
            json.dump(json_data, fo, sort_keys=True, indent=4, ensure_ascii=False)
    except Exception:  # pragma: no cover
        print("Dump : FAILED (%s)" % (fileOut))
        return -1

    return


if __name__ == '__main__':
    # Command line entry point: convert a PEAR log (-i) into a json file (-o).

    ### Description ###
    usage  = "Convert the result of pear preprocess log into a json file.\n"
    usage += "usage: %prog -i input_pear_log -o output.json"
    parser = OptionParser(usage=usage)

    ### Options ###
    # NOTE: --verbose is accepted but its value is not read anywhere below.
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=True,
                      help="make lots of noise [default]")
    parser.add_option("-i", "--input",
                      metavar="FILE", help="input log file from PEAR merging")
    parser.add_option("-o", "--output",
                      metavar="FILE", help="output file in json format.")

    ### Parse the options ###
    # parse_args() reads sys.argv[1:] by default, leaving the program name out.
    (options, _positional) = parser.parse_args()
    if not options.input:   # if filename is not given
        parser.error('input-log file not given')
    if not options.output:   # if filename is not given
        parser.error('output file not given')

    # Append the extension only when the name does not already end with it;
    # a ".json" located elsewhere in the path does not count.
    if not options.output.endswith(".json"):
        options.output = options.output + ".json"

    status = pear_converter(options.input, options.output)
    if status == -1:
        # Report a failed dump to the calling shell or pipeline.
        sys.exit(1)