Commit 5f31a762 authored by MARIJON Pierre's avatar MARIJON Pierre

Reformat and explicity colname of contig report

parent 604885e6
......@@ -4,7 +4,6 @@ rule canu_contig_studies:
canu_read2tig=canu_read2tig,
canu_gkp_store=canu_gkp_store,
canu_ovl_store=canu_ovl_store,
canu_bog_gfa=canu_bog + ".gfa"
output:
canu_contig_report + ".csv",
......@@ -12,7 +11,7 @@ rule canu_contig_studies:
shell:
" ".join([
package_path + "./script/ext_contig_report.py", "{input.canu_read2tig}", "{input.canu_bog_gfa}",
package_path + "./script/ext_contig_report.py", "{input.canu_read2tig}",
config['canu']['ov_dump_bin'], "{input.canu_ovl_store}", "{input.canu_gkp_store}", canu_contig_report
])
......
......@@ -17,23 +17,6 @@ import networkx as nx
from collections import defaultdict
def normalized_node(node):
return int(node[4:].lstrip('0'))
def read_gfa(filename):
G = nx.Graph()
with open(filename, "r") as gfafile:
gfaparser = csv.reader(gfafile, delimiter="\t")
for row in gfaparser:
if row[0] == "S":
G.add_node(normalized_node(row[1]))
elif row[0] == "L":
G.add_edge(normalized_node(row[1]), normalized_node(row[3]))
return G
class Overlap:
def __init__(self, ov_bin, ov_store, gk_store, error):
......@@ -94,6 +77,92 @@ class Overlap:
return ov.loc[ov.index[i]]
##########
# WARNING
#########
#
# Big code duplication with script basename2graphename.py
import csv
def parse_canu_read_name(filename, invert):
index = (0) if invert else (1)
id2readname = dict()
with open(filename, "r") as in_csv:
reader = csv.reader(in_csv, delimiter='\t')
for row in reader:
id2readname[row[0]] = row[1].split(" ")[0]
id2readname = {v: k for k, v in id2readname.items()} if invert else id2readname
return id2readname
def build_correspondance_function(filename, invert):
id2readname = parse_canu_read_name(filename, invert)
if not invert:
def corres_func(iid):
return str(id2readname[clean_read_name(iid[:-1])]) + iid[-1]
else:
def corres_func(iid):
return id2readname[clean_read_name(iid[:-1])] + iid[-1]
return corres_func
def clean_read_name(name):
return name.replace("read", "").lstrip('0')
def read_contig_layout(filename):
import csv
read2info = defaultdict(dict)
tig2extremity = defaultdict(set)
with open(filename, "r") as fd:
iterator = csv.reader(fd, delimiter="\t")
key = iterator.__next__()
key.pop(0)
actual_tig = 0
for row in iterator:
if row[3] > row[4]:
row[3], row[4] = row[4], row[3]
# index all info
for k, v in zip(key, row[1:]):
read2info[row[0]][k] = v
#index extremity
if actual_tig == 0:
actual_tig = row[1]
actual_begin = row[0]
actual_end = row[0]
if actual_tig != row[1]:
tig2extremity[actual_tig] = (actual_begin, actual_end)
actual_tig = row[1]
actual_begin = row[0]
actual_end = row[0]
if int(row[4]) > int(read2info[actual_end]["end"]):
actual_end = row[0]
return read2info, tig2extremity
def __search_contig_extremity(overlap, read2info, n):
nP = str(int(overlap.best_ov(n, True)["id_b"]))
nM = str(int(overlap.best_ov(n, False)["id_b"]))
ret = list()
if nP in read2info and read2info[nP]["tigID"] == read2info[n]["tigID"]:
return "-"
elif nM in read2info and read2info[nM]["tigID"] == read2info[n]["tigID"]:
return "+"
else:
raise Exception("We can't find free read extremity for "+n)
def main(args):
......@@ -101,7 +170,6 @@ def main(args):
formatter_class=argparse.
ArgumentDefaultsHelpFormatter)
parser.add_argument("read2tig", help="canu read to tig file")
parser.add_argument("bog", help="canu bog")
parser.add_argument("ovBinary", help="ovStoreDump binary path")
parser.add_argument("ovStore", help="ovStore dir path")
parser.add_argument("gkpStore", help="gatekeeper dir path")
......@@ -110,72 +178,57 @@ def main(args):
arg = vars(parser.parse_args(args))
read2tig = arg["read2tig"]
bog = arg["bog"]
ovBinary = arg["ovBinary"]
ovStore = arg["ovStore"]
gkpStore = arg["gkpStore"]
outBaseName = arg["outBaseName"]
error = arg["error"]
read2tig = arg["read2tig"]
ovBinary = arg["ovBinary"]
ovStore = arg["ovStore"]
gkpStore = arg["gkpStore"]
outBaseName = arg["outBaseName"]
error = arg["error"]
overlap = Overlap(ovBinary, ovStore, gkpStore, error)
read2tig_df = pandas.read_csv(read2tig, "\t", index_col=0)
col_name = ["tig", "read name", "free extremity", "read name in paf",
"overlap error", "span"]
result = list()
overlap_series = list()
read2info, tig2extremity = read_contig_layout(read2tig)
for tig, reads in tig2extremity.items():
if reads[0] == reads[1]:
continue
for read in reads:
try:
extremity = __search_contig_extremity(overlap, read2info, read)
except:
extremity = "None"
line = list()
line.append(tig)
line.append(read)
line.append(extremity)
if extremity == "+":
ov = overlap.best_ov(read)
elif extremity == "-":
ov = overlap.best_ov(read, False)
else:
ov = overlap.best_ov(read)
bog_graph = read_gfa(bog)
line.append(ov["id_b"])
line.append(ov["error"])
overlap_series.append(ov["error"])
line.append(ov["span"])
result.append(line)
pandas.Series(overlap_series).hist().get_figure().savefig(outBaseName+".png")
with open(outBaseName + ".csv", "w") as fd:
writer = csv.writer(fd, delimiter=",")
writer.writerow(col_name)
for line in result:
writer.writerow(line)
result = pandas.DataFrame(columns=("name_5", "nb_bog5", "name_paf55",
"error_paf55", "span_paf55",
"name_paf53", "error_paf53",
"span_paf53", "name_3", "nb_bog3",
"name_paf35", "error_paf35",
"span_paf35", "name_paf33",
"error_paf33", "span_paf33"))
group = read2tig_df.groupby(["tigID"])
for name, contig in group:
# not run on contig with just on read
if len(contig) > 2:
for index in contig.index:
# find the first node in the bog
if index in bog_graph.node:
result.loc[name] = [1 for _ in range(len(result.columns))]
result.loc[name]["name_5"] = index
result.loc[name]["nb_bog5"] = sum(1 for _ in nx.all_neighbors(bog_graph, index))
ov = overlap.best_ov(index)
result.loc[name]["name_paf55"] = ov["id_b"]
result.loc[name]["error_paf55"] = ov["error"]
result.loc[name]["span_paf55"] = ov["span"]
ov = overlap.best_ov(index, False)
result.loc[name]["name_paf53"] = ov["id_b"]
result.loc[name]["error_paf53"] = ov["error"]
result.loc[name]["span_paf53"] = ov["span"]
break
for index in reversed(contig.index):
# find the last node in the bog
if index in bog_graph.node:
result.loc[name]["name_3"] = index
result.loc[name]["nb_bog3"] = sum(1 for _ in nx.all_neighbors(bog_graph, index))
ov = overlap.best_ov(index)
result.loc[name]["name_paf35"] = ov["id_b"]
result.loc[name]["error_paf35"] = ov["error"]
result.loc[name]["span_paf35"] = ov["span"]
ov = overlap.best_ov(index, False)
result.loc[name]["name_paf33"] = ov["id_b"]
result.loc[name]["error_paf33"] = ov["error"]
result.loc[name]["span_paf33"] = ov["span"]
break
pandas.concat([result["error_paf55"], result["error_paf53"],
result["error_paf35"], result["error_paf33"]]
).hist().get_figure().savefig(outBaseName+".png")
result.to_csv(outBaseName+".csv")
if __name__ == "__main__":
main(sys.argv[1:])
......@@ -117,8 +117,7 @@ def canu_only(template_path, canu_correction_report, canu_fasta, canu_bog_gfa,
param["canu_contig_paf_legend"] = str(base64.b64encode(generate_legend(canu_contig_on_paf_legend).read()))[2:-1]
param["canu_stat"] = canu_stat
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv, index_col=0).to_html()
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv).to_html(index=False)
param["canu_contig_report_png"] = str(base64.b64encode(open(canu_contig_report_png, "rb").read()))[2:-1]
# build path search
......@@ -204,7 +203,7 @@ def canu_minimap(template_path, canu_correction_report, canu_fasta, canu_bog_gfa
param["canu_stat"] = canu_stat
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv, index_col=0).to_html()
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv).to_html(index=False)
param["canu_contig_report_png"] = str(base64.b64encode(open(canu_contig_report_png, "rb").read()))[2:-1]
# build path search
......@@ -329,7 +328,7 @@ def canu_miniasm(template_path, canu_correction_report, canu_fasta, canu_bog_gfa
param["canu_stat"] = canu_stat
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv, index_col=0).to_html()
param["canu_contig_report_csv"] = pandas.read_csv(canu_contig_report_csv).to_html(index=False)
param["canu_contig_report_png"] = str(base64.b64encode(open(canu_contig_report_png, "rb").read()))[2:-1]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment