Commit 8cc7dbb1 by MARIJON Pierre

We delegate the search of the good extremity to path_in_gfa

parent fd4f4a4d
......@@ -14,73 +14,6 @@ def project_name(dir_path):
return max(res, key=res.get)
# Big code duplication with script
import pandas
import io
import subprocess
class Overlap:
def __init__(self, ov_bin, ov_store, gk_store, error):
self._ov_bin = ov_bin
self._ov_store = ov_store
self._gk_store = gk_store
self._error = error
def _get_all_overlap(self, index, ext5=True):
cmd = [self._ov_bin, "-O", self._ov_store, "-G", self._gk_store,
"-d", str(index)]
cmd.append("-d5") if ext5 else cmd.append("-d3")
process = subprocess.Popen(cmd,
out, _ = process.communicate()
return out
def best_ov(self, index, ext5=True):
ov = pandas.read_csv(io.StringIO(self._get_all_overlap(index, ext5)),
names=["id", "id_b", "flipped", "span", "beg",
"end", "beg_b", "end_b", "error"],
# remove containment overlap
len_read = ov["end"].max()
for i in ov.index:
if ext5 :
if not ov.loc[i]["end"] == len_read :
ov.loc[i] = None
if not ov.loc[i]["beg"] == 0:
ov.loc[i] = None
ov.dropna(0, inplace=True)
if ov.empty:
return {"id": -1.0, "id_b": -1.0, "flipped": -1.0, "span": -1.0,
"beg": -1.0, "end": -1.0, "beg_b": -1.0, "end_b": -1.0,
"error": -0.0}
# sort
first_col = "beg" if ext5 else "end"
sort_order = [True, False, True] if ext5 else [False, True, True]
ov.sort_values([first_col, "span", "error"], ascending=sort_order,
i = 0
for i in range(len(ov.index)):
if ov.loc[ov.index[i]]["error"] <= self._error:
if i+1 == len(ov.index):
return ov.loc[ov.index[0]]
return ov.loc[ov.index[i]]
......@@ -88,37 +21,17 @@ class Overlap:
# Big code duplication with script
import csv
def parse_canu_read_name(filename, invert):
def parse_canu_read_name(filename):
index = (0) if invert else (1)
id2readname = dict()
with open(filename, "r") as in_csv:
reader = csv.reader(in_csv, delimiter='\t')
for row in reader:
id2readname[row[0]] = row[1].split(" ")[0]
id2readname = {v: k for k, v in id2readname.items()} if invert else id2readname
return id2readname
def build_correspondance_function(filename, invert):
id2readname = parse_canu_read_name(filename, invert)
if not invert:
def corres_func(iid):
return str(id2readname[clean_read_name(iid[:-1])]) + iid[-1]
def corres_func(iid):
return id2readname[clean_read_name(iid[:-1])] + iid[-1]
return corres_func
def clean_read_name(name):
return name.replace("read", "").lstrip('0')
def read_contig_layout(filename):
import csv
......@@ -156,31 +69,11 @@ def read_contig_layout(filename):
return read2info, tig2extremity
def __search_contig_extremity(overlap, read2info, n):
nP = str(int(overlap.best_ov(n, True)["id_b"]))
nM = str(int(overlap.best_ov(n, False)["id_b"]))
ret = list()
if nP in read2info and read2info[nP]["tigID"] == read2info[n]["tigID"]:
return n + "-"
elif nM in read2info and read2info[nM]["tigID"] == read2info[n]["tigID"]:
return n + "+"
raise Exception("We can't find free read extremity for "+n)
def generate_output_extract_graph(contig_layout, read_names, ov_bin, ov_store, gk_store, prefix):
def generate_output_extract_graph(contig_layout, read_names, prefix):
read2info, tig2extremity = read_contig_layout(contig_layout)
id2name = parse_canu_read_name(read_names)
overlap = Overlap(ov_bin, ov_store, gk_store, 100000.0)
tig2free = dict()
for tig, (beg, end) in tig2extremity.items():
if beg != end:
tig2free[tig] = (__search_contig_extremity(overlap, read2info, beg),
__search_contig_extremity(overlap, read2info, end))
except Exception as e:
tig2ext_filter = {k: (a, b) for k, (a, b) in tig2extremity.items() if a != b}
search_path = prefix + "_extremity_search.csv"
result_path = prefix + "_extremity_result.csv"
......@@ -188,20 +81,18 @@ def generate_output_extract_graph(contig_layout, read_names, ov_bin, ov_store, g
with open(search_path, "w") as out_file:
writer = csv.writer(out_file, delimiter=",")
writer.writerow(["tigA", "readA", "tigB", "readB"])
for tigA, tigB in itertools.product(tig2free.keys(), tig2free.keys()):
for tigA, tigB in itertools.product(tig2ext_filter.keys(), tig2ext_filter.keys()):
if tigA == tigB :
id2name = build_correspondance_function(read_names, False)
readAP = id2name(tig2free[tigA][0])
readBP = id2name(tig2free[tigB][0])
readAM = id2name(tig2free[tigA][1])
readBM = id2name(tig2free[tigB][1])
read_a_beg = id2name[tig2ext_filter[tigA][0]]
read_b_beg = id2name[tig2ext_filter[tigB][0]]
read_a_end = id2name[tig2ext_filter[tigA][1]]
read_b_end = id2name[tig2ext_filter[tigB][1]]
writer.writerow([tigA, readAP, tigB, readBP[:-1]])
writer.writerow([tigA, readAM, tigB, readBP[:-1]])
writer.writerow([tigA, readAP, tigB, readBM[:-1]])
writer.writerow([tigA, readAM, tigB, readBM[:-1]])
writer.writerow([tigA, read_a_beg, tigB, read_b_beg])
writer.writerow([tigA, read_a_beg, tigB, read_b_end])
writer.writerow([tigA, read_a_end, tigB, read_b_beg])
writer.writerow([tigA, read_a_end, tigB, read_b_end])
return result_path
return result_path, id2name
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment