Commit 45287c70 authored by MARIJON Pierre's avatar MARIJON Pierre

Whe can convert paf in gfa without construction of overlap graph

parent 3aa94238
......@@ -23,7 +23,7 @@ class Parser:
self.__internal = internal
self.__segments = OrderedDict()
self.__links = nx.DiGraph()
self.__links = list()
self.__containments = OrderedDict()
self.__first_meet = dict()
......@@ -69,27 +69,29 @@ class Parser:
**{self.__cores[i]: v for i, v in enumerate(re.split("\s+", line)[:12])})
l = Parser.__line_type_correction(l)
overhang = Parser.__compute_overhang(l)
maplen = max(l.end_a - l.beg_a, l.end_b - l.beg_b)
logging.debug("name A {} name B {}, beg_A {} beg_B {}".format(l.read_a, l.read_b, l.beg_a, l.beg_b))
internal_match = False
if overhang > maplen*0.8:
if overhang > maplen*0.8 or overhang > 1000:
# Strange overlap
logging.debug("Internal match between {} {}".format(l.read_a, l.read_b))
logging.debug("overhang {} maplen*x {}".format(overhang, maplen*0.8))
internal_match = True
if internal_match and not self.__internal:
# We add segment but we can remove if we want
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
logging.debug("We didn't keep internal match between", l.read_a, l.read_b)
return
if overhang < 1000 and not internal_match and l.beg_a <= l.beg_b and l.len_a - l.end_a < l.len_b - l.end_b:
if self.__internal:
if l.strand == "+":
self._add_link(l.read_a, "+", l.read_b, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
self._add_link(l.read_a, "+", l.read_b, "-", l.nb_base,
l.nb_match, overhang/maplen)
elif l.beg_a <= l.beg_b and l.len_a - l.end_a < l.len_b - l.end_b:
# B containe A
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
......@@ -100,7 +102,7 @@ class Parser:
else:
self._add_containment(l.read_b, "-", l.read_a, "+", l.beg_b,
l.nb_base)
elif overhang < 1000 and not internal_match and l.beg_a >= l.beg_b and l.len_a - l.end_a > l.len_b - l.end_b:
elif l.beg_a >= l.beg_b and l.len_a - l.end_a > l.len_b - l.end_b:
# A containe B
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
......@@ -111,22 +113,40 @@ class Parser:
else:
self._add_containment(l.read_a, "-", l.read_b, "+", l.beg_a,
l.nb_base)
elif l.beg_a > l.beg_b:
# A overlap B
logging.debug("A overlap B")
elif l.strand == "+":
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
self._add_link(l.read_a, l.read_b, l.strand, l.nb_base, l.nb_match,
overhang/maplen)
if l.beg_a > l.beg_b:
# A overlap B
logging.debug("A overlap B")
self._add_link(l.read_a, "+", l.read_b, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
# B overlap A
logging.debug("B overlap A")
self._add_link(l.read_b, "+", l.read_a, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
# B overlap A
logging.debug("B overlap A")
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
self._add_link(l.read_b, l.read_a, l.strand, l.nb_base, l.nb_match,
overhang/maplen)
if l.beg_a > l.len_a - l.end_a:
if l.beg_a > l.len_b - l.end_b:
self._add_link(l.read_a, "+", l.read_b, "-", l.nb_base,
l.nb_match, overhang/maplen)
else:
self._add_link(l.read_b, "+", l.read_a, "-", l.nb_base,
l.nb_match, overhang/maplen)
else:
if l.len_a - l.beg_a > l.end_b:
self._add_link(l.read_a, "-", l.read_b, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
self._add_link(l.read_b, "-", l.read_a, "+", l.nb_base,
l.nb_match, overhang/maplen)
def parse_lines(self, lines):
for line in lines:
......@@ -143,23 +163,8 @@ class Parser:
continue
yield "S\t{}\t*\tLN:i:{}".format(seq, length)
for node in self.__links.nodes():
for child in self.__links.successors(node):
link = ["L"]
for n in (node, child):
if n.endswith("_"):
link.append(n[:-1])
link.append("-")
else:
link.append(n)
link.append("+")
data = self.__links.get_edge_data(node, child)
link.append(data["ov_len"] + "M")
link.append("NM:i:" + str(int(data["ov_len"]) - int(data["nb_match"])))
link.append("om:f:{:.2f}".format(data["overhang_len"]))
yield "\t".join(link)
for link in self.__links:
yield "L\t" + "\t".join(link)
for conted, list_conter in self.__containments.items():
for (conter, straned, straner, pos, ov) in list_conter:
......@@ -176,30 +181,10 @@ class Parser:
def _add_segment(self, name, length):
self.__segments[name] = length
def _add_link(self, name_a, name_b, strand, ov_len, nb_match, overhang_maplen):
if strand == "+":
if name_a in self.__links.nodes() or name_b in self.__links.nodes():
logging.debug("add", name_a, name_b)
self.__links.add_edge(name_a, name_b,
**{"ov_len": ov_len, "nb_match": nb_match,
"overhang_len": overhang_maplen})
else:
logging.debug("add", name_a+"_", name_b+"_")
self.__links.add_edge(name_a + "_", name_b + "_",
**{"ov_len": ov_len, "nb_match": nb_match,
"overhang_len": overhang_maplen})
else:
if name_a in self.__links.nodes() or name_b + "_" in self.__links.nodes():
logging.debug("add ", name_a, name_b + "_")
self.__links.add_edge(name_a, name_b + "_",
**{"ov_len": ov_len, "nb_match": nb_match,
"overhang_len": overhang_maplen})
else:
logging.debug("add ", name_a + "_", name_b)
self.__links.add_edge(name_a + "_", name_b,
**{"ov_len": ov_len, "nb_match": nb_match,
"overhang_len": overhang_maplen})
def _add_link(self, name_a, strand_a, name_b, strand_b, ov_len, nb_match, overhang_maplen):
self.__links.append([name_a, strand_a, name_b, strand_b, ov_len+"M",
"NM:i:"+str(int(ov_len) - int(nb_match)),
"om:f:{:.2f}".format(overhang_maplen)])
def _add_containment(self, container, strand_ner, contained, strand_ned, pos, length):
if contained not in self.__containments:
......
......@@ -16,16 +16,17 @@ S 2 * LN:i:10000
S 5 * LN:i:10000
S 3 * LN:i:10000
S 4 * LN:i:10000
L 1 - 2 + 8001M NM:i:0 om:f:0.00
L 5 + 1 - 7999M NM:i:0 om:f:0.00
L 2 + 3 - 8000M NM:i:0 om:f:0.00
L 3 - 4 - 8000M NM:i:0 om:f:0.00
L 1 + 2 - 8001M NM:i:0 om:f:0.00
L 1 - 5 + 7999M NM:i:0 om:f:0.00
L 2 - 3 + 8000M NM:i:0 om:f:0.00
L 3 + 4 + 8000M NM:i:0 om:f:0.00
"""
p = paf2gfa.Parser()
p.parse_lines(line.split("\n"))
assert set(resu.split("\n")) == set(p.get_gfa().split("\n"))
print(p.get_gfa())
assert resu.strip() == p.get_gfa().strip()
def test_with_2_repetition_strand_diff():
line = """
......@@ -50,18 +51,19 @@ S 9 * LN:i:10000
S 7 * LN:i:10000
S 8 * LN:i:10000
S 6 * LN:i:10000
L 1 + 2 - 8000M NM:i:0 om:f:0.00
L 2 - 3 + 8001M NM:i:0 om:f:0.00
L 1 + 2 - 8001M NM:i:0 om:f:0.00
L 5 - 1 + 8000M NM:i:0 om:f:0.00
L 2 - 3 + 8000M NM:i:0 om:f:0.00
L 3 + 4 + 8000M NM:i:0 om:f:0.00
L 4 + 9 + 8000M NM:i:0 om:f:0.00
L 4 + 7 - 8000M NM:i:0 om:f:0.00
L 8 + 7 + 8000M NM:i:0 om:f:0.00
L 7 + 6 + 8000M NM:i:0 om:f:0.00
L 6 + 5 - 8000M NM:i:0 om:f:0.00
L 5 - 1 + 8000M NM:i:0 om:f:0.00
"""
p = paf2gfa.Parser()
p.parse_lines(line.split("\n"))
assert set(resu.split("\n")) == set(p.get_gfa().split("\n"))
print(p.get_gfa())
open("tmp.gfa", "w").write(p.get_gfa())
assert resu.strip() == p.get_gfa().strip()
......@@ -19,7 +19,7 @@ def test_A_3_same():
# <-------------
def test_A_3_diff():
line = "1\t1000\t10\t1000\t-\t2\t1000\t10\t1000\t30\t980\t255"
resu = "S\t1\t*\tLN:i:1000\nS\t2\t*\tLN:i:1000\nL\t2\t-\t1\t+\t980M\tNM:i:950\tom:f:0.00\n"
resu = "S\t1\t*\tLN:i:1000\nS\t2\t*\tLN:i:1000\nL\t1\t+\t2\t-\t980M\tNM:i:950\tom:f:0.00\n"
p = paf2gfa.Parser()
p.parse_line(line)
......@@ -43,7 +43,7 @@ def test_A_5_same():
# <----------------
def test_A_5_diff():
line = "1\t1000\t0\t980\t-\t2\t1000\t0\t980\t30\t960\t255"
resu = "S\t1\t*\tLN:i:1000\nS\t2\t*\tLN:i:1000\nL\t2\t-\t1\t+\t960M\tNM:i:930\tom:f:0.00\n"
resu = "S\t1\t*\tLN:i:1000\nS\t2\t*\tLN:i:1000\nL\t1\t-\t2\t+\t960M\tNM:i:930\tom:f:0.00\n"
p = paf2gfa.Parser()
p.parse_lines(line.split("\n"))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment