Commit ee809fbd authored by MARIJON Pierre's avatar MARIJON Pierre

We remove containment and internal matches only during a get_gfa call or a clean_data call

parent 07c8d194
......@@ -8,7 +8,7 @@ import logging
import argparse
from collections import OrderedDict, defaultdict
from collections import defaultdict
from types import SimpleNamespace
class Parser:
......@@ -20,10 +20,10 @@ class Parser:
self.__containment = containment
self.__internal = internal
self.__not_all_internal = set()
self.__segments = OrderedDict()
self.__segments = dict()
self.__links = dict()
self.__containments = OrderedDict()
self.__internals = dict()
self.__containments = defaultdict(list)
@property
def containment(self):
......@@ -58,6 +58,10 @@ class Parser:
def links(self):
return self.__links
@property
def internals(self):
return self.__internals
@property
def containments(self):
return self.__containments
......@@ -80,14 +84,12 @@ class Parser:
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
if self.__internal:
if l.strand == "+":
self._add_link(l.read_a, "+", l.read_b, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
self._add_link(l.read_a, "+", l.read_b, "-", l.nb_base,
l.nb_match, overhang/maplen)
if l.strand == "+":
self._add_internal(l.read_a, "+", l.read_b, "+", l.nb_base,
l.nb_match, overhang/maplen)
else:
self._add_internal(l.read_a, "+", l.read_b, "-", l.nb_base,
l.nb_match, overhang/maplen)
elif l.strand == "+" and l.beg_a <= l.beg_b and l.len_a - l.end_a < l.len_b - l.end_b:
# B contains A
self._add_segment(l.read_a, l.len_a)
......@@ -163,26 +165,22 @@ class Parser:
def generate_gfa(self):
yield "H\tVN:Z:1.0"
for seq, length in self.__segments.items():
if not self.containment and seq in self.__containments:
continue
if not self.internal and seq not in self.__not_all_internal:
continue
yield "S\t{}\t*\tLN:i:{}".format(seq, length)
for (read_a, _, read_b, _), link in self.__links.items():
if not self.containment and (read_a in self.__containments or read_b in self.__containments):
continue
for _, link in self.__links.items():
yield "L\t" + "\t".join(link)
if self.containment:
for conted, list_conter in self.__containments.items():
for (conter, straned, straner, pos, ov) in list_conter:
yield "\t".join(["C", conter, straner, conted, straned,
for _, link in self.__internals.items():
yield "L\t" + "\t".join(link)
for conted, list_conter in self.__containments.items():
for (conter, straned, straner, pos, ov) in list_conter:
yield "\t".join(["C", conter, straner, conted, straned,
str(pos), str(ov)]) + "M"
def get_gfa(self):
self.clean_data()
lines = ""
for line in self.generate_gfa():
lines += line + "\n"
......@@ -193,9 +191,6 @@ class Parser:
self.__segments[name] = length
def _add_link(self, name_a, strand_a, name_b, strand_b, ov_len, nb_match, overhang_maplen):
self.__not_all_internal.add(name_a)
self.__not_all_internal.add(name_b)
signature = (name_a, strand_a, name_b, strand_b)
value = (name_a, strand_a, name_b, strand_b, ov_len+"M",
"NM:i:"+str(int(ov_len) - int(nb_match)),
......@@ -223,16 +218,48 @@ class Parser:
else:
self.__links[signature] = value
def _add_internal(self, name_a, strand_a, name_b, strand_b, ov_len,
                  nb_match, overhang_maplen):
    """Record an internal match between two reads.

    The match is stored in ``self.__internals`` under the key
    ``(name_a, strand_a, name_b, strand_b)``; an existing entry with the
    same key is overwritten.

    ``ov_len`` must be a string (it is concatenated with ``"M"``), and both
    ``ov_len`` and ``nb_match`` must be convertible with ``int()``.
    ``overhang_maplen`` is formatted with two decimals.
    """
    key = (name_a, strand_a, name_b, strand_b)
    # Fields are built in the order they appear in the stored record.
    overlap_field = ov_len + "M"
    mismatch_field = "NM:i:" + str(int(ov_len) - int(nb_match))
    ratio_field = "om:f:{:.2f}".format(overhang_maplen)
    self.__internals[key] = (*key, overlap_field, mismatch_field, ratio_field)
def _add_containment(self, container, strand_ner, contained, strand_ned, pos, length):
if contained not in self.__containments:
self.__containments[contained] = list()
if container in self.containments and any([contained == c[0] for c in self.containments[container]]):
logging.critical("Warning {} contain {} before".format(container, contained))
return
self.__containments[contained].append([container, strand_ner,
strand_ned, pos, length])
self.__containments[contained].append((container, strand_ner,
strand_ned, pos, length))
def clean_data(self):
    """Drop disabled record types and prune segments nothing refers to.

    Reads named by a link are always kept. Reads named by an internal
    match are kept only when ``self.__internal`` is true; otherwise all
    internal matches are discarded. Reads named by a containment (either
    side) are kept only when ``self.__containment`` is true; otherwise all
    containments are discarded. Every segment whose name is not kept is
    then removed from ``self.__segments``.
    """
    keep_read = set()
    for read_a, _, read_b, _ in self.__links:
        keep_read.update((read_a, read_b))

    if self.__internal:
        for read_a, _, read_b, _ in self.__internals:
            keep_read.update((read_a, read_b))
    else:
        self.__internals = dict()

    if self.__containment:
        for contained, containers in self.__containments.items():
            keep_read.add(contained)
            keep_read.update(container for container, *_ in containers)
    else:
        # Reset to the same mapping type the constructor creates; a plain
        # dict would raise KeyError on a later append to a missing key.
        self.__containments = defaultdict(list)

    # The key-view difference is a new set, so popping while looping is safe.
    for read in self.__segments.keys() - keep_read:
        self.__segments.pop(read, 0)
@staticmethod
def __line_type_correction(l):
......@@ -279,5 +306,7 @@ def main(args):
for line in arg["paf"]:
parser.parse_line(line)
parser.clean_data()
for line in parser.generate_gfa():
arg["gfa"].write(line+"\n")
......@@ -55,7 +55,7 @@ def test_A_contained_B_keep_contain_diff():
# --------->
def test_A_contain_B_leave_contain_same():
line = "1\t2000\t500\t1500\t+\t2\t1000\t0\t1000\t30\t1000\t255"
resu = "H\tVN:Z:1.0\nS\t1\t*\tLN:i:2000\n"
resu = "H\tVN:Z:1.0\n"
p = paf2gfa.Parser(False)
p.parse_line(line)
......@@ -67,7 +67,7 @@ def test_A_contain_B_leave_contain_same():
# <---------
def test_A_contain_B_leave_contain_diff():
line = "1\t2000\t500\t1500\t-\t2\t1000\t0\t1000\t30\t1000\t255"
resu = "H\tVN:Z:1.0\nS\t1\t*\tLN:i:2000\n"
resu = "H\tVN:Z:1.0\n"
p = paf2gfa.Parser(False)
p.parse_line(line)
......@@ -79,7 +79,7 @@ def test_A_contain_B_leave_contain_diff():
# --------------------------->
def test_A_contained_B_leave_contain_same():
line = "2\t1000\t0\t1000\t+\t1\t2000\t500\t1500\t30\t1000\t255"
resu = "H\tVN:Z:1.0\nS\t1\t*\tLN:i:2000\n"
resu = "H\tVN:Z:1.0\n"
p = paf2gfa.Parser(False)
p.parse_line(line)
......@@ -91,7 +91,7 @@ def test_A_contained_B_leave_contain_same():
# <---------------------------
def test_A_contained_B_leave_contain_diff():
line = "2\t1000\t0\t1000\t-\t1\t2000\t500\t1500\t30\t1000\t255"
resu = "H\tVN:Z:1.0\nS\t1\t*\tLN:i:2000\n"
resu = "H\tVN:Z:1.0\n"
p = paf2gfa.Parser(False)
p.parse_line(line)
......
import pytest
import paf2gfa
def test_A_overlap_B_B_contained_C_keep_all():
    """A default Parser emits all three segments, the B/A link and the B/C containment."""
    paf_records = """
B 8853 7897 8500 - A 25804 24891 25773 150 882 255 cm:i:16
B 8853 5997 8553 - C 2962 80 2901 511 2821 255 cm:i:76
"""
    expected_gfa = """
H VN:Z:1.0
S B * LN:i:8853
S A * LN:i:25804
S C * LN:i:2962
L B + A - 882M NM:i:732 om:f:0.44
C B + C - 5997 2821M
"""
    parser = paf2gfa.Parser()
    parser.parse_lines(paf_records.split("\n"))
    # Compare with surrounding whitespace removed from both sides.
    assert expected_gfa.strip() == parser.get_gfa().strip()
def test_A_overlap_B_B_contained_C_leave_all():
    """With both Parser flags False, only segments B and A and their link are emitted."""
    paf_records = """
B 8853 7897 8500 - A 25804 24891 25773 150 882 255 cm:i:16
B 8853 5997 8553 - C 2962 80 2901 511 2821 255 cm:i:76
"""
    expected_gfa = """
H VN:Z:1.0
S B * LN:i:8853
S A * LN:i:25804
L B + A - 882M NM:i:732 om:f:0.44
"""
    parser = paf2gfa.Parser(False, False)
    parser.parse_lines(paf_records.split("\n"))
    # Compare with surrounding whitespace removed from both sides.
    assert expected_gfa.strip() == parser.get_gfa().strip()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment