Commit 638c05d5 authored by MARIJON Pierre

Add possibility to remove internal matches

parent 1a99174d
......@@ -15,27 +15,42 @@ import networkx as nx
class Parser:
def __init__(self, remove=False):
def __init__(self, containment=True, internal=True):
self.__cores = ["read_a", "len_a", "beg_a", "end_a", "strand",
"read_b", "len_b", "beg_b", "end_b",
"nb_match", "nb_base", "qual"]
self.__remove = remove
self.__containment = containment
self.__internal = internal
self.__segments = OrderedDict()
self.__links = nx.DiGraph()
self.__containments = OrderedDict()
self.__first_meet = dict()
@property
def remove(self):
return self.__remove
def containment(self):
return self.__containment
@containment.setter
def containment(self, value):
self.__containment = value
@containment.getter
def containment(self):
return self.__containment
@property
def internal(self):
return self.__internal
@internal.setter
def internal(self, value):
self.__internal = value
@remove.setter
def remove(self, value):
self.__remove = value
@internal.getter
def internal(self):
return self.__internal
@remove.getter
def remove(self):
return self.__remove
@property
def segments(self):
......@@ -59,14 +74,22 @@ class Parser:
maplen = max(l.end_a - l.beg_a, l.end_b - l.beg_b)
logging.debug("name A {} name B {}, beg_A {} beg_B {}".format(l.read_a, l.read_b, l.beg_a, l.beg_b))
strange_ov = False
internal_match = False
if overhang > maplen*0.8:
# Strange overlap
logging.debug("Strange Overlap between {} {}".format(l.read_a, l.read_b))
logging.debug("Internal match between {} {}".format(l.read_a, l.read_b))
logging.debug("overhang {} maplen*x {}".format(overhang, maplen*0.8))
strange_ov = True
internal_match = True
if overhang < 1000 and not strange_ov and l.beg_a <= l.beg_b and l.len_a - l.end_a < l.len_b - l.end_b:
if internal_match and not self.__internal:
# We add segment but we can remove if we want
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
logging.debug("We didn't keep internal match between", l.read_a, l.read_b)
return
if overhang < 1000 and not internal_match and l.beg_a <= l.beg_b and l.len_a - l.end_a < l.len_b - l.end_b:
# B contains A
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
......@@ -77,7 +100,7 @@ class Parser:
else:
self._add_containment(l.read_b, "-", l.read_a, "+", l.beg_b,
l.nb_base)
elif overhang < 1000 and not strange_ov and l.beg_a >= l.beg_b and l.len_a - l.end_a > l.len_b - l.end_b:
elif overhang < 1000 and not internal_match and l.beg_a >= l.beg_b and l.len_a - l.end_a > l.len_b - l.end_b:
# A contains B
self._add_segment(l.read_a, l.len_a)
self._add_segment(l.read_b, l.len_b)
......@@ -114,7 +137,7 @@ class Parser:
def generate_gfa(self):
for seq, length in self.__segments.items():
if self.__remove and seq in self.__containments:
if not self.__containment and seq in self.__containments:
continue
yield "S\t{}\t*\tLN:i:{}".format(seq, length)
......@@ -199,15 +222,19 @@ def main(args):
parser.add_argument('paf', type=argparse.FileType('r'))
parser.add_argument('gfa', type=argparse.FileType('w'))
parser.add_argument("-r", "--remove-all-containement", action='store_true',
parser.add_argument("-c", "--remove-all-containement", action='store_true',
help="Remove all containement")
parser.add_argument("-i", "--remove-all-internal", action='store_true',
help="Remove all internal match")
arg = vars(parser.parse_args(args))
paf = arg["paf"].name
gfa = arg["gfa"].name
rm_all_contain = arg["remove_all_containement"]
rm_all_internal = arg["remove_all_internal"]
parser = Parser(rm_all_contain)
parser = Parser(not rm_all_contain, not rm_all_internal)
for line in arg["paf"]:
parser.parse_line(line)
......
......@@ -57,7 +57,7 @@ def test_A_contain_B_leave_contain_same():
line = "1\t2000\t500\t1500\t+\t2\t1000\t0\t1000\t30\t1000\t255"
resu = "S\t1\t*\tLN:i:2000\nC\t1\t+\t2\t+\t500\t1000M\n"
p = paf2gfa.Parser(True)
p = paf2gfa.Parser(False)
p.parse_line(line)
assert resu == p.get_gfa()
......@@ -69,7 +69,7 @@ def test_A_contain_B_leave_contain_diff():
line = "1\t2000\t500\t1500\t-\t2\t1000\t0\t1000\t30\t1000\t255"
resu = "S\t1\t*\tLN:i:2000\nC\t1\t+\t2\t-\t500\t1000M\n"
p = paf2gfa.Parser(True)
p = paf2gfa.Parser(False)
p.parse_line(line)
assert resu == p.get_gfa()
......@@ -81,7 +81,7 @@ def test_A_contained_B_leave_contain_same():
line = "2\t1000\t0\t1000\t+\t1\t2000\t500\t1500\t30\t1000\t255"
resu = "S\t1\t*\tLN:i:2000\nC\t1\t+\t2\t+\t500\t1000M\n"
p = paf2gfa.Parser(True)
p = paf2gfa.Parser(False)
p.parse_line(line)
assert resu == p.get_gfa()
......@@ -93,7 +93,7 @@ def test_A_contained_B_leave_contain_diff():
line = "2\t1000\t0\t1000\t-\t1\t2000\t500\t1500\t30\t1000\t255"
resu = "S\t1\t*\tLN:i:2000\nC\t1\t+\t2\t-\t500\t1000M\n"
p = paf2gfa.Parser(True)
p = paf2gfa.Parser(False)
p.parse_line(line)
assert resu == p.get_gfa()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment