Commit 88a24aec authored by MARIJON Pierre's avatar MARIJON Pierre

Two PAF lines for the same read pair on the same strands now produce a single GFA L record:

If we try to add a GFA line with the same (read_a, strand_a, read_b, strand_b)
as a previously added GFA line, we keep the GFA line with:
- the longest overlap, or
- fewer errors, or
- the smaller overhang length / mapping length ratio

close #5
parent b5abe331
......@@ -21,7 +21,7 @@ class Parser:
self.__internal = internal
self.__segments = OrderedDict()
self.__links = list()
self.__links = dict()
self.__containments = OrderedDict()
self.__first_meet = dict()
......@@ -168,7 +168,7 @@ class Parser:
continue
yield "S\t{}\t*\tLN:i:{}".format(seq, length)
for link in self.__links:
for link in self.__links.values():
yield "L\t" + "\t".join(link)
for conted, list_conter in self.__containments.items():
......@@ -187,9 +187,32 @@ class Parser:
self.__segments[name] = length
def _add_link(self, name_a, strand_a, name_b, strand_b, ov_len, nb_match, overhang_maplen):
    """Record a GFA link (L record), keeping one entry per read/strand pair.

    Links are stored in ``self.__links`` keyed by
    ``(name_a, strand_a, name_b, strand_b)``. When a link with the same key
    is already stored, the new one replaces it if, checked in this order:

    1. the new overlap is longer, or
    2. the new overlap has fewer errors (``ov_len - nb_match``), or
    3. the new overhang / mapping-length ratio is lower.

    Every duplicate is reported through ``logging.info``; the message ends
    with the criterion that caused the replacement, or with nothing when
    the stored link is kept.

    Args:
        name_a, strand_a, name_b, strand_b: fields identifying the link;
            they are joined as strings when the record is written/logged.
        ov_len: overlap length as a string of digits (it is concatenated
            with "M" and converted with ``int``).
        nb_match: number of matching bases, any value ``int`` accepts.
        overhang_maplen: overhang / mapping-length ratio as a float,
            stored formatted with two decimals.
    """
    signature = (name_a, strand_a, name_b, strand_b)
    new_len = int(ov_len)
    new_errors = new_len - int(nb_match)
    value = (name_a, strand_a, name_b, strand_b, ov_len + "M",
             "NM:i:" + str(new_errors),
             "om:f:{:.2f}".format(overhang_maplen))

    if signature not in self.__links:
        self.__links[signature] = value
        return

    # This overlap was seen before: decode the stored record so it can be
    # compared with the new one.
    current = self.__links[signature]
    current_len = int(current[4][:-1])     # drop the trailing "M"
    current_errors = int(current[5][5:])   # drop the "NM:i:" prefix
    current_ratio = float(current[6][5:])  # drop the "om:f:" prefix

    # NOTE: the criteria are tried in order and the first that favours the
    # new link wins, so a shorter overlap with fewer errors replaces a
    # longer one. The stored ratio has been rounded to two decimals while
    # the incoming one is compared unrounded.
    if current_len < new_len:
        reason = "keep longuest"
    elif current_errors > new_errors:
        reason = "keep less error"
    elif current_ratio > overhang_maplen:
        reason = "keep lowest overhang/maplen"
    else:
        reason = ""

    if reason:
        self.__links[signature] = value

    # The reason is appended to the message (the previous code built the
    # concatenation and discarded it, so the reason was never logged).
    logging.info(" ".join(value) + " are duplicate " + reason)
def _add_containment(self, container, strand_ner, contained, strand_ned, pos, length):
if contained not in self.__containments:
......
......@@ -26,7 +26,7 @@ L 3 + 4 + 8000M NM:i:0 om:f:0.00
p = paf2gfa.Parser()
p.parse_lines(line.split("\n"))
assert resu.strip() == p.get_gfa().strip()
assert set(resu.strip().split("\n")) == set(p.get_gfa().strip().split("\n"))
def test_with_2_repetition_strand_diff():
line = """
......@@ -65,4 +65,4 @@ L 6 + 5 - 8000M NM:i:0 om:f:0.00
p = paf2gfa.Parser()
p.parse_lines(line.split("\n"))
assert resu.strip() == p.get_gfa().strip()
assert set(resu.strip().split("\n")) == set(p.get_gfa().strip().split("\n"))
import pytest
import paf2gfa
# B is contain in A
# --------------------------->
# --------->
def test_duplication():
    """Duplicated PAF records for one read pair must give a single GFA link.

    The input holds the same A/B record twice plus the B/A record of the
    same overlap; the expected GFA contains the header, both segments and
    exactly one L line.
    """
    paf_records = """
A\t1000\t200\t1000\t+\tB\t1000\t0\t800\t800\t800\t255
A\t1000\t200\t1000\t+\tB\t1000\t0\t800\t800\t800\t255
B\t1000\t0\t800\t+\tA\t1000\t200\t1000\t800\t800\t255
"""
    expected_gfa = """
H\tVN:Z:1.0
S\tA\t*\tLN:i:1000
S\tB\t*\tLN:i:1000
L\tA\t+\tB\t+\t800M\tNM:i:0\tom:f:0.00
"""
    parser = paf2gfa.Parser()
    parser.parse_lines(paf_records.split("\n"))

    produced_gfa = parser.get_gfa()
    assert expected_gfa.strip() == produced_gfa.strip()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment