From 766a09a6c18ec4682934f0de17db2b86f37e44bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Laurent?= <clemence@opensquare.eu> Date: Wed, 14 Dec 2022 10:40:44 +0100 Subject: [PATCH] maj python --- alRawTextCleaning.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/alRawTextCleaning.py b/alRawTextCleaning.py index 15c82f0..9cd133f 100644 --- a/alRawTextCleaning.py +++ b/alRawTextCleaning.py @@ -12,11 +12,13 @@ def altok_escape_metacharacters(s): s = re.sub(r"([\\Â {}])", r"\\\1", s) return s -def altok_escape_xml(s): - s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s) +def altok_escape_xml(s, do_not_create_XML_special_tokens = 0): + if do_not_create_XML_special_tokens == 0: + s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s) + s = re.sub(r"&", r"&", s) s = re.sub(r"<", r"<", s) s = re.sub(r">", r">", s) @@ -24,6 +26,10 @@ def altok_escape_xml(s): s = re.sub(r"(?=\\)(\{[^\{}]*)&", r"\1&", s) s = re.sub(r"(?=\\)(\{[^\{}]*)<", r"\1<", s) s = re.sub(r"(?=\\)(\{[^\{}]*)>", r"\1>", s) + + if do_not_create_XML_special_tokens == 0: + if re.search(r"_XML", s): + s = re.sub(r"\{([^{}]+)\} _XML", lambda x : "{" + _double_whitespaces(x.group(1)) + "} _XML", s) return s def altok_interpret_entities(s): @@ -34,5 +40,9 @@ def altok_interpret_entities(s): #s = s.encode("utf-8").decode("utf-8") return s +def _double_whitespaces(s): + s = re.sub(r" ", r" ", s) + return s + \ No newline at end of file -- GitLab