diff --git a/alRawTextCleaning.py b/alRawTextCleaning.py index 15c82f03a731ba23ada1939bfbecc5db7dbf3a84..9cd133fac231f638a6023801867684f52a06ef68 100644 --- a/alRawTextCleaning.py +++ b/alRawTextCleaning.py @@ -12,11 +12,13 @@ def altok_escape_metacharacters(s): s = re.sub(r"([\\Â {}])", r"\\\1", s) return s -def altok_escape_xml(s): - s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s) - s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s) +def altok_escape_xml(s, do_not_create_XML_special_tokens = 0): + if do_not_create_XML_special_tokens == 0: + s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s) + s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s) + s = re.sub(r"&", r"&", s) s = re.sub(r"<", r"<", s) s = re.sub(r">", r">", s) @@ -24,6 +26,10 @@ def altok_escape_xml(s): s = re.sub(r"(?=\\)(\{[^\{}]*)&", r"\1&", s) s = re.sub(r"(?=\\)(\{[^\{}]*)<", r"\1<", s) s = re.sub(r"(?=\\)(\{[^\{}]*)>", r"\1>", s) + + if do_not_create_XML_special_tokens == 0: + if re.search(r"_XML", s): + s = re.sub(r"\{([^{}]+)\} _XML", lambda x : "{" + _double_whitespaces(x.group(1)) + "} _XML", s) return s def altok_interpret_entities(s): @@ -34,5 +40,9 @@ def altok_interpret_entities(s): #s = s.encode("utf-8").decode("utf-8") return s +def _double_whitespaces(s): + s = re.sub(r" ", r" ", s) + return s + \ No newline at end of file