From 766a09a6c18ec4682934f0de17db2b86f37e44bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Laurent?= <clemence@opensquare.eu>
Date: Wed, 14 Dec 2022 10:40:44 +0100
Subject: [PATCH] maj python

---
 alRawTextCleaning.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/alRawTextCleaning.py b/alRawTextCleaning.py
index 15c82f0..9cd133f 100644
--- a/alRawTextCleaning.py
+++ b/alRawTextCleaning.py
@@ -12,11 +12,13 @@ def altok_escape_metacharacters(s):
 	s = re.sub(r"([\\ {}])", r"\\\1", s)
 	return s
 
-def altok_escape_xml(s):
-	s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s)
-	s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s)
-	s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s)
-	s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s)
+def altok_escape_xml(s, do_not_create_XML_special_tokens = 0):
+	if do_not_create_XML_special_tokens == 0:
+		s = re.sub(r"^\s*(<[\!\?]?[\w\.:_-]+(?: .*?)?\/?>(?:.*<\/[\w\.:_-]+>)?)\s*$", r"{\1} _XML", s)
+		s = re.sub(r"^\s*((?:<\/?[\w\.:_-]+[^>]*\/?>)+)\s*$", r"{\1} _XML", s)
+		s = re.sub(r"^\s*(<\!--[^>]+>)\s*$", r"{\1} _XML", s)
+		s = re.sub(r"^\s*(<\/[\w\.:_-]+>)\s*$", r"{\1} _XML", s)
+
 	s = re.sub(r"&", r"&amp;", s)
 	s = re.sub(r"<", r"&lt;", s)
 	s = re.sub(r">", r"&gt;", s)
@@ -24,6 +26,10 @@ def altok_escape_xml(s):
 	s = re.sub(r"(?=\\)(\{[^\{}]*)&amp;", r"\1&", s)
 	s = re.sub(r"(?=\\)(\{[^\{}]*)&lt;", r"\1<", s)
 	s = re.sub(r"(?=\\)(\{[^\{}]*)&gt;", r"\1>", s)
+
+	if do_not_create_XML_special_tokens == 0:
+		if re.search(r"_XML", s):
+			s = re.sub(r"\{([^{}]+)\} _XML", lambda x : "{" + _double_whitespaces(x.group(1)) + "} _XML", s)
 	return s
 
 def altok_interpret_entities(s):
@@ -34,5 +40,9 @@ def altok_interpret_entities(s):
 	#s = s.encode("utf-8").decode("utf-8")
 	return s
 
+def _double_whitespaces(s):  # helper: return s with every single space character replaced by two spaces
+	s = re.sub(r" ", r"  ", s)  # pattern is the literal space only; tabs and newlines are left untouched
+	return s
+
 
 	
\ No newline at end of file
-- 
GitLab