From e2b49433de14368a1d6f5ac4449e1e48c8b7c1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Laurent?= <clemence@opensquare.eu> Date: Wed, 4 Jan 2023 12:00:18 +0100 Subject: [PATCH] debug sentence splitting python --- alSentenceSplitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alSentenceSplitting.py b/alSentenceSplitting.py index f39b5c3..11678cf 100644 --- a/alSentenceSplitting.py +++ b/alSentenceSplitting.py @@ -119,7 +119,7 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a s = re.sub(r"(?<!TA_TEXTUAL_PONCT|_META_TEXTUAL_GN)( ?)(\{[^\}]*\} _META_TEXTUAL[A-Z_]+)", r'</s>\1<s type="li">\2', s) # attention - if re.search(r"^(ja|zh|th|km)$", lang): + if re.search(r"^(ja|zh|th|km)$", lang) is None: match_pattern = re.compile(r"^((?:[^\"“â€]*[\"“â€\Ë][^\"“â€]*[\"“â€\Ë])*[^\"“â€]*[\.;\?\!])( )([\"“â€\Ë])") match_check = match_pattern.search(s) while match_check is not None: # attention -- GitLab