diff --git a/alSentenceSplitting.py b/alSentenceSplitting.py index 3c0479293511101329df86e14ef7cd73722d15a3..3c48289a11e3c9c68366edbbb3de35165ed85254 100644 --- a/alSentenceSplitting.py +++ b/alSentenceSplitting.py @@ -116,10 +116,10 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes #s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) - s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_\{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) + s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) #s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE - s = re.sub(rf"([^\.0-9][\} ](?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1 + s = re.sub(rf"([^\.0-9][}} ](?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1 s = re.sub(rf"([^\.][0-9](?:\.(?: \.)+|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 2 s = re.sub(rf"^ ([^\.][0-9]\.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 3 s = re.sub(rf"([\.;\!\?…] [0-9]+ \.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 4