Port changes previously made to the Perl (.pm) file to the Python version

1a7279ca · Clémence Laurent · 8da38e6b · 1a7279ca
Commit 1a7279ca authored 2 years ago by Clémence Laurent
--- a/alSentenceSplitting.py
+++ b/alSentenceSplitting.py
@@ -52,15 +52,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 	s = re.sub(r"&lt; *br *(?:\/ *)?&gt;", r'</s><s  type="br">', s)
-	if re.search(r"^(ja|zh|tw)$", lang):
+	if re.search(r"^(ja|zh|tw)(_|$)", lang):
 		s = re.sub(r"([①-⑫●※＊■◆→⇒◇◎★☆〇·•])", r"<l/>\1", s)
 		s = re.sub(r"(?<![1-9１-９〇零一二三四五六七八九○十百千萬億兆万亿])( +)(○ +)(?![1-9１-９〇零一二三四五六七八九○十百千萬億兆万亿])", r"\1<l/>\2", s)
 		s = re.sub(r"([\（\(] *[1-9１-９] *[\）\)])", r"<l/>\1", s)
 		s = re.sub(r"(?<![\（\(A-Za-z] )(?<![0-9１-９A-Za-z])([1-9１-９]+ +[\.\)、\）。，：:])", r"<l/>\1", s)
 		s = re.sub(r"(^|。)( +)((?:第 )?[一二] [\.\)、\）。，：:])", r"\1\2<l/>\3", s)
-	elif re.search(r"^th$", lang):
+	elif re.search(r"^th(_|$)", lang):
 		s = re.sub(r"(^|<br\/?>)( +)(๑๐|๑?[๑๒๓๔๕๖๗๘๙])( \.)", r"\1\2</l>\3 \4", s)
-	elif lang == "km":
+	elif re.search(r"^km(_|$)", lang):
 		s = re.sub(r"([\P{Latin}\)][ ]*)(-[ ]*\P{Latin})", r"\1</l>\2", s)
 		s = re.sub(r"([\P{Latin}\)][ ]*)(\d+ ?[,\-][ ]*\P{Latin})", r"\1</l>\2", s)
 	else:
@@ -84,31 +84,32 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 	#un ajout d'espace ici en faisant le process de trad
 	s = re.sub(r'<s  type="br"></s>( *)<s  type="li">', r'\1<s  type="br,li">', s)
-	if re.search(r"^(ja|zh)$", lang):
+	if re.search(r"^(ja|zh)(_|$)", lang):
-		s = re.sub(r" ([！？\!\?。｡])( +)", r" \1</s>\2<s>", s)
+		s = re.sub(r" ([！？\!\?。｡\.])( +)", r" \1</s>\2<s>", s)
 		if weak_sbound > 0:
 			s = re.sub(r" ([：；:;]|)( +)", r" \1</s>\2<s>", s)
-	elif lang == "km":
+	elif re.search(r"^km(_|$)", lang):
 		s = re.sub(r" ([។៕])( +)", r" \1</s>\2<s>", s)
-	elif lang == "th":
+	elif re.search(r"^th(_|$)", lang):
 		pass
 	else:
-		s = re.sub(rf"([\.:;\?\!])  ([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])  ({maj.pattern})", r"\1</s>  <s>\2</s>  <s>\3", s) # detection of sentences entirely surrounded by double quotes
+		s = re.sub(rf"([…\.:;\?\!])(  +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])(  +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes
-		s = re.sub(rf"([\.:;\?\!])  ([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝]) $", r"\1</s>  <s>\2</s> ", s) # detection of sentences entirely surrounded by double quotes
+		s = re.sub(rf"([…\.:;\?\!])(  +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes
-		s = re.sub(rf"(\.(?: \.)*)( +)(\( \. \. \. \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
+		s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
-		s = re.sub(rf"([^\.][0-9}} ]\.(?: \.)*)(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
+		s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
 		s = re.sub(rf"({maj.pattern}{l.pattern}+  {maj.pattern} \.)</s>  <s>({maj.pattern}{l.pattern}+ )", r"\1  \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
-		s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
+		s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
 		s = re.sub(rf"( {strictmaj.pattern} \.)</s>  <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
 		s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s>  <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
+		s = re.sub(rf"( {minus.pattern} \.)</s>  <s>({minus.pattern} \.  )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
 		s = re.sub(rf"({l.pattern}|[\!\?])( +)(\. \. \.)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # used to use $l instead of $maj
 		s = re.sub(rf"(\.  \.(?: \.)*)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s)
-		s = re.sub(rf"(\. (?: \.)+)(  )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
+		s = re.sub(rf"(\. (?: \.)+)(  +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
 		s = re.sub(r"(\. (?: \.)+)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s)  # attention !!!
-		s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)(  )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
+		s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)(  +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
 		s = re.sub(r"([\?\!](?: ?(?: \.)+)?)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!!
-		s = re.sub(r"([\.\?\!] +\.(?: \.)+)(  )", r"\1</s>\2<s>", s) # attention
+		s = re.sub(r"([\.\?\!] +\.(?: \.)+)(  +)", r"\1</s>\2<s>", s) # attention
 		s = re.sub(r"([\.\?\!,:])(  )([\-\+\«¿¡])", r"\1</s>\2<s>\3", s) # attention
 		if weak_sbound > 0: # if $weak_sbound, colons are sentence boundaries