diff --git a/alSentenceSplitting.py b/alSentenceSplitting.py index 40af67d2f54af05d75eaafaaa648e5105469cd85..70976f582956135a004ed723959af7344622939f 100644 --- a/alSentenceSplitting.py +++ b/alSentenceSplitting.py @@ -52,15 +52,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a s = re.sub(r"< *br *(?:\/ *)?>", r'</s><s type="br">', s) - if re.search(r"^(ja|zh|tw)$", lang): + if re.search(r"^(ja|zh|tw)(_|$)", lang): s = re.sub(r"([â‘ -â‘«â—※*■◆→⇒◇◎★☆〇·•])", r"<l/>\1", s) s = re.sub(r"(?<![1-91-9〇零一二三四五å…七八ä¹â—‹å百åƒè¬å„„兆万亿])( +)(â—‹ +)(?![1-91-9〇零一二三四五å…七八ä¹â—‹å百åƒè¬å„„兆万亿])", r"\1<l/>\2", s) s = re.sub(r"([\(\(] *[1-91-ï¼™] *[\)\)])", r"<l/>\1", s) s = re.sub(r"(?<![\(\(A-Za-z] )(?<![0-91-ï¼™A-Za-z])([1-91-ï¼™]+ +[\.\)ã€\)。,::])", r"<l/>\1", s) s = re.sub(r"(^|。)( +)((?:第 )?[一二] [\.\)ã€\)。,::])", r"\1\2<l/>\3", s) - elif re.search(r"^th$", lang): + elif re.search(r"^th(_|$)", lang): s = re.sub(r"(^|<br\/?>)( +)(๑à¹|๑?[๑๒๓๔๕๖๗๘๙])( \.)", r"\1\2</l>\3 \4", s) - elif lang == "km": + elif re.search(r"^km(_|$)", lang): s = re.sub(r"([\P{Latin}\)][ ​]*)(-[ ​]*\P{Latin})", r"\1</l>\2", s) s = re.sub(r"([\P{Latin}\)][ ​]*)(\d+ ?[,\-][ ​]*\P{Latin})", r"\1</l>\2", s) else: @@ -84,31 +84,32 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a #un ajout d'espace ici en faisant le process de trad s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s) - if re.search(r"^(ja|zh)$", lang): - s = re.sub(r" ([ï¼ï¼Ÿ\!\?。。])( +)", r" \1</s>\2<s>", s) + if re.search(r"^(ja|zh)(_|$)", lang): + s = re.sub(r" ([ï¼ï¼Ÿ\!\?。。\.])( +)", r" \1</s>\2<s>", s) if weak_sbound > 0: s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s) - elif lang == "km": + elif re.search(r"^km(_|$)", lang): s = re.sub(r" ([។៕])( +)", r" \1</s>\2<s>", s) - elif lang == "th": + elif re.search(r"^th(_|$)", lang):: pass else: - s = re.sub(rf"([\.:;\?\!]) ([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë]) ({maj.pattern})", r"\1</s> <s>\2</s> <s>\3", s) # detection of sentences entirely surrounded by double quotes - s = re.sub(rf"([\.:;\?\!]) ([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë]) $", r"\1</s> <s>\2</s> ", s) # detection of sentences entirely surrounded by double quotes - s = re.sub(rf"(\.(?: \.)*)( +)(\( \. \. \. \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) - s = re.sub(rf"([^\.][0-9}} ]\.(?: \.)*)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE + s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes + s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes + s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) + s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom) - s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) + s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!) s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.) + s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.) s = re.sub(rf"({l.pattern}|[\!\?])( +)(\. \. \.)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # used to use $l instead of $maj s = re.sub(rf"(\. \.(?: \.)*)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) - s = re.sub(rf"(\. (?: \.)+)( )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! + s = re.sub(rf"(\. (?: \.)+)( +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(r"(\. (?: \.)+)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!! - s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)( )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! + s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)( +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(r"([\?\!](?: ?(?: \.)+)?)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!! - s = re.sub(r"([\.\?\!] +\.(?: \.)+)( )", r"\1</s>\2<s>", s) # attention + s = re.sub(r"([\.\?\!] +\.(?: \.)+)( +)", r"\1</s>\2<s>", s) # attention s = re.sub(r"([\.\?\!,:])( )([\-\+\«¿¡])", r"\1</s>\2<s>\3", s) # attention if weak_sbound > 0: # if $weak_sbound, colons are sentence boundaries