diff --git a/alSentenceSplitting.pm b/alSentenceSplitting.pm index 5ae9b076b4ac558c51303070dd703b85e3921ae8..0a8e946b26b805d4c0c7ded61898f9043b09edfc 100644 --- a/alSentenceSplitting.pm +++ b/alSentenceSplitting.pm @@ -96,12 +96,7 @@ sub altok_split_sentences { $s =~ s/^( +)<l\/>/$1<s type=\"li\">/g; $s =~ s/([^ ])( +)<l\/>/$1<\/s>$2<s type=\"li\">/g; $s =~ s/<l\/>//g; -<<<<<<< Updated upstream - -======= - ->>>>>>> Stashed changes #TODO check, j'ai ajouté un espace entre <s et type="br,li" pour bien detokeniser ensuite $s =~ s/<s type="br"><\/s>( *)<s type="li">/$1<s type="br,li">/g; diff --git a/alSentenceSplitting.py b/alSentenceSplitting.py index 444c5f1da96d02b3755588155b761f40350eae00..3c0479293511101329df86e14ef7cd73722d15a3 100644 --- a/alSentenceSplitting.py +++ b/alSentenceSplitting.py @@ -31,9 +31,9 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a maj = re.compile(r"(?:[A-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÃÃÒØÌÆŒ])") l = re.compile(r"(?:[æœà åâäãéêèëîïöôùûüÿçøóúÃáòìa-zA-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÃÃÒØÌÆŒ])") elif re.search(r"^(pl|cs|sk|ro|sl|hr|sr|sc|bn|tr|fa|ckb)$", lang): - minus = re.compile(r"(?:[a-záäąćÄÄéęěëÃĺľłńňóôöŕřśšťúůüýźż])") - maj = re.compile(r"(?:[A-ZÃÄĄĆČĎÉĘĚËÃĹÅĽŃŇÓÔÖŔŘŚŠŤÚŮÜÃŹŻ])") - l = re.compile(r"(?:[a-záäąćÄÄéęěëÃĺľłńňóôöŕřśšťúůüýźżA-ZÃÄĄĆČĎÉĘĚËÃĹÅĽŃŇÓÔÖŔŘŚŠŤÚŮÜÃŹŻ ])") + minus = re.compile(r"(?:[a-záäąćÄçÄéęěëÃĺľłńňóôöŕřśšşťúůüýźż])") + maj = re.compile(r"(?:[A-ZÃÄĄĆČÇĎÉĘĚËÃİĹÅĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÃŹŻ])") + l = re.compile(r"(?:[a-záäąćÄÄéęěëÃĺľłńňóôöŕřśšťúůüýźżA-ZÃÄĄĆČÇĎÉĘĚËÃİĹÅĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÃŹŻ ])") elif re.search(r"^(ru|uk|bg|bl|kk|bxr)$", lang): minus = re.compile(r"(?:[a-zабвгдежзийклмнопрÑтуфхцчшщÑÑŽÑыьёү])") maj = re.compile(r"(?:[A-ZÐБВГДЕЖЗИЙКЛМÐОПРСТУФХЦЧШЩÐЮЯЫЬÐÒ®])") @@ -79,6 +79,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\.\)\]\/\]])(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\-\Â] )(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte + if re.search(r"^ \d [,\.] ", s) is not None: + match_pattern = re.compile(r"([\.;\!] )(\d ,)(?= +[^0-9 ])") + match_check = match_pattern.search(s) + while match_check is not None: + s = match_pattern.sub(r"\1<l/>\2", s) # [début de ligne] 1, texte … 2, texte + match_check = match_pattern.search(s) + + s = re.sub(r"(\. +)(・)", r"\1<l/>\2", s) + s = re.sub(r"^( +)<l\/>", r'\1<s type="li">', s) s = re.sub(r"([^ ])( +)<l\/>", r'\1</s>\2<s type="li">', s) s = re.sub(r"<l\/>", r"", s) @@ -87,7 +96,16 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s) if re.search(r"^(ja|zh)(_|$)", lang): - s = re.sub(r" ([ï¼ï¼Ÿ\!\?。。\.])( +)", r" \1</s>\2<s>", s) + s = re.sub(r" ([ï¼ï¼Ÿ\!\?。。\.](?: [ï¼ï¼Ÿ\!\?。。\.])*)( +)", r" \1</s>\2<s>", s) + s = re.sub(r"([ï¼ï¼Ÿ\!\?。。\.])<\/s>( +)<s>([()\(\)])", r"\1\2\3", s) + + if re.search(r' <s type="li">\d [。。\.]<\/s>', s) is not None: + match_pattern = re.compile(r'(<s type="li">\d [。。\.])<\/s>( +)<s>') + match_check = match_pattern.search(s) + while match_check is not None: + s = match_pattern.sub(r"\1\2", s) + match_check = match_pattern.search(s) + if weak_sbound > 0: s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s) elif re.search(r"^km(_|$)", lang): @@ -97,23 +115,35 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a else: s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes s = re.sub(rf"([…\.:;\?\!])( +)([\"“â€\Ë] {maj.pattern}[^\"“â€\Ë<>]*[\.:;\?\!] [\"“â€\Ë])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes - s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) - s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE + #s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) + s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_\{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) + #s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE + + s = re.sub(rf"([^\.0-9][\} ](?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1 + s = re.sub(rf"([^\.][0-9](?:\.(?: \.)+|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 2 + s = re.sub(rf"^ ([^\.][0-9]\.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 3 + s = re.sub(rf"([\.;\!\?…] [0-9]+ \.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 4 + s = re.sub(rf"([^\s>0-9][0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5 + s = re.sub(rf"([^\.;\?\!…] [0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5 + s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom) -<<<<<<< Updated upstream - s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) -======= - s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) + s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r|e \. +g) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) + if lang == "pl": - s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres) + s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres) + elif lang == "ru": - s = re.sub(rf"\b((?:Т \. +е) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres) + s = re.sub(rf"\b((?:Т \. +е) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres) + elif lang == "es": - s = re.sub(rf"\b((?:Ej) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction else if lang == "de": - s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres) + s = re.sub(rf"\b((?:Ej) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Ej. et autres) + + elif lang == "de": + s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres) + elif lang == "nl": - s = re.sub(rf"\b((?:bv) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres) ->>>>>>> Stashed changes + s = re.sub(rf"\b((?:bv) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres) + s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!) s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.) s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)