Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8a82d73f authored by Benoît Sagot
Browse files

debug

parents 2343c5fe 6e81e29f
No related branches found
No related tags found
No related merge requests found
......@@ -31,9 +31,9 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
maj = re.compile(r"(?:[A-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
l = re.compile(r"(?:[æœàåâäãéêèëîïöôùûüÿçøóúíáòìa-zA-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
elif re.search(r"^(pl|cs|sk|ro|sl|hr|sr|sc|bn|tr|fa|ckb)$", lang):
minus = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźż])")
maj = re.compile(r"(?:[A-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ])")
l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ ])")
minus = re.compile(r"(?:[a-záäąćčçďéęěëíĺľłńňóôöŕřśšşťúůüýźż])")
maj = re.compile(r"(?:[A-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ])")
l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ ])")
elif re.search(r"^(ru|uk|bg|bl|kk|bxr)$", lang):
minus = re.compile(r"(?:[a-zабвгдежзийклмнопрстуфхцчшщэюяыьёү])")
maj = re.compile(r"(?:[A-ZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯЫЬЁҮ])")
......@@ -79,6 +79,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\.\)\]\/\]])(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\-\­] )(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
if re.search(r"^ \d [,\.] ", s) is not None:
match_pattern = re.compile(r"([\.;\!] )(\d ,)(?= +[^0-9 ])")
match_check = match_pattern.search(s)
while match_check is not None:
s = match_pattern.sub(r"\1<l/>\2", s) # [début de ligne] 1, texte … 2, texte
match_check = match_pattern.search(s)
s = re.sub(r"(\. +)(・)", r"\1<l/>\2", s)
s = re.sub(r"^( +)<l\/>", r'\1<s type="li">', s)
s = re.sub(r"([^ ])( +)<l\/>", r'\1</s>\2<s type="li">', s)
s = re.sub(r"<l\/>", r"", s)
......@@ -87,7 +96,16 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s)
if re.search(r"^(ja|zh)(_|$)", lang):
s = re.sub(r" ([!?\!\?。。\.])( +)", r" \1</s>\2<s>", s)
s = re.sub(r" ([!?\!\?。。\.](?: [!?\!\?。。\.])*)( +)", r" \1</s>\2<s>", s)
s = re.sub(r"([!?\!\?。。\.])<\/s>( +)<s>([()\(\)])", r"\1\2\3", s)
if re.search(r' <s type="li">\d [。。\.]<\/s>', s) is not None:
match_pattern = re.compile(r'(<s type="li">\d [。。\.])<\/s>( +)<s>')
match_check = match_pattern.search(s)
while match_check is not None:
s = match_pattern.sub(r"\1\2", s)
match_check = match_pattern.search(s)
if weak_sbound > 0:
s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s)
elif re.search(r"^km(_|$)", lang):
......@@ -97,23 +115,35 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
else:
s = re.sub(rf"([…\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"([…\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
#s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_\{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
#s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
s = re.sub(rf"([^\.0-9][\} ](?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1
s = re.sub(rf"([^\.][0-9](?:\.(?: \.)+|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 2
s = re.sub(rf"^ ([^\.][0-9]\.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 3
s = re.sub(rf"([\.;\!\?…] [0-9]+ \.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 4
s = re.sub(rf"([^\s>0-9][0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
s = re.sub(rf"([^\.;\?\!…] [0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
<<<<<<< Updated upstream
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
=======
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r|e \. +g) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
if lang == "pl":
s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
elif lang == "ru":
s = re.sub(rf"\b((?:Т \. +е) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
s = re.sub(rf"\b((?:Т \. +е) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
elif lang == "es":
s = re.sub(rf"\b((?:Ej) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction else if lang == "de":
s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
s = re.sub(rf"\b((?:Ej) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Ej. et autres)
elif lang == "de":
s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
elif lang == "nl":
s = re.sub(rf"\b((?:bv) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
>>>>>>> Stashed changes
s = re.sub(rf"\b((?:bv) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment