Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 6e81e29f authored by Clémence Laurent
Browse files

répercussions python alSentenceSplitting + corrections post merge perl et...

répercussions python alSentenceSplitting + corrections post merge perl et python en retirant traces messages git
parent f2373cb5
No related branches found
No related tags found
No related merge requests found
......@@ -96,12 +96,7 @@ sub altok_split_sentences {
$s =~ s/^( +)<l\/>/$1<s type=\"li\">/g;
$s =~ s/([^ ])( +)<l\/>/$1<\/s>$2<s type=\"li\">/g;
$s =~ s/<l\/>//g;
<<<<<<< Updated upstream
=======
>>>>>>> Stashed changes
#TODO check, j'ai ajouté un espace entre <s et type="br,li" pour bien detokeniser ensuite
$s =~ s/<s type="br"><\/s>( *)<s type="li">/$1<s type="br,li">/g;
......
......@@ -31,9 +31,9 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
maj = re.compile(r"(?:[A-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
l = re.compile(r"(?:[æœàåâäãéêèëîïöôùûüÿçøóúíáòìa-zA-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
elif re.search(r"^(pl|cs|sk|ro|sl|hr|sr|sc|bn|tr|fa|ckb)$", lang):
minus = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźż])")
maj = re.compile(r"(?:[A-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ])")
l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ ])")
minus = re.compile(r"(?:[a-záäąćčçďéęěëíĺľłńňóôöŕřśšşťúůüýźż])")
maj = re.compile(r"(?:[A-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ])")
l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ ])")
elif re.search(r"^(ru|uk|bg|bl|kk|bxr)$", lang):
minus = re.compile(r"(?:[a-zабвгдежзийклмнопрстуфхцчшщэюяыьёү])")
maj = re.compile(r"(?:[A-ZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯЫЬЁҮ])")
......@@ -79,6 +79,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\.\)\]\/\]])(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\-\­] )(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
if re.search(r"^ \d [,\.] ", s) is not None:
match_pattern = re.compile(r"([\.;\!] )(\d ,)(?= +[^0-9 ])")
match_check = match_pattern.search(s)
while match_check is not None:
s = match_pattern.sub(r"\1<l/>\2", s) # [début de ligne] 1, texte … 2, texte
match_check = match_pattern.search(s)
s = re.sub(r"(\. +)(・)", r"\1<l/>\2", s)
s = re.sub(r"^( +)<l\/>", r'\1<s type="li">', s)
s = re.sub(r"([^ ])( +)<l\/>", r'\1</s>\2<s type="li">', s)
s = re.sub(r"<l\/>", r"", s)
......@@ -87,7 +96,16 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s)
if re.search(r"^(ja|zh)(_|$)", lang):
s = re.sub(r" ([!?\!\?。。\.])( +)", r" \1</s>\2<s>", s)
s = re.sub(r" ([!?\!\?。。\.](?: [!?\!\?。。\.])*)( +)", r" \1</s>\2<s>", s)
s = re.sub(r"([!?\!\?。。\.])<\/s>( +)<s>([()\(\)])", r"\1\2\3", s)
if re.search(r' <s type="li">\d [。。\.]<\/s>', s) is not None:
match_pattern = re.compile(r'(<s type="li">\d [。。\.])<\/s>( +)<s>')
match_check = match_pattern.search(s)
while match_check is not None:
s = match_pattern.sub(r"\1\2", s)
match_check = match_pattern.search(s)
if weak_sbound > 0:
s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s)
elif re.search(r"^km(_|$)", lang):
......@@ -97,23 +115,35 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
else:
s = re.sub(rf"([…\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"([…\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
#s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_\{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
#s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
s = re.sub(rf"([^\.0-9][\} ](?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1
s = re.sub(rf"([^\.][0-9](?:\.(?: \.)+|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 2
s = re.sub(rf"^ ([^\.][0-9]\.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 3
s = re.sub(rf"([\.;\!\?…] [0-9]+ \.)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 4
s = re.sub(rf"([^\s>0-9][0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
s = re.sub(rf"([^\.;\?\!…] [0-9]+ (?:\.(?: \.)*|[…。]))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
<<<<<<< Updated upstream
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
=======
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r|e \. +g) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
if lang == "pl":
s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
elif lang == "ru":
s = re.sub(rf"\b((?:Т \. +е) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
s = re.sub(rf"\b((?:Т \. +е) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
elif lang == "es":
s = re.sub(rf"\b((?:Ej) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction else if lang == "de":
s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
s = re.sub(rf"\b((?:Ej) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Ej. et autres)
elif lang == "de":
s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
elif lang == "nl":
s = re.sub(rf"\b((?:bv) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
>>>>>>> Stashed changes
s = re.sub(rf"\b((?:bv) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment