Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 1a7279ca authored by Clémence Laurent
Browse files

Port changes previously made to the .pm file to the Python version

parent 8da38e6b
No related branches found
No related tags found
No related merge requests found
...@@ -52,15 +52,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a ...@@ -52,15 +52,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(r"&lt; *br *(?:\/ *)?&gt;", r'</s><s type="br">', s) s = re.sub(r"&lt; *br *(?:\/ *)?&gt;", r'</s><s type="br">', s)
if re.search(r"^(ja|zh|tw)$", lang): if re.search(r"^(ja|zh|tw)(_|$)", lang):
s = re.sub(r"([①-⑫●※*■◆→⇒◇◎★☆〇·•])", r"<l/>\1", s) s = re.sub(r"([①-⑫●※*■◆→⇒◇◎★☆〇·•])", r"<l/>\1", s)
s = re.sub(r"(?<![1-91-9〇零一二三四五六七八九○十百千萬億兆万亿])( +)(○ +)(?![1-91-9〇零一二三四五六七八九○十百千萬億兆万亿])", r"\1<l/>\2", s) s = re.sub(r"(?<![1-91-9〇零一二三四五六七八九○十百千萬億兆万亿])( +)(○ +)(?![1-91-9〇零一二三四五六七八九○十百千萬億兆万亿])", r"\1<l/>\2", s)
s = re.sub(r"([\(\(] *[1-91-9] *[\)\)])", r"<l/>\1", s) s = re.sub(r"([\(\(] *[1-91-9] *[\)\)])", r"<l/>\1", s)
s = re.sub(r"(?<![\(\(A-Za-z] )(?<![0-91-9A-Za-z])([1-91-9]+ +[\.\)、\)。,::])", r"<l/>\1", s) s = re.sub(r"(?<![\(\(A-Za-z] )(?<![0-91-9A-Za-z])([1-91-9]+ +[\.\)、\)。,::])", r"<l/>\1", s)
s = re.sub(r"(^|。)( +)((?:第 )?[一二] [\.\)、\)。,::])", r"\1\2<l/>\3", s) s = re.sub(r"(^|。)( +)((?:第 )?[一二] [\.\)、\)。,::])", r"\1\2<l/>\3", s)
elif re.search(r"^th$", lang): elif re.search(r"^th(_|$)", lang):
s = re.sub(r"(^|<br\/?>)( +)(๑๐|๑?[๑๒๓๔๕๖๗๘๙])( \.)", r"\1\2</l>\3 \4", s) s = re.sub(r"(^|<br\/?>)( +)(๑๐|๑?[๑๒๓๔๕๖๗๘๙])( \.)", r"\1\2</l>\3 \4", s)
elif lang == "km": elif re.search(r"^km(_|$)", lang):
s = re.sub(r"([\P{Latin}\)][ ​]*)(-[ ​]*\P{Latin})", r"\1</l>\2", s) s = re.sub(r"([\P{Latin}\)][ ​]*)(-[ ​]*\P{Latin})", r"\1</l>\2", s)
s = re.sub(r"([\P{Latin}\)][ ​]*)(\d+ ?[,\-][ ​]*\P{Latin})", r"\1</l>\2", s) s = re.sub(r"([\P{Latin}\)][ ​]*)(\d+ ?[,\-][ ​]*\P{Latin})", r"\1</l>\2", s)
else: else:
...@@ -84,31 +84,32 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a ...@@ -84,31 +84,32 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
#un ajout d'espace ici en faisant le process de trad #un ajout d'espace ici en faisant le process de trad
s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s) s = re.sub(r'<s type="br"></s>( *)<s type="li">', r'\1<s type="br,li">', s)
if re.search(r"^(ja|zh)$", lang): if re.search(r"^(ja|zh)(_|$)", lang):
s = re.sub(r" ([!?\!\?。。])( +)", r" \1</s>\2<s>", s) s = re.sub(r" ([!?\!\?。。\.])( +)", r" \1</s>\2<s>", s)
if weak_sbound > 0: if weak_sbound > 0:
s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s) s = re.sub(r" ([:;:;]|)( +)", r" \1</s>\2<s>", s)
elif lang == "km": elif re.search(r"^km(_|$)", lang):
s = re.sub(r" ([។៕])( +)", r" \1</s>\2<s>", s) s = re.sub(r" ([។៕])( +)", r" \1</s>\2<s>", s)
elif lang == "th": elif re.search(r"^th(_|$)", lang)::
pass pass
else: else:
s = re.sub(rf"([\.:;\?\!]) ([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝]) ({maj.pattern})", r"\1</s> <s>\2</s> <s>\3", s) # detection of sentences entirely surrounded by double quotes s = re.sub(rf"([\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"([\.:;\?\!]) ([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝]) $", r"\1</s> <s>\2</s> ", s) # detection of sentences entirely surrounded by double quotes s = re.sub(rf"([\.:;\?\!])( +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes
s = re.sub(rf"(\.(?: \.)*)( +)(\( \. \. \. \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s) s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"([^\.][0-9}} ]\.(?: \.)*)( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom) s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres) s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!) s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.) s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
s = re.sub(rf"({l.pattern}|[\!\?])( +)(\. \. \.)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # used to use $l instead of $maj s = re.sub(rf"({l.pattern}|[\!\?])( +)(\. \. \.)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # used to use $l instead of $maj
s = re.sub(rf"(\. \.(?: \.)*)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) s = re.sub(rf"(\. \.(?: \.)*)( +)({maj.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s)
s = re.sub(rf"(\. (?: \.)+)( )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(rf"(\. (?: \.)+)( +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
s = re.sub(r"(\. (?: \.)+)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(r"(\. (?: \.)+)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!!
s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)( )({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(rf"([\?\!](?: ?(?: \.)+)?)( +)({initialclass.pattern})", r"\1</s>\2<s>\3", s) # attention !!!
s = re.sub(r"([\?\!](?: ?(?: \.)+)?)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!! s = re.sub(r"([\?\!](?: ?(?: \.)+)?)( +)([\[_\{\-\«¿¡])", r"\1</s>\2<s>\3", s) # attention !!!
s = re.sub(r"([\.\?\!] +\.(?: \.)+)( )", r"\1</s>\2<s>", s) # attention s = re.sub(r"([\.\?\!] +\.(?: \.)+)( +)", r"\1</s>\2<s>", s) # attention
s = re.sub(r"([\.\?\!,:])( )([\-\+\«¿¡])", r"\1</s>\2<s>\3", s) # attention s = re.sub(r"([\.\?\!,:])( )([\-\+\«¿¡])", r"\1</s>\2<s>\3", s) # attention
if weak_sbound > 0: # if $weak_sbound, colons are sentence boundaries if weak_sbound > 0: # if $weak_sbound, colons are sentence boundaries
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment