Mentions légales du service

Skip to content
Snippets Groups Projects
Commit f2373cb5 authored by Benoît Sagot's avatar Benoît Sagot
Browse files

python update

parent e4940fc0
No related branches found
No related tags found
No related merge requests found
......@@ -38,9 +38,9 @@ sub altok_split_sentences {
$maj=qr/(?:[A-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])/;
$l=qr/(?:[æœàåâäãéêèëîïöôùûüÿçøóúíáòìa-zA-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])/;
} elsif ($lang =~ /^(pl|cs|sk|ro|sl|hr|sr|sc|bn|tr|fa|ckb)$/) {
$min=qr/(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźż])/;
$maj=qr/(?:[A-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ])/;
$l=qr/(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ ])/;
$min=qr/(?:[a-záäąćčçďéęěëíĺľłńňóôöŕřśšşťúůüýźż])/;
$maj=qr/(?:[A-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ])/;
$l=qr/(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ ])/;
} elsif ($lang =~ /^(ru|uk|bg|bl|kk|bxr)$/) {
$min=qr/(?:[a-zабвгдежзийклмнопрстуфхцчшщэюяыьёү])/;
$maj=qr/(?:[A-ZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯЫЬЁҮ])/;
......@@ -87,18 +87,30 @@ sub altok_split_sentences {
$s =~ s/($listnumprefix)([\-\­] +$listnumid +[\-\­] )(?= +[^0-9 ])/$1<l\/>$2/go; # [début de ligne] - 1) texte
$s =~ s/($listnumprefix)($listnumid +[\.\)\]\/\]])(?= +[^0-9 ])/$1<l\/>$2/go; # [début de ligne] 1) texte
$s =~ s/($listnumprefix)($listnumid +[\-\­] )(?= +[^0-9 ])/$1<l\/>$2/go; # [début de ligne] 1) texte
if ($s =~ /^ \d [,\.] /) {
while ($s =~ s/([\.;\!] )(\d ,)(?= +[^0-9 ])/$1<l\/>$2/o) {} # [début de ligne] 1, texte … 2, texte
}
$s =~ s/(\. +)(・)/$1<l\/>$2/g;
}
}
$s =~ s/^( +)<l\/>/$1<s type=\"li\">/g;
$s =~ s/([^ ])( +)<l\/>/$1<\/s>$2<s type=\"li\">/g;
$s =~ s/<l\/>//g;
<<<<<<< Updated upstream
=======
>>>>>>> Stashed changes
#TODO check, j'ai ajouté un espace entre <s et type="br,li" pour bien detokeniser ensuite
$s =~ s/<s type="br"><\/s>( *)<s type="li">/$1<s type="br,li">/g;
if ($lang =~ /^(ja|zh)(_|$)/) {
$s =~ s/ ([!?\!\?。。\.])( +)/ \1<\/s>\2<s>/g;
$s =~ s/ ([!?\!\?。。\.](?: [!?\!\?。。\.])*)( +)/ \1<\/s>\2<s>/g;
$s =~ s/([!?\!\?。。\.])<\/s>( +)<s>([()\(\)])/\1\2\3/g;
if ($s =~ / <s type="li">\d [。。\.]<\/s>/) {
while ($s =~ s/(<s type="li">\d [。。\.])<\/s>( +)<s>/\1\2/) {}
}
if ($weak_sbound > 0) {
$s =~ s/ ([:;:;]|)( +)/ \1<\/s>\2<s>/g;
}
......@@ -108,10 +120,27 @@ sub altok_split_sentences {
} else {
$s =~ s/([…\.:;\?\!])( +)([\"“”\˝] ${maj}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)($maj)/$1<\/s>$2<s>$3<\/s>$4<s>$5/g; # detection of sentences entirely surrounded by double quotes
$s =~ s/([…\.:;\?\!])( +)([\"“”\˝] ${maj}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$/$1<\/s>$2<s>$3<\/s>$4/g; # detection of sentences entirely surrounded by double quotes
$s =~ s/(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)($maj|[\[_\{\.])/$1<\/s>$2<s>$3<\/s>$4<s>$5/g;
$s =~ s/([^\.][0-9\} ](?:\.(?: \.)*|…))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE
$s =~ s/(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)($maj|[\[_\{\.])/$1<\/s>$2<s>$3<\/s>$4<s>$5/g;
#$s =~ s/([^\.][0-9\} ](?:\.(?: \.)*|[…。]))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE
$s =~ s/([^\.0-9][\} ](?:\.(?: \.)*|[…。]))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 1
$s =~ s/([^\.][0-9](?:\.(?: \.)+|[…。]))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 2
$s =~ s/^ ([^\.][0-9]\.)( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 3
$s =~ s/([\.;\!\?…] [0-9]+ \.)( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 4
$s =~ s/([^\s>0-9][0-9]+ (?:\.(?: \.)*|[…。]))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 5
$s =~ s/([^\.;\?\!…] [0-9]+ (?:\.(?: \.)*|[…。]))( +)($initialclass|$special_split)/$1<\/s>$2<s>$3/g; # STANDARD CASE 5
$s =~ s/($maj$l+ $maj \.)<\/s> <s>($maj$l+ )/$1 $2/g; # oversegmentation correction by the standard case (case Prénom I. Nom)
$s =~ s/\b((?:mr?s?|me?lle|[pd]r) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas Mr. et autres)
$s =~ s/\b((?:mr?s?|me?lle|[pd]r|e \. +g) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas Mr. et autres)
if ($lang eq "pl") {
$s =~ s/\b((?:np|m ?\. *in) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas np. et autres)
} elsif ($lang eq "ru") {
$s =~ s/\b((?:Т \. +е) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas Т.е. et autres)
} elsif ($lang eq "es") {
$s =~ s/\b((?:Ej) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas Ej. et autres)
} elsif ($lang eq "de") {
$s =~ s/\b((?:bzw|z *\. *b|ggf) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas bzw. et autres)
} elsif ($lang eq "nl") {
$s =~ s/\b((?:bv) \.)<\/s>( +)<s>((?:$maj \. )*$maj$l+ )/$1$2$3/gi; # oversegmentation correction by the standard case (cas bv. et autres)
}
$s =~ s/( $strictmaj \.)<\/s> <s>($min(?:$l+| \.) )/$1 $2/g; # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
$s =~ s/( $min \. $min \.)<\/s> <s>($min(?:$l+| \.) )/$1 $2/g; # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
$s =~ s/( $min \.)<\/s> <s>($min \. )/$1 $2/g; # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
......
......@@ -100,7 +100,20 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))( +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
s = re.sub(rf"({maj.pattern}{l.pattern}+ {maj.pattern} \.)</s> <s>({maj.pattern}{l.pattern}+ )", r"\1 \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
<<<<<<< Updated upstream
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>( +)<s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
=======
s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
if lang == "pl":
s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
elif lang == "ru":
s = re.sub(rf"\b((?:Т \. +е) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
elif lang == "es":
s = re.sub(rf"\b((?:Ej) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction else if lang == "de":
s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
elif lang == "nl":
s = re.sub(rf"\b((?:bv) \.)</s> <s>((?:{maj.pattern} \. )*{maj.pattern}{l.pattern}+ )", r"\1 \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
>>>>>>> Stashed changes
s = re.sub(rf"( {strictmaj.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s> <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
s = re.sub(rf"( {minus.pattern} \.)</s> <s>({minus.pattern} \. )", r"\1 \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)
......
......@@ -301,6 +301,132 @@ push @{$cmpnd_translits{"cyrillic"}}, ".";
my %cmpnd_translits_re;
$cmpnd_translits_re{"cyrillic"} = join ("|", sort {length($b) <=> length($a)} @{$cmpnd_translits{"cyrillic"}});
# Simplified Cyrillic -> Latin transliteration table.
# Keys are single Cyrillic letters (upper and lower case listed in pairs);
# values are the Latin replacement strings.  Most values are one character;
# a few (DZ/dz, ŠČ/šč) are multi-character.
#
# NOTE(review): several entries below map to the empty string, and one entry
# has an empty key.  These look like characters lost in an earlier
# copy/encoding step rather than deliberate choices -- confirm the intended
# values before relying on them.
my %cyrillic_to_simpleroman = (
    "А" => "A",
    "а" => "a",
    "Б" => "B",
    "б" => "b",
    "В" => "V",
    "в" => "v",
    "Г" => "G",
    "г" => "g",
    "Ґ" => "Ġ",
    "ґ" => "ġ",
    "Д" => "D",
    "д" => "d",
    "Ѓ" => "Ǵ",
    "ѓ" => "ǵ",
    "Ђ" => "Ď",
    "ђ" => "ď",
    "Е" => "E",
    "е" => "e",
    "Ё" => "Ë",
    "ё" => "ë",
    "Є" => "Ě",
    "є" => "ě",
    "Ж" => "Ž",
    "ж" => "ž",
    "З" => "Z",
    "з" => "z",
    "Ѕ" => "DZ",
    "ѕ" => "dz",
    "И" => "I",
    "и" => "i",
    # The key must be CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
    # (U+0406), the upper-case partner of "і" (U+0456) on the next line --
    # not the look-alike Latin capital "I" (U+0049).
    "І" => "Ì",
    "і" => "ì",
    "Ї" => "Ï",
    "ї" => "ï",
    "Й" => "J",
    "й" => "j",
    "Ј" => "Ĵ",
    "ј" => "ĵ",
    "К" => "K",
    "к" => "k",
    "Л" => "L",
    "л" => "l",
    "Љ" => "Ľ",
    "љ" => "ľ",
    "М" => "M",
    "м" => "m",
    "Н" => "N",
    "н" => "n",
    "Њ" => "Ň",
    "њ" => "ň",
    "О" => "O",
    "о" => "o",
    "П" => "P",
    "п" => "p",
    "Р" => "R",
    "р" => "r",
    "С" => "S",
    "с" => "s",
    "Т" => "T",
    "т" => "t",
    "Ќ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ќ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ћ" => "Ć",
    "ћ" => "ć",
    "У" => "U",
    "у" => "u",
    "Ў" => "W",
    "ў" => "w",
    "Ф" => "F",
    "ф" => "f",
    "Х" => "X",
    "х" => "x",
    "Ц" => "C",
    "ц" => "c",
    "Ч" => "Č",
    "ч" => "č",
    "Џ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "џ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ш" => "Š",
    "ш" => "š",
    "Щ" => "ŠČ",
    "щ" => "šč",
    "Ъ" => "Ŭ",
    "ъ" => "ŭ",
    "Ы" => "Y",
    "ы" => "y",
    "Ь" => "Ĭ",
    "ь" => "ĭ",
    "Ѣ" => "Ä",
    "ѣ" => "ä",
    "Э" => "È",
    "э" => "è",
    "Ю" => "Ǔ",
    "ю" => "ǔ",
    "Я" => "Ǎ",
    "я" => "ǎ",
    "" => "'",    # NOTE(review): empty key -- the source character was possibly lost; verify
    "Ѡ" => "Ô",
    "ѡ" => "ô",
    "Ѧ" => "Ę",
    "ѧ" => "ę",
    "Ѩ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ѩ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ѫ" => "Ǫ",
    "ѫ" => "ǫ",
    "Ѭ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ѭ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ѯ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ѯ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ѱ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ѱ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ѳ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "ѳ" => "",    # NOTE(review): empty replacement -- value possibly lost; verify
    "Ѵ" => "Ü",
    "ѵ" => "ü",
);
# Build the "simplecyrillic" list of compound transliterations: every
# replacement string in %cyrillic_to_simpleroman that is longer than one
# character, followed by a literal "." entry.
# NOTE(review): length() counts characters only if the literals are decoded
# (e.g. under `use utf8`); otherwise it counts bytes -- confirm the file's pragmas.
push @{ $cmpnd_translits{"simplecyrillic"} },
    ( grep { length($_) > 1 } values %cyrillic_to_simpleroman ),
    ".";

# Join the entries, longest first, into one "|"-separated alternation string.
# Presumably this string is later used as a regex, where longest-first makes
# the compounds win and the bare "." acts as a single-character fallback --
# confirm at the use site.
$cmpnd_translits_re{"simplecyrillic"} = join(
    "|",
    sort { length($b) <=> length($a) } @{ $cmpnd_translits{"simplecyrillic"} }
);
my %cyrillic_to_enroman = (
"А" => "A",
"а" => "a",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment