répercussions python alSentenceSplitting + corrections post merge perl et...

Répercussions sur la version Python d'alSentenceSplitting + corrections post-merge (Perl et Python), en retirant les traces des messages git (marqueurs de conflit)

répercussions python alSentenceSplitting + corrections post merge perl et...
6e81e29f · Clémence Laurent · f2373cb5 · 6e81e29f · 6e81e29f
Commit 6e81e29f authored 2 years ago by Clémence Laurent
--- a/alSentenceSplitting.pm
+++ b/alSentenceSplitting.pm
@@ -96,12 +96,7 @@ sub altok_split_sentences {
  $s =~ s/^( +)<l\/>/$1<s  type=\"li\">/g;
  $s =~ s/([^ ])( +)<l\/>/$1<\/s>$2<s  type=\"li\">/g;
  $s =~ s/<l\/>//g;
-<<<<<<< Updated upstream
-=======
->>>>>>> Stashed changes
  #TODO check, j'ai ajouté un espace entre <s et type="br,li" pour bien detokeniser ensuite
  $s =~ s/<s  type="br"><\/s>( *)<s  type="li">/$1<s  type="br,li">/g;

--- a/alSentenceSplitting.py
+++ b/alSentenceSplitting.py
@@ -31,9 +31,9 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 		maj = re.compile(r"(?:[A-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
 		l = re.compile(r"(?:[æœàåâäãéêèëîïöôùûüÿçøóúíáòìa-zA-ZÀÅÃÉÈÊËÂÄÔÖÛÜÇÓÚÍÁÒØÌÆŒ])")
 	elif re.search(r"^(pl|cs|sk|ro|sl|hr|sr|sc|bn|tr|fa|ckb)$", lang):
-		minus = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźż])")
+		minus = re.compile(r"(?:[a-záäąćčçďéęěëíĺľłńňóôöŕřśšşťúůüýźż])")
-		maj = re.compile(r"(?:[A-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ])")
+		maj = re.compile(r"(?:[A-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ])")
-		l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČĎÉĘĚËÍĹŁĽŃŇÓÔÖŔŘŚŠŤÚŮÜÝŹŻ ])")
+		l = re.compile(r"(?:[a-záäąćčďéęěëíĺľłńňóôöŕřśšťúůüýźżA-ZÁÄĄĆČÇĎÉĘĚËÍİĹŁĽŃŇÓÔÖŔŘŚŠŞŤÚŮÜÝŹŻ ])")
 	elif re.search(r"^(ru|uk|bg|bl|kk|bxr)$", lang):
 		minus = re.compile(r"(?:[a-zабвгдежзийклмнопрстуфхцчшщэюяыьёү])")
 		maj = re.compile(r"(?:[A-ZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯЫЬЁҮ])")
@@ -79,6 +79,15 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 		s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\.\)\]\/\]])(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
 		s = re.sub(rf"({listnumprefix.pattern})({listnumid.pattern} +[\-\] )(?= +[^0-9 ])", r"\1<l/>\2", s) # [début de ligne] 1) texte
+		if re.search(r"^ \d [,\.] ", s) is not None:
+			match_pattern = re.compile(r"([\.;\!]  )(\d ,)(?= +[^0-9 ])")
+			match_check = match_pattern.search(s)
+			while match_check is not None:
+				s = match_pattern.sub(r"\1<l/>\2", s) # [début de ligne] 1, texte … 2, texte
+				match_check = match_pattern.search(s)
+		s = re.sub(r"(\. +)(・)", r"\1<l/>\2", s)
 	s = re.sub(r"^( +)<l\/>", r'\1<s  type="li">', s)
 	s = re.sub(r"([^ ])( +)<l\/>", r'\1</s>\2<s  type="li">', s)
 	s = re.sub(r"<l\/>", r"", s)
@@ -87,7 +96,16 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 	s = re.sub(r'<s  type="br"></s>( *)<s  type="li">', r'\1<s  type="br,li">', s)
 	if re.search(r"^(ja|zh)(_|$)", lang):
-		s = re.sub(r" ([！？\!\?。｡\.])( +)", r" \1</s>\2<s>", s)
+		s = re.sub(r" ([！？\!\?。｡\.](?: [！？\!\?。｡\.])*)( +)", r" \1</s>\2<s>", s)
+		s = re.sub(r"([！？\!\?。｡\.])<\/s>( +)<s>([（）\(\)])", r"\1\2\3", s)
+		if re.search(r' <s  type="li">\d [。｡\.]<\/s>', s) is not None:
+			match_pattern = re.compile(r'(<s  type="li">\d [。｡\.])<\/s>( +)<s>')
+			match_check = match_pattern.search(s)
+			while match_check is not None:
+				s = match_pattern.sub(r"\1\2", s)
+				match_check = match_pattern.search(s)
 		if weak_sbound > 0:
 			s = re.sub(r" ([：；:;]|)( +)", r" \1</s>\2<s>", s)
 	elif re.search(r"^km(_|$)", lang):
@@ -97,23 +115,35 @@ def altok_split_sentences(s, lang, weak_sbound = 0, less_lists = 0, noxml = 0, a
 	else:
 		s = re.sub(rf"([…\.:;\?\!])(  +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])(  +)({maj.pattern})", r"\1</s>\2<s>\3</s>\4<s>\5", s) # detection of sentences entirely surrounded by double quotes
 		s = re.sub(rf"([…\.:;\?\!])(  +)([\"“”\˝] {maj.pattern}[^\"“”\˝<>]*[\.:;\?\!] [\"“”\˝])( +)$", r"\1</s>\2<s>\3</s>\4", s) # detection of sentences entirely surrounded by double quotes
-		s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
+		#s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|…) \))( +)({maj.pattern}|[\[_{{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
-		s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
+		s = re.sub(rf"(\.(?: \.)*|…)( +)(\( (?:\. \. \.|[…。]) \))( +)({maj.pattern}|[\[_\{\.])", r"\1</s>\2<s>\3</s>\4<s>\5", s)
+		#s = re.sub(rf"([^\.][0-9}} ](?:\.(?: \.)*|…))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE
+		s = re.sub(rf"([^\.0-9][\} ](?:\.(?: \.)*|[…。]))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 1
+		s = re.sub(rf"([^\.][0-9](?:\.(?: \.)+|[…。]))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 2
+		s = re.sub(rf"^ ([^\.][0-9]\.)(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 3
+		s = re.sub(rf"([\.;\!\?…]  [0-9]+ \.)(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 4
+		s = re.sub(rf"([^\s>0-9][0-9]+ (?:\.(?: \.)*|[…。]))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
+		s = re.sub(rf"([^\.;\?\!…]  [0-9]+ (?:\.(?: \.)*|[…。]))(  +)({initialclass.pattern}|{special_split.pattern})", r"\1</s>\2<s>\3", s) # STANDARD CASE 5
 		s = re.sub(rf"({maj.pattern}{l.pattern}+  {maj.pattern} \.)</s>  <s>({maj.pattern}{l.pattern}+ )", r"\1  \2", s) # oversegmentation correction by the standard case (case Prénom I. Nom)
-<<<<<<< Updated upstream
+		s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r|e \. +g) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
-		s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
-=======
-		s = re.sub(rf"\b((?:mr?s?|me?lle|[pd]r) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Mr. et autres)
 		if lang == "pl":
-			s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
+			s = re.sub(rf"\b((?:np|m ?\. *in) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas np. et autres)
 		elif lang == "ru":
-			s = re.sub(rf"\b((?:Т \. +е) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)            
+			s = re.sub(rf"\b((?:Т \. +е) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Т. е. et autres)
 		elif lang == "es":
-			s = re.sub(rf"\b((?:Ej) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction        else if lang == "de":
+			s = re.sub(rf"\b((?:Ej) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas Ej. et autres)
-			s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)            
+		elif lang == "de":
+			s = re.sub(rf"\b((?:bzw|z *\. *b|ggf) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bzw. et autres)
 		elif lang == "nl":
-			s = re.sub(rf"\b((?:bv) \.)</s>  <s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1  \2", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)            
+			s = re.sub(rf"\b((?:bv) \.)</s>(  +)<s>((?:{maj.pattern} \.  )*{maj.pattern}{l.pattern}+ )", r"\1\2\3", s, flags = re.IGNORECASE) # oversegmentation correction by the standard case (cas bv. et autres)
->>>>>>> Stashed changes
 		s = re.sub(rf"( {strictmaj.pattern} \.)</s>  <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (examples: A.F.D.S.F.G. est une entreprise connue. | We need to R.S.V.P. a.s.a.p., you know!)
 		s = re.sub(rf"( {minus.pattern} \. {minus.pattern} \.)</s>  <s>({minus.pattern}(?:{l.pattern}+| \.) )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: Please r.s.v.p. a.s.a.p. otherwise who know what will happen.)
 		s = re.sub(rf"( {minus.pattern} \.)</s>  <s>({minus.pattern} \.  )", r"\1  \2", s) # oversegmentation correction by the standard case in mode 'allow_sentence_initial_lowercase' (exemple: e. g.)