From adacbc172db0302f270fe2b8a37d5862eb27c138 Mon Sep 17 00:00:00 2001
From: Petko Valtchev <Petko.Valtchev@uqam.ca>
Date: Wed, 5 Oct 2005 18:34:38 +0000
Subject: [PATCH] the tokenizer is now working with both numbers and
 delimiters (all non-alphanumeric symbols)

---
 .../exmo/align/ling/JWNLDistances.java        | 160 ++++++++++--------
 1 file changed, 91 insertions(+), 69 deletions(-)

diff --git a/src/fr/inrialpes/exmo/align/ling/JWNLDistances.java b/src/fr/inrialpes/exmo/align/ling/JWNLDistances.java
index b62af976..d25f4be0 100644
--- a/src/fr/inrialpes/exmo/align/ling/JWNLDistances.java
+++ b/src/fr/inrialpes/exmo/align/ling/JWNLDistances.java
@@ -495,11 +495,11 @@ public class JWNLDistances {
 
 
 	public boolean isAlphaNum(char c) {
-		return (c >= 'A') && (c <= 'Z') || (c >= 'a') && (c <= 'z') || (c >= '0') && (c <= '9');
+		return isAlpha(c) || isNum(c);
 	}
 
 	public boolean isAlpha(char c) {
-		return (c >= 'A') && (c <= 'Z') || (c >= 'a') && (c <= 'z');
+		return isAlphaCap(c) || isAlphaSmall(c);
 	}
 
 	public boolean isAlphaCap(char c) {
@@ -509,7 +509,12 @@ public class JWNLDistances {
 	public boolean isAlphaSmall(char c) {
 		return (c >= 'a') && (c <= 'z');
 	}
+
+	public boolean isNum(char c) {
+		return (c >= '0') && (c <= '9');
+	}
 	
+
 	// the new tokenizer
 	// firsst looks for non-alphanumeric chars in the string
 	// if any, they will be taken as the only delimiters
@@ -530,7 +535,8 @@ public class JWNLDistances {
 		int tkEnd = 0;
 		
 		// looks for the first delimiter
-		while (tkStart < sLength  && isAlpha (str1.charAt(tkStart))) {
+		// while (tkStart < sLength  && isAlpha (str1.charAt(tkStart))) {
+		while (tkStart < sLength  && isAlphaNum (str1.charAt(tkStart))) {
 			tkStart++;
 		}
 		
@@ -542,99 +548,103 @@ public class JWNLDistances {
 			tkStart = 0;
 
 			// ignore leading separators
-			while (tkStart < sLength  && ! isAlpha (str1.charAt(tkStart))) {
+			// while (tkStart < sLength && ! isAlpha (str1.charAt(tkStart))) {
+			while (tkStart < sLength  && ! isAlphaNum (str1.charAt(tkStart))) {
 				tkStart++;
 			}
 
-			tkEnd = tkStart+1;
+			tkEnd = tkStart;
 
 			while (tkStart < sLength) {
 
-				// consumption of the token
-				while (tkEnd < sLength  && isAlpha (str1.charAt(tkEnd))) {
-					tkEnd++;
+				// consumption of the Alpha/Num token
+				if (isAlpha (str1.charAt(tkEnd))) {
+					while (tkEnd < sLength  && isAlpha (str1.charAt(tkEnd))) {
+						tkEnd++;
+					}
+				} else {
+					while (tkEnd < sLength  && isNum (str1.charAt(tkEnd))) {
+						tkEnd++;
+					}					
 				}
-				// creation
-				vTokens.add(str1.substring(tkStart, tkEnd));
 				
+				// creation of the token
+				vTokens.add(str1.substring(tkStart, tkEnd));
+
 				// ignoring intermediate delimiters
-								
-				while (tkEnd < sLength  && !isAlpha (str1.charAt(tkEnd))) {
+				while (tkEnd < sLength  && !isAlphaNum (str1.charAt(tkEnd))) {
 					tkEnd++;
-				}
-			
+				}			
 				tkStart=tkEnd;
-			}
-
-			
+			}			
 		}
 		
 		// else the standard naming convention will be used
-		// TO DO :: include numbers: (between parts)
+		//
 		// 
 		else{
 			// start at the beginning of the string
 			tkStart = 0;			
-
-			tkEnd = tkStart+1;
+			tkEnd = tkStart;
 
 			while (tkStart < sLength) {
-				// INV: thStart is always the first char of a token or the
-				// position after the string end
-				// consumption of the leading Caps
-				while (tkEnd < sLength  && isAlphaCap (str1.charAt(tkEnd))) {
-					tkEnd++;
-				}
 
-				// at this point tkEnd is pointing at:
-				// a) the first small letter OR
-				// b) the end of the string
-				// c) [NOT YET DONE] a number
-				
-				// if a) look whether there are more than one caps in a row
-				if (tkEnd < sLength) {
+				// the beginning of a token
+				if (isAlpha (str1.charAt(tkEnd))){
 					
-					// if there are several this should be an abbreviation
-					// so make a token out of them, till the second last one
-					// and update the position of tkStart
-					if (tkEnd - tkStart > 1) {
+					if (isAlphaCap (str1.charAt(tkEnd))){
 						
-						vTokens.add(str1.substring(tkStart, tkEnd-1));
-						tkStart = tkEnd-1;
-					
-					}
-					else {
-						// if there is only one, this is the beginning of the
-						// token
-						// so just go on
+						// This starts with a Cap
+						// Is this an abbreviation?
+						// Let's see how many Caps there are
+						while (tkEnd < sLength  && isAlphaCap (str1.charAt(tkEnd))) {
+							tkEnd++;
+						}
+			
+						// The pointer is at:
+						// a) string end: make a token and go on
+						// b) number: make a token and go on
+						// c) a small letter:
+						// if there are at least 3 Caps,
+						// separate them up to the second last one and move the
+						// tkStart to tkEnd-1
+						// otherwise
+						// go on
+
+						if (tkEnd == sLength || isNum (str1.charAt(tkEnd))) {
+							vTokens.add(str1.substring(tkStart, tkEnd));
+							tkStart=tkEnd;									
+						} else {
+							// small letter
+							if (tkEnd - tkStart > 2) {
+								// If at least 3
+								vTokens.add(str1.substring(tkStart, tkEnd-1));
+								tkStart=tkEnd-1;									
+							}
+						}
+						// if (isAlphaSmall (str1.charAt(tkEnd))){}
+					} else {
+						// a small letter (at the string start, or after a number or Caps):
+						// consume the whole run of small letters
+						while (tkEnd < sLength  && isAlphaSmall (str1.charAt(tkEnd))) {
+							tkEnd++;
+						}
+						vTokens.add(str1.substring(tkStart, tkEnd));
+						tkStart=tkEnd;										
 					}
-					
-				}
-				else {
-					// if b) i.e., this is the string end, just create the token
-					// 
-				}
-				// at this point th Start is the leading letter of a token that
-				// has at least one
-				// small letter (regardless of whether its first one is a
-				// capital or not)
-				// OR it is of length one
+				} else {
 				
-				while (tkEnd < sLength  && isAlphaSmall (str1.charAt(tkEnd))) {
-					tkEnd++;
+					// Here is the numerical token processing
+					while (tkEnd < sLength  && isNum (str1.charAt(tkEnd))) {
+						tkEnd++;
+					}
+					vTokens.add(str1.substring(tkStart, tkEnd));
+					tkStart=tkEnd;			
 				}
-				
-				
-				vTokens.add(str1.substring(tkStart, tkEnd));
-				tkStart=tkEnd;			
-
-			}
-			
-
+			}	
 		}
-
 		// PV: Debug 
-		// System.out.println("Tokens = "+ vTokens.toString());		
+		System.out.println("Tokens = "+ vTokens.toString());		
 			return vTokens;
 	}
 	
@@ -875,7 +885,7 @@ public class JWNLDistances {
         JWNLDistances j = new JWNLDistances();
         j.Initialize();
         String s1 = "French997Guy";
-        String s2 = "Dutch_Goaly";
+        String s2 = "Dutch_Goa77ly";
 //        try {
 //            IndexWord index1 = Dictionary.getInstance().getIndexWord(POS.NOUN, s1);
 //            IndexWord index2 = Dictionary.getInstance().getIndexWord(POS.NOUN, s2);
@@ -888,5 +898,17 @@ public class JWNLDistances {
 //        System.out.println("Sim = " + j.computeSimilarity(s1, s2));
 //        System.out.println("SimOld = " + (1 - j.BasicSynonymDistance(s1, s2)));
 //        System.out.println("SimSubs = " + (1 - StringDistances.subStringDistance(s1, s2)));
+        s1 = "FREnch997guy21GUIe";
+        s2 = "Dutch_GOa77ly.";
+        System.out.println("SimWN = " + j.compareComponentNames(s1, s2));
+
+        s1 = "a997c";
+        s2 = "77ly.";
+        System.out.println("SimWN = " + j.compareComponentNames(s1, s2));
+
+        s1 = "MSc";
+        s2 = "PhD";
+        System.out.println("SimWN = " + j.compareComponentNames(s1, s2));
+
     }
 }
-- 
GitLab