From f15ecbd761fa4be204ec3c2f6ec9b1b8fa27f442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Euzenat?= <Jerome.Euzenat@inria.fr> Date: Fri, 19 May 2006 12:32:50 +0000 Subject: [PATCH] - correction, rectifications, expansions --- .../align/impl/method/StringDistances.java | 193 ++++++++++++++---- 1 file changed, 155 insertions(+), 38 deletions(-) diff --git a/src/fr/inrialpes/exmo/align/impl/method/StringDistances.java b/src/fr/inrialpes/exmo/align/impl/method/StringDistances.java index aa588b64..f7de6fdb 100644 --- a/src/fr/inrialpes/exmo/align/impl/method/StringDistances.java +++ b/src/fr/inrialpes/exmo/align/impl/method/StringDistances.java @@ -1,7 +1,7 @@ /* * $Id$ * - * Copyright (C) INRIA Rhône-Alpes, 2003-2005 + * Copyright (C) INRIA Rhône-Alpes, 2003-2006 * Except for the Levenshtein class whose copyright is not claimed to * our knowledge. * @@ -20,7 +20,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/** +/* * This class implements various string distances that can be used on * various kind of strings. * @@ -29,8 +29,12 @@ * - subStringDistance * - pSubStringDistance [not implemented yet] * - lowenhein (edit) distance - * - n-gram distance [not implemented yet] + * - n-gram distance * - CERTH ISWC-2005 SMOA (calling org.ivml.alimo.ISub) + * + * All of these are implemented as distances normalized by the + * largest possible distance between two such strings. + * They return doubles * * @author Jérôme Euzenat * @version $Id$ @@ -42,21 +46,23 @@ import org.ivml.alimo.ISub; public class StringDistances { - //***************************** - // Compute substring distance - // = 1 - (2 | length of longest common substring | / |s1|+|s2|) - //***************************** - + /* + * subStringDistance + * computes substring distance: + * = 1 - (2 | length of longest common substring | / |s1|+|s2|) + * return: 0 if both string are equal, 1 otherwise + */ public static double subStringDistance (String s1, String s2) { if (s1 == null || s2 == null) { - throw new IllegalArgumentException("Strings must not be null"); + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; } int l1 = s1.length(); // length of s int l2 = s2.length(); // length of t - if ((l1 == 0) && ( l2 == 0 )) return 0; - if ((l1 == 0) || ( l2 == 0 )) return 1; + if ((l1 == 0) && ( l2 == 0 )) return 0.; + if ((l1 == 0) || ( l2 == 0 )) return 1.; int max = Math.min( l1, l2 ); // the maximal length of a subs int best = 0; // the best subs length so far @@ -76,8 +82,7 @@ public class StringDistances { } } } - //System.err.println(s1+" x "+s2+" = "+(1.0 - ((double)2*best / (l1+l2)))); - return (1.0 - ((double)2*best / (l1+l2))); + return (1.0 - ((double)2*best / (double)(l1+l2))); } /* pSubStringDistance: @@ -85,28 +90,134 @@ public class StringDistances { * sum their size / s1+s2 */ - public static int equalDistance (String s, String t) { + /* + * equalDistance + * return: 0 if both string are equal, 1 otherwise + */ + public static double equalDistance (String s, String t) { + if (s == null || t == null) { + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; + } + if ( s.equals(t) ) { return 0.;} else {return 1.;} + } + + /* + * hammingDistance + * return: the proportion of positions on which two strings differ + */ + public static double hammingDistance (String s, String t) { if (s == null || t == null) { - throw new IllegalArgumentException("Strings must not be null"); + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; } - if ( s.equals(t) ) { return 1;} else {return 0;} + int l1 = s.length(); + int l2 = t.length(); + int min = Math.min(l1, l2); + int max = Math.max(l1, l2); + + int score = max; + for( int i=0; i < min ; i++ ) + if ( s.charAt(i) == t.charAt(i) ) score--; + return (double)score/(double)max ; } - // JE: 30/05/2005: this has not been tested - public static int ngramDistance(String s, String t) { + /* + * jaroMeasure as a dissimilarity (identical have 0.) + * return: + * Original algorithm by Jérôme Euzenat. + * It traverses both strings at the same time + * finding the first match on s, then looking for the first on t + * (and deciding if there is transposition or not on the fly) + * before comming back to "s", etc. + * This is certainly minimal in lines of code if not optimal + */ + public static double jaroMeasure (String s, String t) { + if (s == null || t == null) { + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; + } + int l1 = s.length(); // length of s + int l2 = t.length(); // length of t + int span = Math.min(l1, l2)/2; // vicinity of search + int i = 0; // index on s + int j = 0; // index on t + int comps = 0; // nb of char in s, close in t + int compt = 0; // nb of char in t, close in s + int transp = 0; // nb of char NOT transposed + // i.e., nb of char in s appearing in the same order in t + char lastchars = '\0'; // last matched char in s + while( i < l1 || j < l2 ){ + if ( ( j < l2 && comps > compt ) || i > l1 ){ + // find a new match in compt + for ( int k = Math.max(0,j-span); k < Math.min(l1,j+span); k++){ + if ( t.charAt(j) == s.charAt(k) ){ + compt++; + if ( t.charAt(j) == lastchars ) transp++; + k = l1; + } + } + j++; + } else if ( i == l1 ) { // end of s + lastchars = '\0'; // avoid matching with it + i = l1+1; // so we will go to the previous clause now + } else { // comps + for ( int k = Math.max(0,i-span); k < Math.min(l2,i+span); k++){ + if ( t.charAt(k) == s.charAt(i) ){ + comps++; + lastchars = s.charAt(i); + k = l2; + } + } + i++; + } + } + if ( comps == 0. ) return 1.; + else return 1.0 - ((double)comps/l1 + (double)compt/l2 + (double)transp/comps)/3.; + } + + /* + * jaroWinklerMeasure + * return: + */ + public static double jaroWinklerMeasure (String s, String t) { + int PREFIX = 4; + double jaro = jaroMeasure( s, t ); + //int P = Math.max( PREFIX );// length or larger prefix + return jaro + (double)PREFIX*(1 - jaro)/10; + } + + /* + * ngrammDistance + * In fact 3-gramm distance + * return: the ratio between the number of common n-grams over the + * total number of n-gramms in both strings. + */ + public static double ngramDistance(String s, String t) { int n = 3; // tri-grams for the moment if (s == null || t == null) { - throw new IllegalArgumentException("Strings must not be null"); + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; } + int l1 = s.length()-n+1; + int l2 = t.length()-n+1; int found = 0; - for( int i=0; i < s.length()-n ; i++ ){ - for( int j=0; j < t.length()-n; j++){ + for( int i=0; i < l1 ; i++ ){ + for( int j=0; j < l2; j++){ int k = 0; - for( ; (k<n) && s.charAt(i+k)==t.charAt(j+k); k++); + for( ; ( k < n ) && ( s.charAt(i+k) == t.charAt(j+k) ); k++); if ( k == n ) found++; } } - return found; + return 1.0 - (2*((double)found)/((double)(l1+l2))); + } + + public static double levenshteinDistance (String s, String t) { + return needlemanWunchDistance( s, t, 1 ); + } + + public static double needlemanWunch2Distance (String s, String t) { + return needlemanWunchDistance( s, t, 2 ); } /* Pointer was provided in Todd Hugues (Lockheed) @@ -117,11 +228,13 @@ public class StringDistances { This algorithm should be taken appart of this file and reset in the context of a proper package name with an acceptable license terms. Hopefully, Jakarta Commons will provide this. + Modified by Jérôme Euzenat for returning normalized distance. + Modified again by Jérôme Euzenat for computing needleman-wunch. */ - - public static int levenshteinDistance (String s, String t) { + public static double needlemanWunchDistance (String s, String t, int gap) { if (s == null || t == null) { - throw new IllegalArgumentException("Strings must not be null"); + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; } /* The difference between this impl. and the previous is that, rather @@ -148,8 +261,8 @@ public class StringDistances { int n = s.length(); // length of s int m = t.length(); // length of t - if (n == 0) return m; - else if (m == 0) return n; + if (n == 0) return 1.; + else if (m == 0) return 1.; int p[] = new int[n+1]; //'previous' cost array, horizontally int d[] = new int[n+1]; // cost array, horizontally @@ -173,7 +286,7 @@ public class StringDistances { cost = s.charAt(i-1)==t_j ? 0 : 1; // minimum of cell to the left+1, to the top+1, // diagonally left and up +cost - d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); + d[i] = Math.min(Math.min(d[i-1]+gap, p[i]+gap), p[i-1]+cost); } // copy current distance counts to 'previous row' distance counts @@ -185,26 +298,30 @@ public class StringDistances { // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts //System.err.println(s+" x "+t+" = "+p[n]); - return p[n]; + //return p[n]; + //return (double)p[n] / (double)Math.max( n, m ); + int min = Math.min( n, m ); + int diff = Math.max( n, m ) - min; + return (double)p[n] / (double)(min + diff*gap); } - /** + /* + * smoaDistance + * A specialized distance for ontology matching identifiers * Calls the string matching method proposed in the paper * "A String Metric For Ontology Alignment", published in ISWC 2005 * It is implemented in a separate class provided by the authors and * available with this package - * JE: question: - * ISub seems to be a distance: if both strings are empty, it returns 0, if only one is, it returns 1. But its final statement is: - * return commonality - dissimilarity + winklerImprovement; - * which really looks like a similarity! + * returns commonality - dissimilarity + winklerImprovement; */ public static double smoaDistance (String s1, String s2) { if (s1 == null || s2 == null) { - throw new IllegalArgumentException("Strings must not be null"); + //throw new IllegalArgumentException("Strings must not be null"); + return 1.; } - ISub metrics = new ISub(); - return metrics.score( s1, s2 ); + + return 1.0 - (double)metrics.score( s1, s2 ); } } -- GitLab