From 050b1a9803df0378a1af9faeb8bd804402c06091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Euzenat?= <Jerome.Euzenat@inria.fr> Date: Sat, 19 Nov 2011 15:47:35 +0000 Subject: [PATCH] - Weieghted group evaluation (dates back from 2010) --- .../inrialpes/exmo/align/util/WGroupEval.java | 586 ++++++++++++++++++ 1 file changed, 586 insertions(+) create mode 100644 src/fr/inrialpes/exmo/align/util/WGroupEval.java diff --git a/src/fr/inrialpes/exmo/align/util/WGroupEval.java b/src/fr/inrialpes/exmo/align/util/WGroupEval.java new file mode 100644 index 00000000..b67c5cec --- /dev/null +++ b/src/fr/inrialpes/exmo/align/util/WGroupEval.java @@ -0,0 +1,586 @@ +/* + * $Id$ + * + * Copyright (C) 2003 The University of Manchester + * Copyright (C) 2003 The University of Karlsruhe + * Copyright (C) 2003-2011, INRIA + * Copyright (C) 2004, Université de Montréal + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA. + */ + +/* This program evaluates the results of several ontology aligners in a row. +*/ +package fr.inrialpes.exmo.align.util; + +import org.semanticweb.owl.align.Alignment; +import org.semanticweb.owl.align.Evaluator; + +import fr.inrialpes.exmo.align.impl.BasicParameters; +import fr.inrialpes.exmo.align.impl.eval.WeightedPREvaluator; + +import fr.inrialpes.exmo.ontowrap.OntologyFactory; +import fr.inrialpes.exmo.ontowrap.OntowrapException; + +import java.io.File; +import java.io.PrintStream; +import java.io.FileOutputStream; +import java.lang.Integer; +import java.util.Hashtable; +import java.util.Vector; +import java.util.Enumeration; +import java.util.Arrays; +import java.util.Formatter; + +import org.xml.sax.SAXException; + +import gnu.getopt.LongOpt; +import gnu.getopt.Getopt; + +import fr.inrialpes.exmo.align.parser.AlignmentParser; + +/** A basic class for synthesizing the results of a set of alignments provided by + different algorithms. The output is a table showing various classical measures + for each test and for each algorithm. Average is also computed as Harmonic means. + + <pre> + java -cp procalign.jar fr.inrialpes.exmo.align.util.WGroupEval [options] + </pre> + + where the options are: + <pre> + -o filename --output=filename + -f format = prfot (precision/recall/f-measure/overall/time) --format=prfot + -d debug --debug=level + -r filename --reference=filename + -s algo/measure + -l list of compared algorithms + -t output --type=output: xml/tex/html/ascii + </pre> + + The input is taken in the current directory in a set of subdirectories (one per + test which will be rendered by a line) each directory contains a number of + alignment files (one per algorithms which will be renderer as a column). + + If output is requested (<CODE>-o</CODE> flags), then output will be written to + <CODE>output</CODE> if present, stdout by default. + +<pre> +$Id$ +</pre> + +@author Sean K. Bechhofer +@author Jérôme Euzenat + */ + +public class WGroupEval { + + BasicParameters params = null; + String filename = null; + String reference = "refalign.rdf"; + String format = "pr"; + int fsize = 2; + String type = "html"; + boolean embedded = false; + String dominant = "s"; + Vector<String> listAlgo = null; + int debug = 0; + String color = null; + String ontoDir = null; + + public static void main(String[] args) { + try { new WGroupEval().run( args ); } + catch (Exception ex) { ex.printStackTrace(); }; + } + + public void run(String[] args) throws Exception { + String listFile = ""; + LongOpt[] longopts = new LongOpt[10]; + + longopts[0] = new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'); + longopts[1] = new LongOpt("output", LongOpt.REQUIRED_ARGUMENT, null, 'o'); + longopts[2] = new LongOpt("format", LongOpt.REQUIRED_ARGUMENT, null, 'f'); + longopts[3] = new LongOpt("type", LongOpt.REQUIRED_ARGUMENT, null, 't'); + longopts[4] = new LongOpt("debug", LongOpt.OPTIONAL_ARGUMENT, null, 'd'); + longopts[5] = new LongOpt("sup", LongOpt.REQUIRED_ARGUMENT, null, 's'); + longopts[6] = new LongOpt("list", LongOpt.REQUIRED_ARGUMENT, null, 'l'); + longopts[7] = new LongOpt("color", LongOpt.OPTIONAL_ARGUMENT, null, 'c'); + longopts[8] = new LongOpt("reference", LongOpt.REQUIRED_ARGUMENT, null, 'r'); + longopts[9] = new LongOpt("directory", LongOpt.REQUIRED_ARGUMENT, null, 'w'); + + Getopt g = new Getopt("", args, "ho:a:d::l:f:t:r:w:c::", longopts); + int c; + String arg; + + while ((c = g.getopt()) != -1) { + switch (c) { + case 'h' : + usage(); + return; + case 'o' : + /* Write output here */ + filename = g.getOptarg(); + break; + case 'r' : + /* File name for the reference alignment */ + reference = g.getOptarg(); + break; + case 'f' : + /* Sequence of results to print */ + format = g.getOptarg(); + break; + case 't' : + /* Type of output (tex/html/xml/ascii) */ + type = g.getOptarg(); + break; + case 's' : + /* Print per type or per algo */ + dominant = g.getOptarg(); + break; + case 'c' : + /* Print colored lines */ + arg = g.getOptarg(); + if ( arg != null ) { + color = arg.trim(); + } else color = "lightblue"; + break; + case 'l' : + /* List of filename */ + listFile = g.getOptarg(); + break; + case 'd' : + /* Debug level */ + arg = g.getOptarg(); + if ( arg != null ) debug = Integer.parseInt(arg.trim()); + else debug = 4; + break; + case 'w' : + /* Use the given ontology directory */ + arg = g.getOptarg(); + if ( arg != null ) ontoDir = g.getOptarg(); + else ontoDir = null; + break; + } + } + + listAlgo = new Vector<String>(); + for ( String s : listFile.split(",") ) { + listAlgo.add( s ); + } + + params = new BasicParameters(); + if (debug > 0) params.setParameter( "debug", Integer.toString( debug-1 ) ); + + print( iterateDirectories() ); + } + + public Vector<Vector> iterateDirectories (){ + Vector<Vector> result = null; + File [] subdir = null; + try { + if (ontoDir == null) { + subdir = (new File(System.getProperty("user.dir"))).listFiles(); + } else { + subdir = (new File(ontoDir)).listFiles(); + } + } catch (Exception e) { + System.err.println("Cannot stat dir "+ e.getMessage()); + usage(); + } + int size = subdir.length; + Arrays.sort(subdir); + result = new Vector<Vector>(size); + int i = 0; + for ( int j=0 ; j < size; j++ ) { + if( subdir[j].isDirectory() ) { + if ( debug > 0 ) System.err.println("\nEntering directory "+subdir[j]); + // eval the alignments in a subdirectory + // store the result + Vector vect = iterateAlignments( subdir[j] ); + if ( vect != null ){ + result.add(i, vect); + i++; + } + } + } + return result; + } + + public Vector<Object> iterateAlignments ( File dir ) { + String prefix = dir.toURI().toString()+"/"; + Vector<Object> result = new Vector<Object>(); + boolean ok = false; + result.add(0,(Object)dir.getName().toString()); + int i = 0; + // for all alignments there, + for ( String m: listAlgo ) { + i++; + // call eval + // store the result in a record + // return the record. + if ( debug > 2) System.err.println(" Considering result "+i); + Evaluator evaluator = eval( prefix+reference, prefix+m+".rdf"); + if ( evaluator != null ) ok = true; + result.add( i, evaluator ); + } + // Unload the ontologies. + try { + OntologyFactory.clear(); + } catch ( OntowrapException owex ) { // only report + owex.printStackTrace(); + } + + if ( ok == true ) return result; + else return null; + } + + public Evaluator eval( String alignName1, String alignName2 ) { + Evaluator eval = null; + try { + int nextdebug; + if ( debug < 2 ) nextdebug = 0; + else nextdebug = debug - 2; + // Load alignments + AlignmentParser aparser = new AlignmentParser( nextdebug ); + Alignment align1 = aparser.parse( alignName1 ); + if ( debug > 2 ) System.err.println(" Alignment structure1 parsed"); + aparser.initAlignment( null ); + Alignment align2 = aparser.parse( alignName2 ); + if ( debug > 2 ) System.err.println(" Alignment structure2 parsed"); + // Create evaluator object + eval = new WeightedPREvaluator( align1, align2 ); + // Compare + params.setParameter( "debug", Integer.toString( nextdebug ) ); + eval.eval( params ) ; + } catch (Exception ex) { + if ( debug > 1 ) { + ex.printStackTrace(); + } else { + System.err.println("WGroupEval: "+ex); + System.err.println(alignName1+ " - "+alignName2 ); + } + }; + return eval; + } + + /** + * This does not only print the results but compute the average as well + */ + public void print( Vector<Vector> result ) { + if ( type.equals("html") ) printHTML( result ); + else if ( type.equals("tex") ) printLATEX( result ); + else if ( type.equals("triangle") ) printTRIANGLE( result ); + } + + public void printTRIANGLE( Vector<Vector> result ) { + // variables for computing iterative harmonic means + double expected = 0.; // expected so far + double foundVect[]; // found so far + double correctVect[]; // correct so far + long timeVect[]; // time so far + foundVect = new double[ listAlgo.size() ]; + correctVect = new double[ listAlgo.size() ]; + timeVect = new long[ listAlgo.size() ]; + for( int k = listAlgo.size()-1; k >= 0; k-- ) { + foundVect[k] = 0.; + correctVect[k] = 0.; + timeVect[k] = 0; + } + for ( Vector test : result ) { + int nexpected = -1; + Enumeration f = test.elements(); + // Too bad the first element must be skipped + f.nextElement(); + for( int k = 0 ; f.hasMoreElements() ; k++) { + WeightedPREvaluator eval = (WeightedPREvaluator)f.nextElement(); + if ( eval != null ){ + // iterative H-means computation + if ( nexpected == -1 ){ + nexpected = 0; + expected += eval.getExpected(); + } + foundVect[k] += eval.getFound(); + correctVect[k] += eval.getCorrect(); + timeVect[k] += eval.getTime(); + } + } + } + System.out.println("\\documentclass[11pt]{book}"); + System.out.println(); + System.out.println("\\usepackage{pgf}"); + System.out.println("\\usepackage{tikz}"); + System.out.println(); + System.out.println("\\begin{document}"); + System.out.println("\\date{today}"); + System.out.println(""); + System.out.println("\n%% Plot generated by GenPlot of alignapi"); + System.out.println("\\begin{tikzpicture}[cap=round]"); + System.out.println("% Draw grid"); + System.out.println("\\draw[step=1cm,very thin,color=gray] (-0.2,-0.2) grid (10.0,9.0);"); + System.out.println("\\draw[|-|] (-0,0) -- (10,0);"); + System.out.println("%\\draw[dashed,very thin] (0,0) -- (5,8.66) -- (10,0);"); + System.out.println("\\draw[dashed,very thin] (10,0) arc (0:60:10cm);"); + System.out.println("\\draw[dashed,very thin] (0,0) arc (180:120:10cm);"); + + System.out.println("\\draw (0,-0.3) node {$recall$}; "); + System.out.println("\\draw (10,-0.3) node {$precision$}; "); + //System.out.println("\\draw (0,-0.3) node {0.}; "); + //System.out.println("\\draw (10,-0.3) node {1.}; "); + System.out.println("% Plots"); + int k = 0; + for ( String m: listAlgo ) { + double precision = (double)correctVect[k]/foundVect[k]; + double recall = (double)correctVect[k]/expected; + double prec2 = precision*precision; + double a = ((prec2-(recall*recall)+1)/2); + double b = java.lang.Math.sqrt( prec2 - (a*a) ); + a = a*10; b = b*10; //for printing scale 10. + System.out.println("\\draw plot[mark=+,] coordinates {("+a+","+b+")};"); + System.out.println("\\draw ("+(a+.01)+","+(b+.01)+") node[anchor=south west] {"+m+"};"); + k++; + } + System.out.println("\\end{tikzpicture}"); + System.out.println(); + System.out.println("\\end{document}"); + } + + public void printLATEX( Vector result ) { + } + + /* A few comments on how and why computing "weighted harmonic means" + (Jérôme Euzenat) + +Let Ai be the found alignment for test i, let Ri be the reference alignment for test i. +Let |A| be the size of A, i.e., the number of correspondences. + +Let P(Ri,Ai) and R(Ri,Ai) being precision and recall respectively. + +Arithmetic means is \Sum{i=1}{n} P(Ri,Ai) / n and \Sum{i=1}{n} R(Ri,Ai) / n. + +Weighted harmonic means is + +\Sum{i=1}{n} Wi / \Sum{i=1}{n} (Wi/P(Ri,Ai)) +and +\Sum{i=1}{n} Wi / \Sum{i=1}{n} (Wi/R(Ri,Ai)) + +The goal of using it is that the result be the Precision and Recall of all tests (and not the average precision and recall). + +If we take Wi = |Ai\cap Ri| +Then we have exactly this result: + +\Sum{i=1}{n} Wi / \Sum{i=1}{n} (Wi/P(Ri,Ai)) + = P( \cup{i=1}{n} Ri, \cup{i=1}{n} Ai ) +(here no two correspondences are equivalent so \cup is a disjunct sum). + +[[you can replace Wi by kilometers, Precision by kilometers-per-hour +or you can do the test by yourself to convince you that this is true]] + +So our goal is to compute the weighted harmonic means with these weights because this will provide us the true precision and recall. + +In fact what the algorithm does is not to compute the harmonic means! I rephrase it, it computes the harmonic means of the numbers above it but since this is equivalent to computing precision and recall, it just computes it! + +How? +For each column k in the table (corresponding to an algorithm), it maintains two vectors: +correctVect[k] and foundVect[k] +which is equal to \Sum{i=1}{n} |Ai\cap Ri| and \Sim{i=1}{n} |Ai| +and it additionally stores in "expected" the size of \Sum{i=1}{n} |Ri| + +So computing the average means of these columns, with the weights corresponding respectively to the size |Ai\cup Ri|, corresponds to computing: + + correctVect[k] / foundVect[k] +and + correctVect[k] / expected + +which the program does... + */ + public void printHTML( Vector<Vector> result ) { + // variables for computing iterative harmonic means + int expected = 0; // expected so far + int foundVect[]; // found so far + int correctVect[]; // correct so far + long timeVect[]; // time so far + PrintStream writer = null; + fsize = format.length(); + // JE: the writer should be put out + // JE: the h-means computation should be put out as well + try { + // Print result + if ( filename == null ) { + writer = System.out; + } else { + writer = new PrintStream(new FileOutputStream( filename )); + } + Formatter formatter = new Formatter(writer); + + // Print the header + if ( embedded != true ) writer.println("<html><head></head><body>"); + writer.println("<table border='2' frame='sides' rules='groups'>"); + writer.println("<colgroup align='center' />"); + // for each algo <td spancol='2'>name</td> + for ( String m : listAlgo ) { + writer.println("<colgroup align='center' span='"+fsize+"' />"); + } + // For each file do a + writer.println("<thead valign='top'><tr><th>algo</th>"); + // for each algo <td spancol='2'>name</td> + for ( String m : listAlgo ) { + writer.println("<th colspan='"+fsize+"'>"+m+"</th>"); + } + writer.println("</tr></thead><tbody><tr><td>test</td>"); + // for each algo <td>Prec.</td><td>Rec.</td> + for ( String m : listAlgo ) { + for ( int i = 0; i < fsize; i++){ + writer.print("<td>"); + if ( format.charAt(i) == 'p' ) { + writer.print("Prec."); + } else if ( format.charAt(i) == 'f' ) { + writer.print("FMeas."); + } else if ( format.charAt(i) == 'o' ) { + writer.print("Over."); + } else if ( format.charAt(i) == 't' ) { + writer.print("Time"); + } else if ( format.charAt(i) == 'r' ) { + writer.print("Rec."); + } + writer.println("</td>"); + } + //writer.println("<td>Prec.</td><td>Rec.</td>"); + } + writer.println("</tr></tbody><tbody>"); + foundVect = new int[ listAlgo.size() ]; + correctVect = new int[ listAlgo.size() ]; + timeVect = new long[ listAlgo.size() ]; + for( int k = listAlgo.size()-1; k >= 0; k-- ) { + foundVect[k] = 0; + correctVect[k] = 0; + timeVect[k] = 0; + } + // </tr> + // For each directory <tr> + boolean colored = false; + for ( Vector test : result ) { + int nexpected = -1; + if ( colored == true && color != null ){ + colored = false; + writer.println("<tr bgcolor=\""+color+"\">"); + } else { + colored = true; + writer.println("<tr>"); + }; + // Print the directory <td>bla</td> + writer.println("<td>"+(String)test.get(0)+"</td>"); + // For each record print the values <td>bla</td> + Enumeration f = test.elements(); + f.nextElement(); + for( int k = 0 ; f.hasMoreElements() ; k++) { + WeightedPREvaluator eval = (WeightedPREvaluator)f.nextElement(); + if ( eval != null ){ + // iterative H-means computation + if ( nexpected == -1 ){ + expected += eval.getExpected(); + nexpected = 0; + } + foundVect[k] += eval.getFound(); + correctVect[k] += eval.getCorrect(); + timeVect[k] += eval.getTime(); + + for ( int i = 0 ; i < fsize; i++){ + writer.print("<td>"); + if ( format.charAt(i) == 'p' ) { + formatter.format("%1.2f", eval.getPrecision()); + } else if ( format.charAt(i) == 'f' ) { + formatter.format("%1.2f", eval.getFmeasure()); + } else if ( format.charAt(i) == 'o' ) { + formatter.format("%1.2f", eval.getOverall()); + } else if ( format.charAt(i) == 't' ) { + if ( eval.getTime() == 0 ){ + writer.print("-"); + } else { + formatter.format("%1.2f", eval.getTime()); + } + } else if ( format.charAt(i) == 'r' ) { + formatter.format("%1.2f", eval.getRecall()); + } + writer.println("</td>"); + } + } else { + writer.println("<td>n/a</td><td>n/a</td>"); + } + } + writer.println("</tr>"); + } + writer.print("<tr bgcolor=\"yellow\"><td>H-mean</td>"); + // Here we are computing a sheer average. + // While in the column results we print NaN when the returned + // alignment is empty, + // here we use the real values, i.e., add 0 to both correctVect and + // foundVect, so this is OK for computing the average. + int k = 0; + // ??? + for ( String m : listAlgo ) { + double precision = (double)correctVect[k]/foundVect[k]; + double recall = (double)correctVect[k]/expected; + for ( int i = 0 ; i < fsize; i++){ + writer.print("<td>"); + if ( format.charAt(i) == 'p' ) { + formatter.format("%1.2f", precision); + } else if ( format.charAt(i) == 'f' ) { + formatter.format("%1.2f", 2 * precision * recall / (precision + recall)); + } else if ( format.charAt(i) == 'o' ) { + formatter.format("%1.2f", recall * (2 - (1 / precision))); + } else if ( format.charAt(i) == 't' ) { + if ( timeVect[k] == 0 ){ + writer.print("-"); + } else { + formatter.format("%1.2f", timeVect[k]); + } + } else if ( format.charAt(i) == 'r' ) { + formatter.format("%1.2f", recall); + } + writer.println("</td>"); + }; + k++; + } + writer.println("</tr>"); + writer.println("</tbody></table>"); + writer.println("<p><small>n/a: result alignment not provided or not readable<br />"); + writer.println("NaN: division per zero, likely due to empty alignment.</small></p>"); + if ( embedded != true ) writer.println("</body></html>"); + writer.close(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + public void usage() { + System.out.println("usage: WGroupEval [options]"); + System.out.println("options are:"); + System.out.println("\t--format=prfot -r prfot\tSpecifies the output order (precision/recall/f-measure/overall/time)"); + // Apparently not implemented + //System.out.println("\t--sup=algo -s algo\tSpecifies if dominant columns are algorithms or measure"); + System.out.println("\t--output=filename -o filename\tSpecifies a file to which the output will go"); + System.out.println("\t--reference=filename -r filename\tSpecifies the name of the reference alignment file (default: refalign.rdf)"); + + System.out.println("\t--type=html|xml|tex|ascii|triangle -t html|xml|tex|ascii\tSpecifies the output format"); + System.out.println("\t--list=algo1,...,algon -l algo1,...,algon\tSequence of the filenames to consider"); + System.out.println("\t--color=color -c color\tSpecifies if the output must color even lines of the output"); + System.out.println("\t--debug[=n] -d [n]\t\tReport debug info at level n"); + System.out.println("\t--help -h\t\t\tPrint this message"); + System.err.print("\n"+WGroupEval.class.getPackage().getImplementationTitle()+" "+WGroupEval.class.getPackage().getImplementationVersion()); + System.err.println(" ($Id$)\n"); + } +} + -- GitLab