diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java index 53847721709971cd8110fb2cbf85ace05ddc4091..d6e9fa7a8f93f40506d51f7831f2cfcaa794001e 100644 --- a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java +++ b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java @@ -141,7 +141,7 @@ public class LinkkeyExtraction { classes.setArgs(0); options.addOption(classes); - Option classesfull = new Option("classesfull", "extracts link keys candidates with classes full"); + Option classesfull = new Option("classesfull", "extracts link keys candidates with classes full (may be very expensive)"); classesfull.setRequired(false); classesfull.setArgs(0); options.addOption(classesfull); @@ -173,7 +173,7 @@ public class LinkkeyExtraction { sparql.setType(java.lang.String.class); options.addOption(sparql); - Option compos = new Option("c", "considers composition of properties"); + Option compos = new Option("c", "compose properties"); compos.setRequired(false); compos.setArgName("composition length"); compos.setArgs(1); @@ -198,7 +198,7 @@ public class LinkkeyExtraction { eval.setType(java.lang.String.class); options.addOption(eval); - Option inverse = new Option("i", "considers inverse of properties"); + Option inverse = new Option("i", "considers inverse of properties (only useful with -c)"); inverse.setArgName("i"); inverse.setArgs(0); inverse.setType(java.lang.String.class); diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/normalizers/StringNormalizer.java b/src/main/java/fr/inrialpes/exmo/linkkey/normalizers/StringNormalizer.java index 2176c7cd4ae8c89e601fd04a0625d8aa63ee27d8..91ef2e388dd931eaf3f3ca7237e0ce0b8ede78e8 100644 --- a/src/main/java/fr/inrialpes/exmo/linkkey/normalizers/StringNormalizer.java +++ b/src/main/java/fr/inrialpes/exmo/linkkey/normalizers/StringNormalizer.java @@ -26,6 +26,7 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; +import java.util.regex.Pattern; import org.apache.jena.datatypes.BaseDatatype; import org.apache.jena.datatypes.xsd.XSDDateTime; import org.apache.jena.datatypes.xsd.impl.RDFhtml; @@ -64,18 +65,36 @@ public class StringNormalizer extends AbstractNormalizer { // other types (that are not String) if (!(o instanceof String)) { - //System.err.println(o.toString()); + //System.err.println("NO STRING : "+o.toString()); return o.toString(); } String s = (String) o; + // for ISBN-10 or ISBN-13 + // regular expression from O'reilly regular-expressions-cookbook + // ^(?:ISBN(?:-1[03])?:?\ )?(?=[0-9X]{10}$|(?=(?:[0-9]+[-\ ]){3})[-\ 0-9X]{13}$|97[89][0-9]{10}$|(?=(?:[0-9]+[-\ ]){4})[-\ 0-9]{17}$)(?:97[89][-\ ]?)?[0-9]{1,5}[-\ ]?[0-9]+[-\ ]?[0-9]+[-\ ]?[0-9X]$ + // be careful to apply it before parsing number because an isbn is parsed as a number. + //if (s.matches("([0-9]+-)+[0-9]+")) { + if (s.matches("^(?:ISBN(?:-1[03])?:?\\ )?(?=[0-9X]{10}$|(?=(?:[0-9]+[-\\ ]){3})[-\\ 0-9X]{13}$|97[89][0-9]{10}$|(?=(?:[0-9]+[-\\ ]){4})[-\\ 0-9]{17}$)(?:97[89][-\\ ]?)?[0-9]{1,5}[-\\ ]?[0-9]+[-\\ ]?[0-9]+[-\\ ]?[0-9X]$")) { + //System.err.println("ISBN : "+s); + return s; + } + // try if the string is a number - try { - String res = NumberFormat.getNumberInstance().parse(s).toString(); - return res; - } catch (ParseException e) {} - + if (s.matches("^-?(([0-9]+)|([0-9]*\\.[0-9]*))$")) { + try { + String res = NumberFormat.getNumberInstance().parse(s).toString(); + return res; + } catch (ParseException e) {} + } + + /*try { + String res = NumberFormat.getNumberInstance().parse(s).toString(); + System.err.println("Number missed : "+s); + } catch (ParseException e) {} + */ + //return s; if (s.contains("://") || s.contains("/")) { try {