diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/CombinationExperiments.java b/src/main/java/fr/inrialpes/exmo/linkkey/CombinationExperiments.java index d7e49e3f659a29ebca6d4fb6407941c4a41d4c5e..050f396af34d1876fa6b0d1dd91093b0d8da0142 100644 --- a/src/main/java/fr/inrialpes/exmo/linkkey/CombinationExperiments.java +++ b/src/main/java/fr/inrialpes/exmo/linkkey/CombinationExperiments.java @@ -223,7 +223,7 @@ public class CombinationExperiments { if (res==0) { res = c1.getLinksSize()-c2.getLinksSize(); if (res==0) { - return (c1.equals(c2)) ? 0 : -1; + return (c1.equals(c2)) ? 0 : c1.toString().compareTo(c2.toString()); } } } @@ -252,7 +252,7 @@ public class CombinationExperiments { if (res==0) { res = c1.getLinksSize() - c2.getLinksSize(); if (res == 0) { - return (c1.equals(c2)) ? 0 : -1; + return (c1.equals(c2)) ? 0 : c1.toString().compareTo(c2.toString()); } } } @@ -319,7 +319,7 @@ public class CombinationExperiments { result=m.getRecall(c1)-m.getRecall(c2); } if (result==0) { - return 1; + return c1.toString().compareTo(c2.toString()); } else if (result>0) { return 1; } @@ -342,7 +342,7 @@ public class CombinationExperiments { out.println("################################################################################"); - out.println("# Chains optimizing the Fmeasure (in case of exaequo then precision, then recall)"); + out.println("# Chains optimizing the F-measure (in case of exaequo then precision, then recall)"); out.println("################################################################################"); refineBestAntichainDisplayPR(res, new FmeasureComp(m),1, 10, out); @@ -961,9 +961,11 @@ public class CombinationExperiments { // generate antichains of length 1 for (CandidateLinkkey c : result.getCandidates()) { - AntichainDisj chain = new AntichainDisj(c); - candidates.add(chain); - nbAntichains+=1; + if (result.getTop()!=c) { + AntichainDisj chain = new AntichainDisj(c); + candidates.add(chain); + nbAntichains+=1; + } } // build the list of candidates ordered by comp diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyDiscoveryAlgorithm.java b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyDiscoveryAlgorithm.java index 40d1e20537fce938f4b09c5f1331c01f01a2cb6e..5bdc725aa01350b059af10f2fdd592c2effb27f1 100644 --- a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyDiscoveryAlgorithm.java +++ b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyDiscoveryAlgorithm.java @@ -905,6 +905,7 @@ public class LinkkeyDiscoveryAlgorithm { LongSet eq = descriptions.getEqPairs(); CandidateLinkkey bottom = new CandidateLinkkey(eq,in, Collections.emptySet(), Collections.emptySet()); + // CandidateLinkkey bottom = new CandidateLinkkey(eq,in, Collections.emptySet(), Collections.emptySet()); Iterator<Long2ObjectMap.Entry<LongSet[]>> it = descriptions.entryIterator(); while (it.hasNext()) { @@ -945,7 +946,6 @@ public class LinkkeyDiscoveryAlgorithm { c.addSpecificLink(link); } - ExtractionResult res = new ExtractionResult(bottom.getTop(), pIdxDS1.getStrings() , pIdxDS2.getStrings(), uriIdx.getStrings(), config.typesDS1, config.typesDS2, diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java index 2aa6c497152a6118310e470d2dc4ac7c6a167350..0c8b36ff8ddf89d1a57eca86271decf73ee52e1d 100644 --- a/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java +++ b/src/main/java/fr/inrialpes/exmo/linkkey/LinkkeyExtraction.java @@ -325,9 +325,9 @@ public class LinkkeyExtraction { Paths.get(dataset2)); r.run(); - } 
/*else if ("expeclust".equals(format)) { - ExpeClust.clusteringExpe(res,Paths.get(outputFilename)); - }*/ else { + } else if ("expeclust".equals(format)) { + fr.inrialpes.exmo.linkkey.expeclust.ExpeClust.clusteringExpe(res,Paths.get(outputFilename)); + } else { PrintWriter out = new PrintWriter(System.out); if (commandLine.hasOption('o')) { out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputFilename))); @@ -343,10 +343,13 @@ public class LinkkeyExtraction { CombinationExperiments.expeCoverage(res, out); }/* else if ("txt2".equals(format)) { renderTXT2(res, true, out); - } - else if ("hclust".equals(format)) { - ClusteringInterface.clustering(res); }*/ + else if ("hclust".equals(format)) { + fr.inrialpes.exmo.linkkey.expeclust.ClusteringInterface.clustering(res); + } + else if ("viz".equals(format)) { + fr.inrialpes.exmo.linkkey.viz.LKLatticeViz.draw(res); + } else { TxtLinkKeyRenderer r = new TxtLinkKeyRenderer(res); r.render(out); diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/BinaryCluster.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/BinaryCluster.java new file mode 100644 index 0000000000000000000000000000000000000000..138704dc2b2ebf343109fda9e8d87e22566faedb --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/BinaryCluster.java @@ -0,0 +1,119 @@ +package fr.inrialpes.exmo.linkkey.clustering; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; + +import java.util.*; + +public class BinaryCluster extends Cluster { + + public final Cluster c1; + public final Cluster c2; + private List<CandidateLinkkey> lkSet; + + private Map<Cluster,Double> sims; + + private double[] sumSims; + + public BinaryCluster(LKHClust clust,double sim, Cluster c1, Cluster c2) { + super(clust,sim,c1.nb+c2.nb); + this.c1=c1; + this.c2=c2; + lkSet=new ArrayList<>();//(c1.getLK(),c2.getLK()); + lkSet.addAll(c1.getLK()); + lkSet.addAll(c2.getLK()); + sims = new HashMap<>(); + //sumSims(); + } + + + @Override + public CandidateLinkkey getSup() { + CandidateLinkkey c1Sup = c1.getSup(); + CandidateLinkkey c2Sup = c2.getSup(); + if (c1Sup==null || c2Sup==null) return null; + CandidateLinkkey sup = c1.getSup().getSup(c2.getSup()); + if (!lkSet.contains(sup)) return null; + return sup; + } + + @Override + public CandidateLinkkey getInf() { + CandidateLinkkey c1Inf = c1.getInf(); + CandidateLinkkey c2Inf = c2.getInf(); + if (c1Inf==null || c2Inf==null) return null; + CandidateLinkkey inf = c1.getInf().getInf(c2.getInf()); + if (!lkSet.contains(inf)) return null; + return inf; + } + + + + protected double[] sumSims() { + if (sumSims==null) { + c1.getMedoid(); c2.getMedoid(); + sumSims=new double[c1.nb+c2.nb]; + System.arraycopy(c1.sumSims(),0,sumSims,0,c1.nb); + System.arraycopy(c2.sumSims(),0,sumSims,c1.nb,c2.nb); + if (c1 instanceof BinaryCluster) ((BinaryCluster) c1).sumSims=null; + if (c2 instanceof BinaryCluster) ((BinaryCluster) c2).sumSims=null; + + int i=0; + for (CandidateLinkkey l1 : c1.getLK()) { + int j=0; + for (CandidateLinkkey l2 : c2.getLK()) { + double s = clust.getSim(l1,l2); + sumSims[i]+=s; + sumSims[c1.nb+j]+=s; + j++; + } + i++; + } + } + return sumSims; + } + + public void clear() { + sims=null; + } + + public double getSim(Cluster c) { + Double res = sims.get(c); + if (res ==null) { + if (c instanceof BinaryCluster && ((BinaryCluster) c).sims.containsKey(this)) { + return ((BinaryCluster) c).sims.get(this); + } + res = Math.min(c1.getSim(c),c2.getSim(c)); + //res = Math.max(c1.getSim(c),c2.getSim(c)); + sims.put(c,res); + } + return res; + } 
+
+    public void addToList(List<LeafCluster> l) {
+        c1.addToList(l);
+        c2.addToList(l);
+    }
+
+    protected void cutAt(double sim, Set<Cluster> selected) {
+        if (this.sim>=sim) {
+            selected.add(this);
+        }
+        else {
+            c1.cutAt(sim,selected);
+            c2.cutAt(sim,selected);
+        }
+    }
+
+
+    @Override
+    protected void getAllCluster(List<Cluster> l) {
+        l.add(this);
+        c1.getAllCluster(l);
+        c2.getAllCluster(l);
+    }
+
+    @Override
+    public List<CandidateLinkkey> getLK() {
+        return Collections.unmodifiableList(lkSet);
+    }
+}
diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ClosureSimilarity.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ClosureSimilarity.java
new file mode 100644
index 0000000000000000000000000000000000000000..0784e07ab7416ac5f55dcd80f6078e1fca92afef
--- /dev/null
+++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ClosureSimilarity.java
@@ -0,0 +1,26 @@
+package fr.inrialpes.exmo.linkkey.clustering;
+
+import fr.inrialpes.exmo.linkkey.CandidateLinkkey;
+
+public class ClosureSimilarity extends ConceptSimilarity {
+
+    public ClosureSimilarity(CandidateLinkkey top, int topSize) {
+        super(top, topSize);
+    }
+
+    @Override
+    public double similarity(CandidateLinkkey c1, CandidateLinkkey c2) {
+        CandidateLinkkey inf =c1.getInf(c2);
+        int inter = inf.getLinksSize();
+        if (inter==0) return 0;
+
+        CandidateLinkkey sup = c1.getSup(c2);
+
+        int union = sup.getLinksSize();
+        // In case of top, use the size of cartesian product
+        if (sup==top) {
+            union=topSize;
+        }
+        return (double) inter / union;
+    }
+}
diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Cluster.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Cluster.java
new file mode 100644
index 0000000000000000000000000000000000000000..818dbb8f526c3d016b85c8d276fba09ee1f663d7
--- /dev/null
+++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Cluster.java
@@ -0,0 +1,142 @@
+package fr.inrialpes.exmo.linkkey.clustering;
+
+import fr.inrialpes.exmo.linkkey.CandidateLinkkey;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public abstract class Cluster {
+
+    public boolean isMerged=false;
+    public final double sim;
+    public final int nb;
+
+    public final LKHClust clust;
+    private CandidateLinkkey medoid;
+
+
+
+    public Cluster(LKHClust clust, double sim, int nb) {
+        this.clust=clust;
+        this.sim=sim;
+        this.nb=nb;
+
+    }
+
+    public abstract CandidateLinkkey getSup();
+    public abstract CandidateLinkkey getInf();
+
+    public boolean isConvexSublattice() {
+        CandidateLinkkey sup = getSup();
+        if (sup==null) return false;
+        CandidateLinkkey inf = getInf();
+        if (inf==null) return false;
+        HashSet<CandidateLinkkey> s = new HashSet<>(getSup().getDescendants());
+        s.retainAll(getInf().getAncestors());
+        s.add(sup);
+        s.add(inf);
+        HashSet<CandidateLinkkey> s2 = new HashSet<>(getLK());
+        return s.equals(s2);
+    }
+
+    public List<Cluster> getAllClusters() {
+        ArrayList<Cluster> res = new ArrayList<>();
+        getAllCluster(res);
+        return res;
+    }
+
+    protected abstract void getAllCluster(List<Cluster> l);
+
+
+    public abstract List<CandidateLinkkey> getLK();
+    public abstract void addToList(List<LeafCluster> l);
+
+    public void clear() {}
+
+    public abstract double getSim(Cluster c) ;
+
+    protected abstract double[] sumSims();
+
+
+    protected abstract void cutAt(double sim, Set<Cluster> selected);
+
+    public Set<Cluster> cutAt(double sim) {
+        HashSet<Cluster> res = new HashSet<>();
+        cutAt(sim,res);
+        return res;
+    }
+
+    public CandidateLinkkey getMedoid()
{ + if (medoid==null) { + int maxIdx=0; + double maxSim=0; + double[] sims = sumSims(); + for (int i=0 ; i < nb ; i++) { + if (maxSim<sims[i] || (maxSim==sims[i] && getLK().get(maxIdx).getLinksSize()<getLK().get(i).getLinksSize())) { + maxSim=sims[i]; + maxIdx=i; + } + } + medoid = getLK().get(maxIdx); + /*List<CandidateLinkkey> l = new ArrayList<>(getLK()); + double[] sims = new double[l.size()]; + ListIterator<CandidateLinkkey> it1 = l.listIterator(); + int i = 0; + while (it1.hasNext()) { + CandidateLinkkey c1 = it1.next(); + ListIterator<CandidateLinkkey> it2 = l.listIterator(it1.nextIndex()); + while (it2.hasNext()) { + CandidateLinkkey c2 = it2.next(); + double s = clust.getSim(c1, c2); + sims[it1.previousIndex()] += s; + sims[it2.previousIndex()] += s; + } + } + + double max = -1; + int maxLinks=0; + // List<CandidateLinkkey> medoids = new LinkedList<>(); + medoid = null; + for (i = 0; i < sims.length; i++) { + if (sims[i] > max ) { + //medoids.clear(); + //medoids.add(l.get(i)); + medoid = l.get(i); + maxLinks = medoid.getLinksSize(); + + max = sims[i]; + } else if (sims[i] == max ) { + int ls = l.get(i).getLinksSize(); + if (ls>maxLinks){ + //medoids.clear(); + //medoids.add(l.get(i)); + medoid = l.get(i); + maxLinks = medoid.getLinksSize(); + } + } + }*/ + + // in case of ex-aequo + /*if (medoids.size()>1) { + it1 = medoids.listIterator(); + while (it1.hasNext()) { + CandidateLinkkey c1 = it1.next(); + Iterator<CandidateLinkkey> it2 = medoids.listIterator(); + while (it2.hasNext()) { + CandidateLinkkey c2 = it2.next(); + if (c1!=c2 && c1.getLinksSize()<c2.getLinksSize()) { + it1.remove(); + break; + } + } + } + // if more than one, too bad we chose one randomly from the remaining + medoid=medoids.iterator().next(); + //if (medoids.size()>1) System.out.println("To solve : Equivalent medoid, sim="+medoids.size()+" "+getLK().size()); + }*/ + } + return medoid; + } +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ConceptSimilarity.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ConceptSimilarity.java new file mode 100644 index 0000000000000000000000000000000000000000..0acc662d1b6cb654834a401827b1cb25c17bcb59 --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/ConceptSimilarity.java @@ -0,0 +1,16 @@ +package fr.inrialpes.exmo.linkkey.clustering; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; + +public abstract class ConceptSimilarity { + + protected final CandidateLinkkey top; + protected final int topSize; + + public ConceptSimilarity(CandidateLinkkey top, int topSize) { + this.top=top; + this.topSize=topSize; + } + + public abstract double similarity(CandidateLinkkey c1, CandidateLinkkey c2); +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Dendrogramme.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Dendrogramme.java new file mode 100644 index 0000000000000000000000000000000000000000..d0926659d61719d5e0f7d78a046bec2ae9e7c23e --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/Dendrogramme.java @@ -0,0 +1,214 @@ +package fr.inrialpes.exmo.linkkey.clustering; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; +import fr.inrialpes.exmo.linkkey.utils.renderer.TxtLinkKeyRenderer; + +import javax.swing.*; +import java.awt.*; +import java.util.List; +import java.util.*; + +public class Dendrogramme extends JComponent { + + private static int WIDTH=500; + private static int START=50; + private final static int LEAF_HEIGHT = 20; + + private final TxtLinkKeyRenderer r; + private final 
Cluster root; + + private List<LeafCluster> leaves; + + private Map<Cluster,Integer> positions; + private Map<Cluster,Rectangle> rectangles; + + private String longestLbl; + private double cutSim; + private List<Rectangle> rectanglesToPaint; + + private JPanel headerPanel; + + public Dendrogramme(Cluster root, TxtLinkKeyRenderer r) { + this.r=r; + this.root=root; + leaves = new ArrayList<>(); + root.addToList(leaves); + int i=LEAF_HEIGHT; + longestLbl=""; + positions = new HashMap<>(); + rectangles = new HashMap<>(); + for (LeafCluster l : leaves) { + positions.put(l,i); + rectangles.put(l, new Rectangle(START+WIDTH, i-LEAF_HEIGHT/2,0,LEAF_HEIGHT)); + i+=LEAF_HEIGHT; + String lbl = r.toString(l.c,true); + if (lbl.length()>longestLbl.length()) { + longestLbl=lbl; + } + + } + yPositionOf(root); + cutSim=0; + rectanglesToPaint=Collections.emptyList(); + } + + private int yPositionOf(Cluster c) { + Integer res = positions.get(c); + if (res!=null) return res; + if (c instanceof BinaryCluster) { + BinaryCluster cb = (BinaryCluster) c; + int c1pos = yPositionOf(cb.c1); + int c2pos = yPositionOf(cb.c2); + int pos = Math.min(c1pos,c2pos) + Math.abs(c1pos-c2pos)/2; + positions.put(c,pos); + return pos; + } + System.err.println("--"+c.getClass()); + return positions.get(c); + } + + private Rectangle rectangleOf(Cluster c) { + Rectangle res = rectangles.get(c); + if (res!=null) return res; + if (c instanceof BinaryCluster) { + BinaryCluster cb = (BinaryCluster) c; + Rectangle r1 = rectangleOf(cb.c1); + Rectangle r2 = rectangleOf(cb.c2); + + int x = START + (int) (WIDTH*cb.sim); + int y = Math.min(r1.y,r2.y); + int height = r1.height+r2.height; + int width = START + WIDTH - x; + res = new Rectangle(x,y,width,height); + rectangles.put(cb,res); + return res; + } + return rectangles.get(c); + } + + public void setCut(double t) { + if (t<0 || t>1) throw new IllegalArgumentException("cut threshold has to be between 0 and 1, currently : "+t); + if (cutSim!=t) { + cutSim = t; + rectanglesToPaint = rectanglesToPaint(cutSim); + repaint(); + } + } + + public JComponent getColumnHeader() { + if (headerPanel==null) { + headerPanel = new JPanel(); + headerPanel.setLayout(new BoxLayout(headerPanel, BoxLayout.LINE_AXIS)); + + JSlider slider = new JSlider(0, 100, 0); + // paint the ticks and tracks + Dictionary<Integer, JLabel> lbl = new Hashtable<>(); + for (int i = 0; i < 100; i += 10) { + lbl.put(i, new JLabel(String.valueOf((double) i / 100))); + } + slider.setLabelTable(lbl); + slider.setMajorTickSpacing(10); + slider.setMinorTickSpacing(5); + slider.setPaintTrack(true); + slider.setPaintLabels(true); + slider.setPaintTicks(true); + + slider.setPreferredSize(new Dimension(WIDTH, slider.getPreferredSize().height)); + slider.setMaximumSize(new Dimension(WIDTH, slider.getPreferredSize().height)); + + SpinnerNumberModel spinnerModel = new SpinnerNumberModel(0, 0, 1,0.05); + JSpinner spinner = new JSpinner(spinnerModel); + //spinner.setMaximumSize(); + + slider.addChangeListener(evt -> { + double newCutSim = (double) slider.getValue() / 100; + if (newCutSim != cutSim) { + spinner.setValue(newCutSim); + setCut(newCutSim); + } + }); + + spinner.addChangeListener( evt -> { + double newCutSim = (double) spinner.getValue(); + if (newCutSim != cutSim) { + slider.setValue((int) (newCutSim*100)); + setCut(newCutSim); + } + }); + + headerPanel.add(Box.createRigidArea(new Dimension(START, 0))); + headerPanel.add(slider); + headerPanel.add(spinner); + headerPanel.add(Box.createHorizontalGlue()); + } + return headerPanel; + } + + 
@Override + public Dimension getPreferredSize() { + int txtLength = this.getGraphics().getFontMetrics().stringWidth(longestLbl); + return new Dimension(START + WIDTH +10 + txtLength ,(LEAF_HEIGHT)* leaves.size() + LEAF_HEIGHT); + } + + + + private List<Rectangle> rectanglesToPaint(double sim) { + Set<Cluster> clust = root.cutAt(sim); + List<Rectangle> res = new ArrayList<>(); + for (Cluster c : clust) { + if (c instanceof BinaryCluster && c!=root) + res.add(rectangleOf(c)); + } + Collections.sort(res, (c1,c2) -> c1.y-c2.y); + return res; + } + + @Override + protected void paintComponent(Graphics g) { + //super.paintComponent(g); + Graphics2D g2d = (Graphics2D) g; + g2d.setRenderingHint(RenderingHints.KEY_TEXT_ANTIALIASING,RenderingHints.VALUE_TEXT_ANTIALIAS_ON); + + //Set<Cluster> clusters = root.cutAt(cutSim); + HashSet<CandidateLinkkey> medoid = new HashSet<>(); + //clusters.forEach( c -> medoid.add(c.getMedoid())); + + // draw leaves text + int fontHeight = g.getFontMetrics().getHeight(); + int i=LEAF_HEIGHT; + for (LeafCluster c : leaves) { + if (medoid.contains(c.c)) { + g2d.setColor(Color.RED); + } + else { + g2d.setColor(Color.BLACK); + } + g.drawString(c.c.getLinksSize()+" " +r.toString(c.c,true), WIDTH + START + 10, i+fontHeight/2); + i+=LEAF_HEIGHT; + } + g2d.setColor(Color.BLACK); + + + for (Rectangle r : rectanglesToPaint) { + g2d.setColor(g2d.getColor()==Color.CYAN?Color.GREEN:Color.CYAN); + g2d.fill(r); + } + g2d.setColor(Color.RED); + g2d.fillRect( START + (int) (WIDTH*cutSim),0,2,this.getHeight()); + g2d.setColor(Color.BLACK); + + // draw dendrogramme + for (Cluster c : positions.keySet()) { + if (c instanceof BinaryCluster) { + BinaryCluster cb = (BinaryCluster) c; + int x = START + (int) (WIDTH*c.sim); + // vertical line + g.drawLine(x,positions.get(cb.c1),x,positions.get(cb.c2)); + // horizontal to c1 + g.drawLine(x,positions.get(cb.c1),START + (int) (WIDTH*cb.c1.sim),positions.get(cb.c1)); + // horizontal to c2 + g.drawLine(x,positions.get(cb.c2),START + (int) (WIDTH*cb.c2.sim),positions.get(cb.c2)); + } + } + } +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/JaccardSimilarity.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/JaccardSimilarity.java new file mode 100644 index 0000000000000000000000000000000000000000..b1b0959d57f926ab7a844074254c927191bd5051 --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/JaccardSimilarity.java @@ -0,0 +1,25 @@ +package fr.inrialpes.exmo.linkkey.clustering; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; + +public class JaccardSimilarity extends ConceptSimilarity { + + + public JaccardSimilarity(CandidateLinkkey top, int topSize) { + super(top, topSize); + } + + @Override + public double similarity(CandidateLinkkey c1, CandidateLinkkey c2) { + CandidateLinkkey inf =c1.getInf(c2); + int inter = inf.getLinksSize(); + if (inter==0) return 0; + + int c1Size = c1==top?topSize:c1.getLinksSize(); + int c2Size = c2==top?topSize:c2.getLinksSize(); + int union = c1Size + c2Size - inter; + + return (double) inter / union; + + } +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LKHClust.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LKHClust.java new file mode 100644 index 0000000000000000000000000000000000000000..c3e9ff0d22c06b2239a48e720ddfdbdbb4cb026e --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LKHClust.java @@ -0,0 +1,304 @@ +package fr.inrialpes.exmo.linkkey.clustering; + + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; + +import 
java.util.*; +import java.util.concurrent.atomic.AtomicInteger; + +public class LKHClust { + + private static class Similarity implements Comparable<Similarity> { + static AtomicInteger seq=new AtomicInteger(0); + + boolean toRem=false; + + final int id; + final double sim; + Cluster c1; + Cluster c2; + + Similarity(double s, Cluster c1, Cluster c2) { + id=seq.getAndIncrement(); + sim=s; + this.c1=c1; + this.c2=c2; + } + + public boolean equals(Object o) { + if (o instanceof Similarity) { + Similarity s = (Similarity) o; + return (c1==s.c1 && c2==s.c2) || (c1==s.c2 && c2==s.c1); + } + return false; + } + + @Override + public int hashCode() { + return c1.hashCode()+c2.hashCode(); + } + + @Override + public int compareTo(Similarity o) { + if (sim<o.sim) { + return 1; + } + else if (sim>o.sim) { + return -1; + } + else { + return id-o.id; + } + } + } + + private Set<Cluster> clusters; + private PriorityQueue<Similarity> queue; + + private final Map<CandidateLinkkey,Integer> lkIdx; + private final Map<CandidateLinkkey,Cluster> clustersIdx; + + private final int nbLk; + + private final double[][] sims; + + private int nbSims; + private int nbSims0; + + private ConceptSimilarity sim; + private CandidateLinkkey top; + + private boolean forceConvexity; + + + public LKHClust(Collection<CandidateLinkkey> candidates, ConceptSimilarity sim, boolean onlyNeighbours, boolean forceConvexity) { + this.forceConvexity=forceConvexity; + lkIdx = new HashMap<>(); + this.sim=sim; + top = candidates.iterator().next().getTop(); + clustersIdx = new HashMap<>(); + + int i=0; + for (CandidateLinkkey c : candidates) { + //if (c.getParents().size()>0 && c.getChildren().size()>0) { + lkIdx.put(c,i); + clustersIdx.put(c, new LeafCluster(this,c)); + i+=1; + //} + } + // initialize the similarity array... 
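+        // sims is a lower-triangular matrix indexed as sims[max][min] over the candidate
+        // indices stored in lkIdx; entries start at NEGATIVE_INFINITY and are filled
+        // lazily by getSim(CandidateLinkkey, CandidateLinkkey).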
+ nbLk=i; + sims = new double[nbLk][]; + for (i=0 ; i<nbLk ; i++) { + sims[i] = new double[i]; + Arrays.fill(sims[i],Double.NEGATIVE_INFINITY); + } + nbSims=0; + + queue = new PriorityQueue<>(candidates.size()*2); + //queue = new ArrayList<>(); + if (onlyNeighbours) { + initNeighboursOnly(); + } + else { + initFull(); + } + computeClustering(); + } + + public Cluster getRoot() { + if (clusters.size()>1) throw new RuntimeException("several roots..."); + return clusters.iterator().next(); + } + + public int getNbSimComputed() { + return nbSims; + } + + public int getNbSim0Computed() { + return nbSims0; + } + + /** + * + * @param c + * @param alsoChildren + * @return + */ + private Set<Cluster> neighboursOf(Cluster c, boolean alsoChildren) { + HashSet<Cluster> res = new HashSet<>(); + for (CandidateLinkkey lk : c.getLK()) { + Collection<CandidateLinkkey> parents = lk.getParents(); + //HashSet<CandidateLinkkey> nextParents = new HashSet<>(); + // while (!parents.isEmpty()) { + for (CandidateLinkkey v : parents) { + Cluster r = clustersIdx.get(v); + if (r!=null) res.add(r); + } + // parents = nextParents; + // nextParents = new HashSet<>(); + //} + if (alsoChildren) { + Collection<CandidateLinkkey> children = lk.getChildren(); + // HashSet<CandidateLinkkey> nextChildren = new HashSet<>(); + // while (!children.isEmpty()) { + for (CandidateLinkkey v : children) { + Cluster r = clustersIdx.get(v); + if (r!=null) res.add(r); + } + // children = nextChildren; + // nextChildren = new HashSet<>(); + // } + } + } + res.remove(c); + return res; + } + + + private void initNeighboursOnly() { + // compute only neighbors similarities + for (Cluster c1 : clustersIdx.values()) { + for (Cluster c2 : neighboursOf(c1,false)) { + double s = getSim(c1.getMedoid(),c2.getMedoid()); + if (s>0) queue.add(new Similarity(s,c1,c2)); + } + } + } + private void initFull() { + // compute all similarities + List<Cluster> l = new ArrayList<>(clustersIdx.values()); + ListIterator<Cluster> it = l.listIterator(); + while (it.hasNext()) { + Cluster c1 = it.next(); + ListIterator<Cluster> it2 = l.listIterator(it.nextIndex()); + while (it2.hasNext()) { + Cluster c2 = it2.next(); + double s = getSim(c1.getMedoid(),c2.getMedoid()); + if (s>0) queue.add(new Similarity(s,c1,c2)); + } + } + // Collections.sort(queue); + } + + private void computeClustering() { + /* for (int i=0 ; i< nbLk-1 ; i++) { + + // takes the highest similarity in the queue + Similarity s = queue.poll(); + // remove similarities with already merged clusters + while (s!=null && (s.toRem|| (s.c1.isMerged || s.c2.isMerged))) + s = queue.poll(); + if (s==null) break;*/ + + while (queue.size()>1) { + // takes the highest similarity in the queue + Similarity s = queue.poll();//queue.remove(0);// + if (s.sim==0) break; + if (s.c1.isMerged || s.c2.isMerged) throw new RuntimeException("Duplicates in the similarities"); + BinaryCluster c = new BinaryCluster(this,s.sim, s.c1, s.c2); + if (forceConvexity && !c.isConvexSublattice()) { + // System.out.println("Not Convex, sim="+s.sim); + continue; + } + //System.out.println(s.sim); + s.c1.isMerged = true; + s.c2.isMerged = true; + + // replace the cluster associated with link keys in the index + for (CandidateLinkkey lk : c.getLK()) clustersIdx.put(lk,c); + + + + // check the case only one sim + Iterator<Similarity> it = queue.iterator(); + IdentityHashMap<Cluster,Similarity> toAdd = new IdentityHashMap<>(); + while (it.hasNext()) { + Similarity x = it.next(); + if (x.c1 == s.c1 || x.c1 == s.c2 ) { + it.remove(); + if 
(!toAdd.containsKey(x.c2)) toAdd.put(x.c2,new Similarity(c.getSim(x.c2),c,x.c2)); + + } + else if (x.c2 == s.c1 || x.c2 == s.c2) { + it.remove(); + if (!toAdd.containsKey(x.c1)) toAdd.put(x.c1,new Similarity(c.getSim(x.c1),c,x.c1)); + } + /* if (x.toRem) { + it.remove(); + } + else if (x.c1 == s.c1 || x.c1==s.c2) { + if (toRemove.containsKey(x.c2)) { + it.remove(); + toRemove.remove(x.c2); + } + else { + toRemove.put(x.c2,x); + x.c2=x.c1==s.c1?s.c2:s.c1; + } + } + else if (x.c2 == s.c1 || x.c2 == s.c2) { + if (toRemove.containsKey(x.c1)) { + it.remove(); + toRemove.remove(x.c1); + } + else { + toRemove.put(x.c1,x); + x.c1=x.c2==s.c1?s.c2:s.c1; + } + }*/ + } + queue.addAll(toAdd.values()); + // Collections.sort(queue); + + // this piece of code is needed if the similarity is computed only between neighbours. + /* boolean toSort=false; + for (Map.Entry<Cluster, Similarity> e : toRemove.entrySet()) { + if (e.getValue().sim>0) { + double sim = c.getSim(e.getKey()); + if (sim < e.getValue().sim) { + e.getValue().toRem = true; + if (sim > 0) { + toSort = true; + queue.add(new Similarity(sim, e.getKey(), c)); + } + } + } + } + if (toSort) Collections.sort(queue);*/ + + } + + queue=null; + + clusters = new HashSet<>(clustersIdx.values()); + // merge clusters with similarity of 0 + while (clusters.size()>1) { + Iterator<Cluster> it = clusters.iterator(); + Cluster c1 = it.next(); + it.remove(); + Cluster c2 = it.next(); + it.remove(); + clusters.add(new BinaryCluster(this,0, c1, c2)); + } + // System.out.println("NB sims computed = "+nbSims); + } + + + public double getSim(CandidateLinkkey c1, CandidateLinkkey c2) { + if (c1==c2) return 1; + + int idxC1 = lkIdx.get(c1); + int idxC2 = lkIdx.get(c2); + int min = idxC1<idxC2?idxC1:idxC2; + int max = min==idxC1?idxC2:idxC1; + + if (sims[max][min]==Double.NEGATIVE_INFINITY) { + + sims[max][min] = sim.similarity(c1,c2); + if (sims[max][min]==0) nbSims0++; + nbSims++; + } + return sims[max][min]; + } +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LeafCluster.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LeafCluster.java new file mode 100644 index 0000000000000000000000000000000000000000..2d00f55f3112817732d7fd1f8203a847b51f9338 --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/LeafCluster.java @@ -0,0 +1,65 @@ +package fr.inrialpes.exmo.linkkey.clustering; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; + +import java.util.Collections; +import java.util.List; +import java.util.Set; + +public class LeafCluster extends Cluster { + + private static double[] ZERO = new double[]{0}; + public final CandidateLinkkey c; + + public LeafCluster(LKHClust clust, CandidateLinkkey c) { + super(clust,1,1); + this.c=c; + } + + @Override + public CandidateLinkkey getSup() { + return c; + } + + @Override + public CandidateLinkkey getInf() { + return c; + } + + @Override + protected void getAllCluster(List<Cluster> l) { + l.add(this); + } + + @Override + public List<CandidateLinkkey> getLK() { + return Collections.singletonList(c); + } + + @Override + public void addToList(List<LeafCluster> l) { + l.add(this); + } + + @Override + public double getSim(Cluster c) { + if (c instanceof LeafCluster) { + return clust.getSim(this.c,((LeafCluster) c).c); + } + return c.getSim(this); + } + + @Override + protected double[] sumSims() { + return ZERO; + } + + @Override + protected void cutAt(double sim, Set<Cluster> selected) { + selected.add(this); + } + + public String toString() { + return c.toString(); + } +} diff --git 
a/src/main/java/fr/inrialpes/exmo/linkkey/clustering/NAryCluster.java b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/NAryCluster.java
new file mode 100644
index 0000000000000000000000000000000000000000..4692e0a284c469dcb654f45cedd50139a3a8e63d
--- /dev/null
+++ b/src/main/java/fr/inrialpes/exmo/linkkey/clustering/NAryCluster.java
@@ -0,0 +1,82 @@
+package fr.inrialpes.exmo.linkkey.clustering;
+
+import fr.inrialpes.exmo.linkkey.CandidateLinkkey;
+
+import java.util.*;
+
+public abstract class NAryCluster extends Cluster {
+
+    private List<Cluster> clusters;
+    private List<CandidateLinkkey> lks;
+    private Map<Cluster,Double> sims;
+
+    public NAryCluster(LKHClust clust, double sim, Collection<Cluster> clusters) {
+        super(clust,sim,clusters.stream().reduce(0, (sum,c) -> sum+c.nb, Integer::sum));
+        this.clusters = new ArrayList<>(clusters);
+        lks = new ArrayList<>();
+        this.clusters.forEach( x -> lks.addAll(x.getLK()));
+        sims = new HashMap<>();
+    }
+
+    @Override
+    protected void getAllCluster(List<Cluster> l) {
+        l.add(this);
+        clusters.forEach( c -> c.getAllCluster(l));
+    }
+
+    @Override
+    public List<CandidateLinkkey> getLK() {
+        return Collections.unmodifiableList(lks);
+    }
+
+    @Override
+    public void addToList(List<LeafCluster> l) {
+        clusters.forEach( x -> x.addToList(l));
+    }
+
+    @Override
+    public double getSim(Cluster c) {
+        Double res = sims.get(c);
+        if (res ==null) {
+            // running minimum over the sub-clusters (complete linkage)
+            double min = Double.POSITIVE_INFINITY;
+            for (Cluster cx : clusters) {
+                min = Math.min(min,cx.getSim(c));
+            }
+            sims.put(c,min);
+            res=min;
+        }
+        return res;
+    }
+
+
+    @Override
+    protected double[] sumSims() {
+        System.out.println("Start compute sim");
+        double[] res = new double[lks.size()];
+        int i=0;
+        ListIterator<CandidateLinkkey> it1 = lks.listIterator();
+        while (it1.hasNext()) {
+            System.out.println("Iteration "+ it1.nextIndex());
+            CandidateLinkkey lk1 = it1.next();
+            ListIterator<CandidateLinkkey> it2 = lks.listIterator(it1.nextIndex());
+            while (it2.hasNext()) {
+                CandidateLinkkey lk2 = it2.next();
+                double s = clust.getSim(lk1, lk2);
+                res[it1.previousIndex()] += s;
+                res[it2.previousIndex()] += s;
+            }
+        }
+        System.out.println("End compute sim");
+        return res;
+    }
+
+    @Override
+    protected void cutAt(double sim, Set<Cluster> selected) {
+        if (this.sim>=sim) {
+            selected.add(this);
+        }
+        else {
+            clusters.forEach(x -> x.cutAt(sim, selected));
+        }
+    }
+}
diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ClusteringInterface.java b/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ClusteringInterface.java
new file mode 100644
index 0000000000000000000000000000000000000000..1d0449f1da39f35f14c846a4bb1741b4633506f5
--- /dev/null
+++ b/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ClusteringInterface.java
@@ -0,0 +1,56 @@
+package fr.inrialpes.exmo.linkkey.expeclust;
+
+import fr.inrialpes.exmo.linkkey.CandidateLinkkey;
+import fr.inrialpes.exmo.linkkey.ExtractionResult;
+import fr.inrialpes.exmo.linkkey.clustering.ClosureSimilarity;
+import fr.inrialpes.exmo.linkkey.clustering.ConceptSimilarity;
+import fr.inrialpes.exmo.linkkey.clustering.Dendrogramme;
+import fr.inrialpes.exmo.linkkey.clustering.LKHClust;
+import fr.inrialpes.exmo.linkkey.eval.EvalMeasures;
+import fr.inrialpes.exmo.linkkey.eval.SupervisedEvalMeasures;
+import fr.inrialpes.exmo.linkkey.utils.renderer.TxtLinkKeyRenderer;
+
+import javax.swing.*;
+import java.awt.*;
+import java.util.Collection;
+import java.util.stream.Collectors;
+
+public class ClusteringInterface {
+
+    public static void clustering(ExtractionResult res) {
+
+        EvalMeasures
evalMeasures = new EvalMeasures(res.getClass1Size(), res.getClass2Size(), res.getClasses2InstancesDS1(), res.getClasses2InstancesDS2()); + SupervisedEvalMeasures supEval = new SupervisedEvalMeasures(res.getReferenceLinks(),res.getClasses2InstancesDS1(),res.getClasses2InstancesDS2()); + //Collection<CandidateLinkkey> lkSet = res.getCandidates().stream().filter( c -> {return evalMeasures.discriminability(c)>0.8 && c.getLinksSize()>1;}).collect(Collectors.toList()); + + final int minLinks=1; + Collection<CandidateLinkkey> lkSet = res.getCandidates().stream()/*.filter( c -> c.getLinksSize()>minLinks)*//*.filter( m -> evalMeasures.discriminability(m)>0.9 )*/.collect(Collectors.toList()); + System.out.println("#Candidates : "+res.getCandidates().size()+", #Selected (#links>"+minLinks+") : "+lkSet.size()); + long time = System.currentTimeMillis(); + ConceptSimilarity sim = new ClosureSimilarity(res.getTop(),res.getInstances1().size()*res.getInstances2().size()); + LKHClust clust = new LKHClust(lkSet, sim,false,false); + time = System.currentTimeMillis()-time; + System.out.println("Clustering time="+(double) time/1000); + SwingUtilities.invokeLater( () -> { + JFrame f = new JFrame(); + f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + Dendrogramme dend = new Dendrogramme(clust.getRoot(),new TxtLinkKeyRenderer(res)); + JScrollPane jsp = new JScrollPane(dend); + jsp.setColumnHeaderView(dend.getColumnHeader()); + f.getContentPane().add(jsp); + f.setSize(new Dimension(1024,768)); + f.setVisible(true); + //f.pack(); + }); +/* + ExpeClust.topkPR(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + //ExpeClust.topkDC(clust.getRoot(), new SupervisedEvalMeasures(res.getReferenceLinks(),res.getClasses2InstancesDS1(),res.getClasses2InstancesDS2()),evalMeasures,new LinkKeyRenderer(res),System.out);xpeClust.boxPlotSingletonClusters(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + ExpeClust.boxplotLkPerClass(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + ExpeClust.boxplotQualityOfSingletonClusters(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + ExpeClust.distribSimClusters(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + ExpeClust.bestDC(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),System.out); + ExpeClust.topkDisjunction(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),0.9,System.out); + //ExpeClust.onlyBigCLusterDisjunction(clust.getRoot(), new SupervisedEvalMeasures(res.getReferenceLinks(),res.getClasses2InstancesDS1(),res.getClasses2InstancesDS2()),evalMeasures,new LinkKeyRenderer(res),System.out); +*/ + } +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ExpeClust.java b/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ExpeClust.java new file mode 100644 index 0000000000000000000000000000000000000000..6330f6e70f7890975cb1b10e7665b011104b5b76 --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/expeclust/ExpeClust.java @@ -0,0 +1,467 @@ +package fr.inrialpes.exmo.linkkey.expeclust; + +import fr.inrialpes.exmo.linkkey.AntichainDisj; +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; +import fr.inrialpes.exmo.linkkey.ExtractionResult; +import fr.inrialpes.exmo.linkkey.clustering.*; +import fr.inrialpes.exmo.linkkey.eval.EvalMeasures; +import fr.inrialpes.exmo.linkkey.eval.SupervisedEvalMeasures; +import fr.inrialpes.exmo.linkkey.utils.renderer.LinkKeyRenderer; +import 
fr.inrialpes.exmo.linkkey.utils.renderer.TxtLinkKeyRenderer; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.text.NumberFormat; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class ExpeClust { + + public static void clusteringExpe(ExtractionResult res, Path outputDir) { + //res.getInstances1().size()*res.getInstances2().size() + + ConceptSimilarity jaccard = new JaccardSimilarity(res.getTop(),res.getInstances1().size()*res.getInstances2().size()); + ConceptSimilarity closure = new ClosureSimilarity(res.getTop(),res.getInstances1().size()*res.getInstances2().size()); + try { + System.gc(); + Path p = outputDir.getName(0).resolve("dJaccard-full").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, false, false, jaccard, p); + + System.gc(); + p=outputDir.getName(0).resolve("dJaccard-full-forceConvexity").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, false, true,jaccard, p); + + System.gc(); + p=outputDir.getName(0).resolve("dClosure-full").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, false, false, closure, p); + + System.gc(); + p=outputDir.getName(0).resolve("dClosure-full-forceConvexity").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, false, true, closure, p); + + System.gc(); + p= outputDir.getName(0).resolve("dClosure-neighbours").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, true, false, closure, p); + + System.gc(); + p= outputDir.getName(0).resolve("dClosure-neighbours-forceConvexity").resolve(outputDir.getName(1)); + if (Files.notExists(p)) Files.createDirectories(p); + clusteringExpe(res, true, true, closure, p); + } + catch (IOException e ) {e.printStackTrace();} + } + + + + public static void clusteringExpe(ExtractionResult res, boolean onlyNeighbours, boolean forceConvexity, ConceptSimilarity s, Path outputDir) { + + EvalMeasures evalMeasures = new EvalMeasures(res.getClass1Size(), res.getClass2Size(), res.getClasses2InstancesDS1(), res.getClasses2InstancesDS2()); + SupervisedEvalMeasures supEval = new SupervisedEvalMeasures(res.getReferenceLinks(),res.getClasses2InstancesDS1(),res.getClasses2InstancesDS2()); + Collection<CandidateLinkkey> lkSet = res.getCandidates(); + System.err.println("# candidates: "+lkSet.size()); + + lkSet.remove(res.getTop()); + lkSet.remove(res.getBottom()); + + long t = System.currentTimeMillis(); + LKHClust clust = new LKHClust(lkSet,s,onlyNeighbours,forceConvexity); + t = System.currentTimeMillis()-t; + + try { + //Files.createDirectory(outputDir); + Path path=null; + PrintStream out =null; + // statistics + path = outputDir.resolve("stats.txt"); + out = new PrintStream(Files.newOutputStream(path)); + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + out.println("time: "+f.format((double) t/1000)); + out.println("nbSims: "+f.format(clust.getNbSimComputed())); + out.println("nbPositiveSims: "+f.format(clust.getNbSimComputed()-clust.getNbSim0Computed() )); + out.println("nbCandidates: "+f.format(lkSet.size())); + //out.println("firstNonConvex: "+f.format()); + out.close(); + + // Expe 1: 
print preservation and compression in function of the cut level + path = outputDir.resolve("comp-preserv.dat"); + out = new PrintStream(Files.newOutputStream(path)); + statsCutLevels(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),out); + out.close(); + + // Expe 2: print disjunctions Precision and Recall in function of the cut level + path = outputDir.resolve("all-disjunctions.dat"); + out = new PrintStream(Files.newOutputStream(path)); + topkDisjunction(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),0,out); + out.close(); + + // Expe 2bis: print disjunctions Precision and Recall in function of the cut level - only medoid with disc. > .9 + path = outputDir.resolve("90disc-disjunctions.dat"); + out = new PrintStream(Files.newOutputStream(path)); + topkDisjunction(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),0.9,out); + out.close(); + + // Expe 3: number of link keys per class (box plot) boxplotLkPerClass + path = outputDir.resolve("nb-linkkey-per-class.tex"); + out = new PrintStream(Files.newOutputStream(path)); + boxplotLkPerClass(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),out); + out.close(); + + // Expe 4: quality of singleton clusters + path = outputDir.resolve("quality-singleton-clusters.tex"); + out = new PrintStream(Files.newOutputStream(path)); + boxplotQualityOfSingletonClusters(clust.getRoot(), supEval,evalMeasures,new LinkKeyRenderer(res),out); + out.close(); + + } + catch (IOException e ) { + e.printStackTrace(); + } + + } + + public static void statsCutLevels(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, PrintStream out) { + + //out.println("Link key candidate preservation using Precision / RecallClass"); + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + + + Collection<CandidateLinkkey> allLks = root.getLK(); + LongSet links = new LongOpenHashSet(); + allLks.forEach( lk -> links.addAll(lk.getLinks())); + int nbLinks = links.size(); + links.clear(); + + out.println("cutlevel\tnbMedoids\tcompression\tpreservation\tnotCompRatio\tconvexRatio"); + + for (int i=0; i<10 ; i++) { + + double cut = 1.0-(double) i/10;//(double) i/10; + + List<CandidateLinkkey> medoids = new ArrayList<>(); + Set<Cluster> clusters= root.cutAt(cut); + clusters.forEach( clust -> medoids.add(clust.getMedoid())); + + double compression = 1.0 - (double) medoids.size()/allLks.size(); + + medoids.forEach( lk -> links.addAll(lk.getLinks())); + double linkPreserv = (double) links.size()/nbLinks; + links.clear(); + + int nbMedoids=medoids.size(); + // compute not comparable ratio + HashSet<CandidateLinkkey> notComparable = new HashSet<>(); + ListIterator<CandidateLinkkey> it1 = medoids.listIterator(); + l1: while (it1.hasNext()) { + CandidateLinkkey c1 = it1.next(); + ListIterator<CandidateLinkkey> it2 = medoids.listIterator(); + while (it2.hasNext()) { + CandidateLinkkey c2 = it2.next(); + if (c1!=c2 && (c1.getDescendants().contains(c2) || c2.getDescendants().contains(c1))) { + continue l1; + } + } + it1.remove(); + notComparable.add(c1); + } + double notCompRatio = (double) notComparable.size()/nbMedoids; + + // compute the ratio of clusters that are convex sublattices + int convex=0; + for (Cluster x : clusters) if (x.isConvexSublattice()) convex+=1; + + + out.print(f.format(1-cut)+"\t"+f.format(nbMedoids)+"\t"+ + f.format(compression)+"\t"+f.format(linkPreserv)+"\t" + +f.format(notCompRatio)+"\t"+f.format((double) convex/clusters.size())); + + + + out.println(); + } 
+ + out.flush(); + } + + public static void distribSimClusters(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, PrintStream out) { + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + List<Cluster> clusts = root.getAllClusters(); + List<Double> sims = new ArrayList<>(clusts.size()); + clusts.forEach( c -> sims.add(c.sim)); + Collections.sort(sims); + ListIterator<Double> it = sims.listIterator(); + out.println("sim nb"); + for (int i=1; i<11 ; i++) { + int count=0; + double t = (double)i/10; + while (it.hasNext() && it.next()<t) count++; + if (it.hasPrevious()) it.previous(); + out.println(f.format(1-t)+" "+count); + } + + } + + + public static void boxplotLkPerClass(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, PrintStream out) { + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + double[] steps = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; + + String lbl = Arrays.toString(steps); + lbl = lbl.substring(1,lbl.length()-1); + lbl=lbl.replaceAll("0",""); + + String ticks = Arrays.toString(IntStream.range(1, steps.length+1).toArray()); + ticks = ticks.substring(1,ticks.length()-1); + + out.println("\\begin{tikzpicture}\n" + + " \\begin{axis}\n" + + " [\n" + + " boxplot/draw direction = y,\n" + + " xtick={"+ticks+"},\n" + + " xticklabels={"+lbl+"} \n" + + " ]"); + + for (double cut : steps) { + List<Integer> nbLK = new ArrayList<>(); + root.cutAt(1-cut).forEach( c -> { + nbLK.add(c.getLK().size()); + } ); + + Collections.sort(nbLK); + String[] quartiles = new String[5]; + int s = nbLK.size(); + for (int i=0 ; i<5 ; i++) { + quartiles[i] = f.format(nbLK.get(Math.min(i*s/4,s-1))); + } + + StringBuilder data = new StringBuilder(); + nbLK.forEach( nb -> data.append("\\\\ ").append(nb)); + + out.println("\\addplot+[\n" + + " boxplot prepared={\n" + + " median="+quartiles[2]+",\n" + + " upper quartile="+quartiles[3]+",\n" + + " lower quartile="+quartiles[1]+",\n" + + " upper whisker="+quartiles[4]+",\n" + + " lower whisker="+quartiles[0]+"\n" + + " }] " + + " coordinates {};\n" + //" }] table [row sep=\\\\,y index=0] {\n" + + //" data "+data+"\\\\\n"+ //'\\\\ 1\\\\ 3\\\\\n" + + //"};" + ); + + } + + + out.println(" \\end{axis}\n" + + "\\end{tikzpicture}"); + + } + + public static void boxplotQualityOfSingletonClusters(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, PrintStream out) { + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + double[] steps = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; + + String lbl = Arrays.toString(steps); + lbl = lbl.substring(1,lbl.length()-1); + lbl=lbl.replaceAll("0",""); + + String ticks = Arrays.toString(IntStream.range(1, steps.length+1).toArray()); + ticks = ticks.substring(1,ticks.length()-1); + + out.println("\\begin{tikzpicture}\n" + + " \\begin{axis}\n" + + " [\n" + + " boxplot/draw direction = y,\n" + + " xtick={"+ticks+"},\n" + + " xticklabels={"+lbl+"} \n" + + " ]"); + + for (double cut : steps) { + List<Double> fmeasures = new ArrayList<>(); + root.cutAt(1-cut).stream().filter( c-> c.getLK().size()==1).forEach( c -> { + CandidateLinkkey x = c.getMedoid(); + fmeasures.add(EvalMeasures.hmean(supEval.getPrecision(x),supEval.getRecallClass(x)) ); + } ); + + Collections.sort(fmeasures); + String[] quartiles = new String[5]; + int s = fmeasures.size(); + for (int i=0 ; i<5 ; i++) { + quartiles[i] = f.format(fmeasures.get(Math.min(i*s/4,s-1))); + } + 
out.println("\\addplot+[\n" + + " boxplot prepared={\n" + + " median="+quartiles[2]+",\n" + + " upper quartile="+quartiles[3]+",\n" + + " lower quartile="+quartiles[1]+",\n" + + " upper whisker="+quartiles[4]+",\n" + + " lower whisker="+quartiles[0]+"\n" + + " }] "+ + " coordinates {};\n" + ); + + } + + + out.println(" \\end{axis}\n" + + "\\end{tikzpicture}"); + } + + + public static void topkDC(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, PrintStream out) { + + out.println("Link key candidate preservation using Discriminability / CoverageClass"); + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + + /*int nbFirst = 10; + Set<CandidateLinkkey> lks = new HashSet<>(root.getLK()); + List<Set<CandidateLinkkey>> selections = new ArrayList<>(nbFirst); + for (int i=0; i<nbFirst ; i++) { + double maxf = 0; + Set<CandidateLinkkey> selection = new HashSet<>(); + for (CandidateLinkkey c : lks) { + double v = EvalMeasures.hmean(eval.discriminability(c), eval.coverageClassExp(c)); + if (v > maxf) { + selection.clear(); + maxf = v; + selection.add(c); + } else if (maxf == v) selection.add(c); + } + out.println("FMeasure Rank "+i+" ("+selection.size()+" selected)"); + for (CandidateLinkkey c : selection) { + out.println("D="+f.format(eval.discriminability(c)) + "\tC=" + f.format(eval.coverageClassExp(c))+"\t#Links="+c.getLinksSize()+"\t"+r.toString(c,true)); + lks.remove(c); + lks.removeAll(c.getDescendants()); + lks.removeAll(c.getAncestors()); + } + selections.add(selection); + + }*/ + + Collection<CandidateLinkkey> allLks = root.getLK(); + LongSet links = new LongOpenHashSet(); + allLks.forEach( lk -> links.addAll(lk.getLinks())); + int nbLinks = links.size(); + links.clear(); + + out.println("cut_level\tcompression\tpreservation"); + for (int i=0; i<11 ; i++) { + + double cut = 1.0-(double) i/10;// (double) i/10; + + Set<CandidateLinkkey> medoids = new HashSet<>(); + root.cutAt(cut).forEach( clust -> medoids.add(clust.getMedoid())); + + double compression = 1.0 - (double) medoids.size()/allLks.size(); + + medoids.forEach( lk -> links.addAll(lk.getLinks())); + double linkPreserv = (double) links.size()/nbLinks; + links.clear(); + + out.print(f.format(1-cut)+"\t"+f.format(compression)+"\t"+f.format(linkPreserv)); + + /*for (int j=0 ; j<nbFirst ; j++) { + HashSet<CandidateLinkkey> s = new HashSet<>(selections.get(j)); + s.retainAll(medoids); + double bestPreserve = (double) s.size()/selections.get(j).size(); + out.print("\t"+f.format(bestPreserve)); + }*/ + + out.println(); + + + /*HashSet<CandidateLinkkey> topk = new HashSet<>(); + Iterator<CandidateLinkkey> it = selection.descendingIterator(); + for (int j=0 ; j<100 && it.hasNext() ; j++) { + CandidateLinkkey cand = it.next(); + // System.out.println(cand+" : "+eval.hmeanDiscCov(cand)); + topk.add(cand); + } + + AntichainDisj r = new AntichainDisj(topk); + + + double prec = supEval.getPrecision(r); + double rec = supEval.getRecall(r); + + + out.println(f.format(cut)+"\t"+f.format(prec)+"\t"+f.format(rec));*/ + + } + out.flush(); + } + + public static void bestDC(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, TxtLinkKeyRenderer r, PrintStream out) { + + out.println("Best Link key using disc/coverage"); + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + + //CandidateLinkkey bestDC = root.getLK().stream().filter( m -> eval.discriminability(m)>0.9).sorted( (x,y) -> (int) (eval.hmeanDiscCovClass(y)*1000 - 
eval.hmeanDiscCovClass(x)*1000)).findFirst().get(); + + ArrayList<CandidateLinkkey> l = new ArrayList<>(root.getLK()); + + Collections.sort(l, (c1,c2) -> { + double res = eval.hmeanDiscCov(c1)-eval.hmeanDiscCov(c2); + if (res==0) return 0; + else if (res<0) return -1; + else return 1; + }); + CandidateLinkkey bestDC = l.get(l.size()-1); + double prec = supEval.getPrecision(bestDC); + double rec = supEval.getRecall(bestDC); + double fmeasure = EvalMeasures.hmean(prec,rec); + out.println("Best DC link key : "+f.format(fmeasure)+"\t"+f.format(prec)+"\t"+f.format(rec)+"\t"+r.toString(bestDC,true)); + } + + + public static void topkDisjunction(Cluster root, SupervisedEvalMeasures supEval, EvalMeasures eval, LinkKeyRenderer r, double discThreshold, PrintStream out) { + int nbSelect = Integer.MAX_VALUE; + //out.println("Link key selection using topk medoids (k="+nbSelect+")"); + NumberFormat f = NumberFormat.getInstance(Locale.US); + f.setMaximumFractionDigits(2); + + out.println("cutlevel\tcompression\tfmeasure\tprec\trec"); + + int initialSize=-1; + for (int i=0; i<10 ; i++) { + + double cut = 1.0-(double) i/10;//(double) i/10; + + List<CandidateLinkkey> medoids = new ArrayList<>(); + root.cutAt(cut).stream()/*.filter( clust-> clust.getLK().size()>1)*/.forEach( clust -> medoids.add(clust.getMedoid())); + + List<CandidateLinkkey> selection = medoids.stream().filter( m -> m.getLinksSize()>1 && eval.discriminability(m)>=discThreshold).collect(Collectors.toList()); + //List<CandidateLinkkey> selection = medoids.stream().collect(Collectors.toList()); + if (initialSize==-1) initialSize= selection.size(); + + AntichainDisj disj = new AntichainDisj(selection.size()>nbSelect?selection.subList(0,nbSelect):selection); + + double prec = supEval.getPrecision(disj); + double rec = supEval.getRecall(disj); + double fmeasure = EvalMeasures.hmean(prec,rec); + + out.println(f.format(1-cut)+"\t"+f.format(1- ((double) disj.getCandidates().size()/initialSize))+"\t"+f.format(fmeasure)+"\t"+f.format(prec)+"\t"+f.format(rec)); + + } + out.flush(); + } + +} diff --git a/src/main/java/fr/inrialpes/exmo/linkkey/viz/LKLatticeViz.java b/src/main/java/fr/inrialpes/exmo/linkkey/viz/LKLatticeViz.java new file mode 100644 index 0000000000000000000000000000000000000000..bfce3a9aacb1f21b7420b519e61f798a683698c0 --- /dev/null +++ b/src/main/java/fr/inrialpes/exmo/linkkey/viz/LKLatticeViz.java @@ -0,0 +1,106 @@ +package fr.inrialpes.exmo.linkkey.viz; + +import fr.inrialpes.exmo.linkkey.CandidateLinkkey; +import fr.inrialpes.exmo.linkkey.ExtractionResult; + +import javax.swing.*; +import java.awt.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +public class LKLatticeViz extends JComponent { + + private static class Node { + CandidateLinkkey c; + int x,y; + } + + private final int nodeSize = 30; + private final int verticalSpacing = 50; + private int horizontalSpacing = 200; + private ExtractionResult result; + + public java.util.List<java.util.List<CandidateLinkkey>> levels; + + private Map<CandidateLinkkey,Node> nodes; + + private int maxLevelSize; + + public LKLatticeViz(ExtractionResult r) { + result=r; + computeLevels(); + System.out.println(result.getCandidates().size()); + assignPositions(); + } + + + + private void computeLevels() { + levels = new ArrayList<>(); + int nbLevels = result.getBottom().getMaxToTop()+1; + for (int i=0;i<nbLevels;i++) levels.add(new ArrayList<>()); + HashSet<CandidateLinkkey> visited=new HashSet<>(); + //HashSet<CandidateLinkkey> 
current + + + /*for (CandidateLinkkey x : result.getCandidates()) { + levels.get(x.getMaxToTop()).add(x); + }*/ + for (java.util.List<CandidateLinkkey> l : levels) maxLevelSize = Math.max(maxLevelSize,l.size()); + } + + private int computeWidth() { + return verticalSpacing*(maxLevelSize+2); + } + + private void assignPositions() { + int y=horizontalSpacing; + nodes = new HashMap<>(); + for (java.util.List<CandidateLinkkey> l : levels) { + int x = verticalSpacing + (computeWidth()-verticalSpacing*l.size())/2; + for (CandidateLinkkey c : l) { + Node n = new Node(); + n.c=c; + n.x=x; + n.y=y; + nodes.put(c,n); + x+=verticalSpacing; + } + y+=horizontalSpacing; + } + } + + @Override + public Dimension getPreferredSize() { + return new Dimension(computeWidth(),levels.size()*verticalSpacing); + } + + @Override + protected void paintComponent(Graphics g) { + super.paintComponent(g); + Graphics2D g2 = (Graphics2D) g; + g2.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); + g2.setColor(Color.black); + for (Node n : nodes.values()) { + g.drawOval(n.x,n.y,nodeSize,nodeSize); + /* for (CandidateLinkkey c : n.c.getChildren()) { + Node child = nodes.get(c); + g.drawLine(n.x+nodeSize/2,n.y+nodeSize,child.x+nodeSize/2,child.y); + }*/ + } + + } + + public static void draw(ExtractionResult res) { + LKLatticeViz v = new LKLatticeViz(res); + JFrame f = new JFrame("BlaBla"); + f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + JScrollPane jsp = new JScrollPane(); + jsp.setViewportView(v); + f.setContentPane(jsp); + f.setSize(new Dimension(300,300)); + f.setVisible(true); + } +}
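Usage sketch (not part of the patch): the snippet below illustrates how the clustering entry points added here are meant to be wired together, mirroring ClusteringInterface.clustering() and ExpeClust.clusteringExpe(). The class name ClusteringSketch and the 0.5 cut threshold are illustrative only; res is assumed to be an ExtractionResult produced by the existing extraction pipeline.

import fr.inrialpes.exmo.linkkey.CandidateLinkkey;
import fr.inrialpes.exmo.linkkey.ExtractionResult;
import fr.inrialpes.exmo.linkkey.clustering.*;
import fr.inrialpes.exmo.linkkey.utils.renderer.TxtLinkKeyRenderer;

public class ClusteringSketch {
    static void cluster(ExtractionResult res) {
        // similarity over the candidate lattice; the top element is normalised by the
        // size of the cartesian product of the two instance sets
        ConceptSimilarity sim =
                new ClosureSimilarity(res.getTop(), res.getInstances1().size() * res.getInstances2().size());
        // hierarchical clustering of all candidates (full pairwise similarities, no convexity constraint)
        LKHClust clust = new LKHClust(res.getCandidates(), sim, false, false);
        Cluster root = clust.getRoot();
        // cut the dendrogram at an illustrative threshold and keep one medoid per cluster
        TxtLinkKeyRenderer r = new TxtLinkKeyRenderer(res);
        for (Cluster c : root.cutAt(0.5)) {
            CandidateLinkkey medoid = c.getMedoid();
            System.out.println(medoid.getLinksSize() + " " + r.toString(medoid, true));
        }
    }
}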