📄 basicdeduper.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * BasicDeduper.java * Copyright (C) 2003 Mikhail Bilenko * */package weka.deduping;import weka.core.*;import weka.deduping.metrics.*;import weka.deduping.blocking.*;import java.io.Serializable;import java.util.*;import weka.clusterers.Cluster;/** A basic deduper class that takes a set of objects and * identifies disjoint subsets of duplicates * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.7 $ */public class BasicDeduper extends Deduper implements OptionHandler, Serializable { /** A metric measuring similarity between every pair of instances */ InstanceMetric m_metric = new SumInstanceMetric(); /** The proportion of the training fold that should be used for training*/ protected double m_trainProportion = 1.0; /** distance matrix containing the distance between each pair */ protected double[][] m_distanceMatrix = null; /** instance hash, where each Integer index is hashed to an instance */ protected HashMap m_instancesHash = null; /** reverse instance hash, where each instance is hashed to its Integer index */ protected HashMap m_reverseInstancesHash = null; /** the attribute indeces on which to do deduping */ protected int[] m_attrIdxs = null; /** The total number of true objects */ protected int m_numObjects = 0; /** An array containing class values for instances (for faster statistics) */ protected double[] m_classValues = null; /** Number of clusters in the process*/ protected int m_numCurrentObjects = 0; /** holds the clusters */ protected ArrayList m_clusters = null; /** A set of instances to dedupe */ protected Instances m_testInstances = null; /** Use blocking ? */ protected boolean m_useBlocking = false; /** * temporary variable holding cluster assignments */ protected int [] m_clusterAssignments; /** verbose? */ protected boolean m_debug = false; /** Statistics */ protected int m_numTotalPairs = 0; protected int m_numGoodPairs = 0; protected int m_numTruePairs = 0; protected int m_numTotalPairsTrain = 0; // the overall number of pairs in the test split protected int m_numTotalPairsTest = 0; // the overall number of pairs in the test split protected int m_numPotentialDupePairsTrain = 0; protected int m_numActualDupePairsTrain = 0; protected int m_numPotentialNonDupePairsTrain = 0; protected int m_numActualNonDupePairsTrain = 0; protected double m_trainTime = 0; protected double m_testTimeStart = 0; /** Given training data, build the metrics required by the deduper * @param train a set of training data */ public void buildDeduper(Instances trainFold, Instances testInstances) throws Exception { Instances trainInstances = getTrainingSet(trainFold); m_numTotalPairsTrain = trainInstances.numInstances() * (trainInstances.numInstances() - 1) / 2; m_numPotentialDupePairsTrain = numTruePairs(trainInstances); m_numPotentialNonDupePairsTrain = m_numTotalPairsTrain - m_numPotentialDupePairsTrain; // if the indexes have not been set, use all except for class if (m_attrIdxs == null) { m_attrIdxs = new int[trainInstances.numAttributes() - 1]; int classIdx = trainInstances.classIndex(); int counter = 0; for (int i = 0; i < m_attrIdxs.length + 1; i++) { if (i != classIdx) { m_attrIdxs[counter++] = i; } } } // train the instance metric long trainTimeStart = System.currentTimeMillis(); m_metric.buildInstanceMetric(m_attrIdxs); m_metric.trainInstanceMetric(trainInstances, testInstances); // get training statistics m_numActualDupePairsTrain = m_metric.getNumActualPosPairs(); m_numActualNonDupePairsTrain = m_metric.getNumActualNegPairs(); m_trainTime = (System.currentTimeMillis() - trainTimeStart)/1000.0; m_distanceMatrix = null; } /** Identify duplicates within the testing data * @param testInstances a set of instances among which to identify duplicates * @param numObjects the number of "true object" sets to create * @return a list of object sets */ public void findDuplicates(Instances testInstances, int numObjects) throws Exception { m_numObjects = testInstances.numClasses(); m_numTruePairs = numTruePairs(testInstances); m_numTotalPairsTest = testInstances.numInstances() * (testInstances.numInstances() -1) / 2; resetStatistics(); hashInstances(testInstances); createDistanceMatrix(); // assign instances to singleton clusters m_numCurrentObjects = testInstances.numInstances(); m_clusterAssignments = new int[testInstances.numInstances()]; for (int i = 0; i < testInstances.numInstances(); i++) { m_clusterAssignments[i] = i; } // initialize m_clusters arraylist initIntClusters(); if (m_debug) { System.out.println("Starting with " + (m_numCurrentObjects) + " clusters; " + m_clusters.size() + "actual clusters; " + numObjects + " true objects desired"); } // merge clusters until desired number of clusters is reached while (m_numCurrentObjects > numObjects) { if (m_debug) { System.out.println("Merging with " + (m_numCurrentObjects) + " clusters left"); } mergeStep(); } System.out.println("Done deduping with " + m_clusters.size() + " clusters"); } /** * Computes the clusters from the cluster assignments * * @exception Exception if clusters could not be computed successfully */ public ArrayList initIntClusters() throws Exception { m_clusters = new ArrayList(); for (int i=0; i < m_testInstances.numInstances(); i++) { m_clusters.add(new Cluster(new Integer(i))); } return m_clusters; } /** * Internal method that finds two most similar clusters and merges them */ protected void mergeStep() throws Exception{ double bestDistance = Double.MAX_VALUE; double thisDistance; Cluster thisCluster, nextCluster; ArrayList mergeCandidatesList = new ArrayList(); int cluster1_index, cluster2_index; if (m_debug) { System.out.println("\nBefore merge step there are " + m_clusters.size() + " clusters; m_numCurrentObjects=" + m_numCurrentObjects); } // find two most similar clusters for (int i = 0; i < m_clusters.size()-1; i++){ thisCluster = (Cluster)m_clusters.get(i); for (int j = i+1; j < m_clusters.size(); j++) { thisDistance = clusterDistance(thisCluster, (Cluster) m_clusters.get(j)); // If there is a tie, add to the list of top distances if (thisDistance == bestDistance) { mergeCandidatesList.add(new Integer(i)); mergeCandidatesList.add(new Integer(j)); } else if (thisDistance < bestDistance) { // this is the best distance seen this far mergeCandidatesList.clear(); mergeCandidatesList.add(new Integer(i)); mergeCandidatesList.add(new Integer(j)); bestDistance = thisDistance; } } } // randomly pick a most similar pair from the list of candidates int i1 = (int) (mergeCandidatesList.size() * Math.random()); int i2 = (i1 % 2 > 0) ? (i1 - 1) : (i1 + 1); int cluster1Idx = ((Integer) mergeCandidatesList.get(i1)).intValue(); int cluster2Idx = ((Integer) mergeCandidatesList.get(i2)).intValue(); if (m_debug) { System.out.println("\nMerging clusters " + cluster1Idx + "(" + ((Cluster)m_clusters.get(cluster1Idx)).get(0) + ") and " + cluster2Idx + "(" + ((Cluster)m_clusters.get(cluster2Idx)).get(0) + "); distance=" + bestDistance + " actual=" + clusterDistance((Cluster)m_clusters.get(cluster1Idx), (Cluster)m_clusters.get(cluster2Idx))); Instance in1 = (Instance) m_instancesHash.get(((Cluster)m_clusters.get(cluster1Idx)).get(0)); Instance in2 = (Instance) m_instancesHash.get(((Cluster)m_clusters.get(cluster2Idx)).get(0)); if (in1.classValue() == in2.classValue()) { System.out.println("good: " + bestDistance + "\t" + in1 + "\tand" + in2); } else { System.out.println("BAD: " + bestDistance + "\t" + in1 + "\tand" + in2); } } Cluster newCluster = mergeClusters(cluster1Idx, cluster2Idx); // have to remove in order because we're using index, argh if (cluster1Idx > cluster2Idx) { m_clusters.remove(cluster1Idx); m_clusters.remove(cluster2Idx); } else { m_clusters.remove(cluster2Idx); m_clusters.remove(cluster1Idx); } m_clusters.add(newCluster); m_numCurrentObjects--; } /** * internal method that returns the distance between two clusters */ protected double clusterDistance(Cluster cluster1, Cluster cluster2) { if (cluster2 == null || cluster1 == null) { try{ printIntClusters(); } catch(Exception e){} } int i1 = ((Integer) cluster1.get(0)).intValue(); int i2 = ((Integer) cluster2.get(0)).intValue(); return m_distanceMatrix[i1][i2]; } boolean fuckedUp = false; /** Internal method to merge two clusters and update distances */ protected Cluster mergeClusters (int cluster1Idx, int cluster2Idx) throws Exception { Cluster newCluster = new Cluster(); Cluster cluster1 = (Cluster) m_clusters.get(cluster1Idx); Cluster cluster2 = (Cluster) m_clusters.get(cluster2Idx); int cluster1FirstIdx =((Integer) cluster1.get(0)).intValue(); int cluster2FirstIdx =((Integer) cluster2.get(0)).intValue(); newCluster.copyElements(cluster1); newCluster.copyElements(cluster2); // Update the distance matrix depending on the linkage type // go through all clusters and update the distance from first element // to the first element of the new cluster for (int i = 0; i < m_clusters.size(); i++){ if (i != cluster1Idx && i != cluster2Idx) { // skip these clusters themselves Cluster currentCluster = (Cluster) m_clusters.get(i); int currClusterFirstIdx = ((Integer) currentCluster.get(0)).intValue(); if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] <= m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx]) { // first cluster is closer, no need to update // but just in case:// m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] =// m_distanceMatrix[currClusterFirstIdx][cluster2FirstIdx] =// m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx]; } else { // second cluster is closer, must update distance between the first representative m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] = m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] = m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx]; } // check for infinity links if (m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) { m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] = m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] = Double.POSITIVE_INFINITY; } if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) { m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] = m_distanceMatrix[currClusterFirstIdx][cluster2FirstIdx] = Double.POSITIVE_INFINITY; } } } // update pair statistics m_numTotalPairs += cluster1.size() * cluster2.size(); int newGoodPairs = numCrossClusterTruePairs(cluster1, cluster2); m_numGoodPairs += newGoodPairs; accumulateStatistics(); return newCluster; } /** * Create the hashtable from given Instances; * keys are numeric indeces, values are actual Instances * * @param data Instances * */ protected void hashInstances (Instances data) { m_testInstances = data; m_instancesHash = new HashMap(); m_reverseInstancesHash = new HashMap(); m_classValues = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { Instance instance = data.instance(i); m_classValues[i] = instance.classValue(); if (!m_instancesHash.containsValue(instance)) { Integer idx = new Integer(i); m_instancesHash.put(idx, instance); m_reverseInstancesHash.put(instance, idx); } else { System.out.println("STupid fuck, dupe! " + i); } } } /** * Fill the distance matrix with values using the metric * */ protected void createDistanceMatrix () throws Exception { int n = m_instancesHash.size(); double sim;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -