📄 basicdeduper.java
字号:
m_distanceMatrix = new double[n][n]; if (m_useBlocking) { for (int i = 0; i < n; i++) { Arrays.fill(m_distanceMatrix[i], Double.MAX_VALUE); } Blocking blocker = new Blocking(); blocker.buildIndex(m_testInstances); InstancePair[] pairs = blocker.getMostSimilarPairs(m_testInstances.numClasses() * 50); for (int i = 0; i < pairs.length && pairs[i] != null; i++) { int idx1 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance1)).intValue(); int idx2 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance2)).intValue(); m_distanceMatrix[idx1][idx2] = m_distanceMatrix[idx1][idx2] = pairs[i].value; } } for (int i = 0; i < n; i++) { for (int j = i+1; j < n; j++) { if (!m_useBlocking || m_distanceMatrix[i][j] != Double.MAX_VALUE) { m_distanceMatrix[i][j] = m_distanceMatrix[j][i] = m_metric.distance((Instance) m_instancesHash.get(new Integer(i)), (Instance) m_instancesHash.get(new Integer(j))); Instance i1 = (Instance) m_instancesHash.get(new Integer(i)); Instance j1 = (Instance) m_instancesHash.get(new Integer(j)); } } } } /** Outputs the current clustering * * @exception Exception if something goes wrong */ public void printIntClusters() throws Exception { if (m_clusters == null) throw new Exception ("Clusters were not created"); for (int i = 0; i < m_clusters.size(); i++) { Cluster cluster = (Cluster) m_clusters.get(i); System.out.println ("Cluster " + i + " consists of " + cluster.size() + " elements"); for (int j = 0; j < cluster.size(); j++) { // Instance instance = (Instance) m_instancesHash.get((Integer) cluster.elementAt(j)); Integer idx = (Integer) cluster.get(j); Instance instance = (Instance) m_instancesHash.get(idx); System.out.println("\t\t" + instance); } } } /** A helper function that stratifies a training set and selects a proportion of * true objects for training * @param instances a set of instances from which to select the training data * @return a subset of those instances */ Instances getTrainingSet(Instances instances) { HashMap classHash = new HashMap(); int numTotalInstances = instances.numInstances(); Random rand = new Random(numTotalInstances); Instances trainInstances = new Instances(instances, (int) (m_trainProportion * numTotalInstances)); // hash each class for (int i=0; i < instances.numInstances(); i++) { Instance instance = instances.instance(i); Double classValue = new Double(instance.classValue()); if (classHash.containsKey(classValue)) { ArrayList list = (ArrayList) classHash.get(classValue); list.add(instance); } else { // this class has not been seen before, create an entry for it ArrayList list = new ArrayList(); list.add(instance); classHash.put(classValue, list); } } // select a desired proportion of classes ArrayList[] classes = new ArrayList[classHash.size()]; classes = (ArrayList[]) classHash.values().toArray(classes); int numClasses = classes.length; int[] indeces = PairwiseSelector.randomSubset((int) (m_trainProportion * numClasses), numClasses); for (int i = 0; i < indeces.length; i++) { for (int j = 0; j < classes[i].size(); j++) { Instance instance = (Instance) classes[i].get(j); trainInstances.add(instance); } } return trainInstances; } /** Set the amount of training * @param trainProportion the proportion of the training set that will be used for learning */ public void setTrainProportion(double trainProportion) { m_trainProportion = trainProportion; } /** Get the amount of training * @return the proportion of the training set that will be used for learning */ public double getTrainProportion() { return m_trainProportion; } /** Given a test set, calculate the number of true pairs * @param instances a set of objects, class has the true object ID * @returns the number of true same-class pairs */ protected int numTruePairs(Instances instances) { int numTruePairs = 0; // get the class counts HashMap classCountMap = new HashMap(); for (int i = 0; i < instances.numInstances(); i++) { Instance instance = instances.instance(i); Double classValue = new Double(instance.classValue()); if (classCountMap.containsKey(classValue)) { Integer counts = (Integer) classCountMap.get(classValue); classCountMap.put(classValue, new Integer(counts.intValue() + 1)); } else { classCountMap.put(classValue, new Integer(1)); } } // calculate the number of pairs Iterator iterator = classCountMap.values().iterator(); while (iterator.hasNext()) { int counts = ((Integer) iterator.next()).intValue(); numTruePairs += counts * (counts - 1) / 2; } return numTruePairs; } /** Given two clusters, calculate the number of true pairs that * will be added when the clusters are merged * @param cluster1 the first cluster to merge * @param cluster2 the second cluster to merge * @returns the number of true pairs that will appear once clusters are merged */ protected int numCrossClusterTruePairs(Cluster cluster1, Cluster cluster2) { int numCCTruePairs = 0; int[] classCounts1 = new int[m_numObjects]; for (int i = 0; i < cluster1.size(); i++) { Integer instanceIdx = (Integer) cluster1.get(i); classCounts1[(int)m_classValues[instanceIdx.intValue()]]++; } int[] classCounts2 = new int[m_numObjects]; for (int i = 0; i < cluster2.size(); i++) { Integer instanceIdx = (Integer) cluster2.get(i); classCounts2[(int)m_classValues[instanceIdx.intValue()]]++; } for (int i = 0; i < m_numObjects; i++) { numCCTruePairs += classCounts1[i] * classCounts2[i]; if (classCounts1[i] != 0 || classCounts2[i] != 0) { // System.out.println(i + "\t" + classCounts1[i] + "\t" + classCounts2[i]); } } return numCCTruePairs; } /** Add the current state of things to statistics */ protected void accumulateStatistics() { Object[] currentStats = new Object[16]; double precision = (m_numGoodPairs+0.0)/m_numTotalPairs; double recall = (m_numGoodPairs+0.0)/m_numTruePairs; double fmeasure = 0; if (precision > 0) { // avoid divide by zero in the p=0&r=0 case fmeasure = 2 * (precision * recall) / (precision + recall); } int statIdx = 0; currentStats[statIdx++] = new Double(m_numCurrentObjects); // Accuracy statistics currentStats[statIdx++] = new Double(recall); currentStats[statIdx++] = new Double(precision); currentStats[statIdx++] = new Double(fmeasure); // Dupe density statistics currentStats[statIdx++] = new Double(m_numTotalPairsTrain); currentStats[statIdx++] = new Double(m_numPotentialDupePairsTrain); currentStats[statIdx++] = new Double(m_numActualDupePairsTrain); currentStats[statIdx++] = new Double(m_numPotentialNonDupePairsTrain); currentStats[statIdx++] = new Double(m_numActualNonDupePairsTrain); currentStats[statIdx++] = new Double((m_numActualNonDupePairsTrain > 0) ? ((m_numActualDupePairsTrain+0.0)/m_numActualNonDupePairsTrain) : 0); currentStats[statIdx++] = new Double((m_numPotentialDupePairsTrain+0.0)/m_numTotalPairsTrain); currentStats[statIdx++] = new Double(m_numTotalPairsTest); currentStats[statIdx++] = new Double(m_numTruePairs); currentStats[statIdx++] = new Double((m_numTruePairs + 0.0)/m_numTotalPairsTest); // Timing statistics currentStats[statIdx++] = new Double(m_trainTime); currentStats[statIdx++] = new Double((System.currentTimeMillis() - m_testTimeStart)/1000.0); m_statistics.add(currentStats); } /** Reset the current statistics */ protected void resetStatistics() { m_statistics = new ArrayList(); m_numGoodPairs = 0; m_numTotalPairs = 0; m_testTimeStart = System.currentTimeMillis(); } /** Set the InstanceMetric that is used * @param metric the InstanceMetric that is used to dedupe */ public void setMetric(InstanceMetric metric) { m_metric = metric; } /** Get the InstanceMetric that is used * @return the InstanceMetric that is used to dedupe */ public InstanceMetric getMetric() { return m_metric; } /** Turn debugging output on/off * @param debug if true, debugging info will be printed */ public void setDebug(boolean debug) { m_debug = debug; } /** See whether debugging output is on/off * @returns if true, debugging info will be printed */ public boolean getDebug() { return m_debug; } /** Turn debugging output on/off * @param debug if true, blocking is on */ public void setUseBlocking(boolean useBlocking) { m_useBlocking = useBlocking; } /** See whether blocking is on/off * @returns if true, blocking is on */ public boolean getUseBlocking() { return m_useBlocking; } /** * Returns an enumeration describing the available options * * @return an enumeration of all the available options **/ public Enumeration listOptions() { Vector newVector = new Vector(2); newVector.addElement(new Option("\tMetric.\n" +"\t(default=ClassifierInstanceMetric)", "M", 1,"-M metric_name metric_options")); return newVector.elements(); } /** * Parses a given list of options. * * Valid options are:<p> * * -M metric options <p> * InstanceMetric used <p> * * @param options the list of options as an array of strings * @exception Exception if an option is not supported * **/ public void setOptions(String[] options) throws Exception { String optionString; String metricString = Utils.getOption('M', options); if (metricString.length() != 0) { String[] metricSpec = Utils.splitOptions(metricString); String metricName = metricSpec[0]; metricSpec[0] = ""; System.out.println("Metric name: " + metricName + "\nMetric parameters: " + concatStringArray(metricSpec)); setMetric(InstanceMetric.forName(metricName, metricSpec)); } } /** * Gets the current settings of Greedy Agglomerative Clustering * * @return an array of strings suitable for passing to setOptions() */ public String [] getOptions() { String [] options = new String [250]; int current = 0; if (m_useBlocking == false) { options[current++] = "-NB"; } options[current++] = "-T"; options[current++] = "" + m_trainProportion; if (m_debug) { options[current++] = "-D"; } options[current++] = "-M"; options[current++] = Utils.removeSubstring(m_metric.getClass().getName(), "weka.deduping.metrics."); if (m_metric instanceof OptionHandler) { String[] metricOptions = ((OptionHandler)m_metric).getOptions(); for (int i = 0; i < metricOptions.length; i++) { options[current++] = metricOptions[i]; } } while (current < options.length) { options[current++] = ""; } return options; } /** A little helper to create a single String from an array of Strings * @param strings an array of strings * @returns a single concatenated string */ public static String concatStringArray(String[] strings) { StringBuffer buffer = new StringBuffer(); for (int i = 0; i < strings.length; i++) { buffer.append(strings[i]); buffer.append(" "); } return buffer.toString(); } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -