⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 basicdeduper.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    BasicDeduper.java *    Copyright (C) 2003 Mikhail Bilenko * */package weka.deduping;import weka.core.*;import weka.deduping.metrics.*;import weka.deduping.blocking.*;import java.io.Serializable;import java.util.*;import weka.clusterers.Cluster;/** A basic deduper class that takes a set of objects and * identifies disjoint subsets of duplicates * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.7 $ */public class BasicDeduper extends Deduper implements OptionHandler, Serializable {  /** A metric measuring similarity between every pair of instances */  InstanceMetric m_metric = new SumInstanceMetric();  /** The proportion of the training fold that should be used for training*/  protected double  m_trainProportion = 1.0;    /** distance matrix containing the distance between each pair */  protected double[][] m_distanceMatrix = null;    /** instance hash, where each Integer index is hashed to an instance */  protected HashMap m_instancesHash = null;  /** reverse instance hash, where each instance is hashed to its Integer index */  protected HashMap m_reverseInstancesHash = null;  /** the attribute indeces on which to do deduping */  protected int[] m_attrIdxs = null;  /** The total number of true objects */  protected int m_numObjects = 0;    /** An array containing class values for instances (for faster statistics) */  protected double[] m_classValues = null;  /** Number of clusters in the process*/  protected int m_numCurrentObjects = 0;  /** holds the clusters */  protected ArrayList m_clusters = null;  /** A set of instances to dedupe */  protected Instances m_testInstances = null;  /** Use blocking ? */  protected boolean m_useBlocking = false;     /**   * temporary variable holding cluster assignments   */  protected int [] m_clusterAssignments;  /** verbose? */  protected boolean m_debug = false;  /** Statistics */  protected int m_numTotalPairs = 0;  protected int m_numGoodPairs = 0;  protected int m_numTruePairs = 0;  protected int m_numTotalPairsTrain = 0;  // the overall number of pairs in the test split  protected int m_numTotalPairsTest = 0;  // the overall number of pairs in the test split  protected int m_numPotentialDupePairsTrain = 0;  protected int m_numActualDupePairsTrain = 0;  protected int m_numPotentialNonDupePairsTrain = 0;  protected int m_numActualNonDupePairsTrain = 0;  protected double m_trainTime = 0;  protected double m_testTimeStart = 0;    /** Given training data, build the metrics required by the deduper   * @param train a set of training data   */  public void buildDeduper(Instances trainFold, Instances testInstances) throws Exception {    Instances trainInstances = getTrainingSet(trainFold);    m_numTotalPairsTrain = trainInstances.numInstances() * (trainInstances.numInstances() - 1) / 2;    m_numPotentialDupePairsTrain = numTruePairs(trainInstances);    m_numPotentialNonDupePairsTrain = m_numTotalPairsTrain - m_numPotentialDupePairsTrain;    // if the indexes have not been set, use all except for class    if (m_attrIdxs == null) {      m_attrIdxs = new int[trainInstances.numAttributes() - 1];      int classIdx = trainInstances.classIndex();      int counter = 0;      for (int i = 0; i < m_attrIdxs.length + 1; i++) {	if (i != classIdx) {	  m_attrIdxs[counter++] = i;	}      }     }    // train the instance metric    long trainTimeStart = System.currentTimeMillis();    m_metric.buildInstanceMetric(m_attrIdxs);    m_metric.trainInstanceMetric(trainInstances, testInstances);    // get training statistics    m_numActualDupePairsTrain = m_metric.getNumActualPosPairs();       m_numActualNonDupePairsTrain = m_metric.getNumActualNegPairs();            m_trainTime = (System.currentTimeMillis() - trainTimeStart)/1000.0;    m_distanceMatrix = null;  }  /** Identify duplicates within the testing data   * @param testInstances a set of instances among which to identify duplicates   * @param numObjects the number of "true object" sets to create   * @return a list of object sets   */  public void findDuplicates(Instances testInstances, int numObjects) throws Exception {    m_numObjects = testInstances.numClasses();    m_numTruePairs = numTruePairs(testInstances);    m_numTotalPairsTest = testInstances.numInstances() * (testInstances.numInstances() -1) / 2;    resetStatistics();    hashInstances(testInstances);    createDistanceMatrix();    // assign instances to singleton clusters    m_numCurrentObjects = testInstances.numInstances();    m_clusterAssignments = new int[testInstances.numInstances()];    for (int i = 0; i < testInstances.numInstances(); i++) {      m_clusterAssignments[i] = i;    }    // initialize m_clusters arraylist    initIntClusters();    if (m_debug) {      System.out.println("Starting with  " + (m_numCurrentObjects) + " clusters; " + m_clusters.size() +			 "actual clusters; " + numObjects + " true objects desired");    }    // merge clusters until desired number of clusters is reached    while (m_numCurrentObjects > numObjects) {      if (m_debug) {	System.out.println("Merging with " + (m_numCurrentObjects) + " clusters left");      }      mergeStep();    }    System.out.println("Done deduping with " + m_clusters.size() + " clusters");  }    /**   * Computes the clusters from the cluster assignments   *    * @exception Exception if clusters could not be computed successfully   */      public ArrayList initIntClusters() throws Exception {    m_clusters = new ArrayList();    for (int i=0; i < m_testInstances.numInstances(); i++) {      m_clusters.add(new Cluster(new Integer(i)));    }    return m_clusters;  }    /**   * Internal method that finds two most similar clusters and merges them   */  protected void mergeStep() throws Exception{    double bestDistance = Double.MAX_VALUE;    double thisDistance;    Cluster thisCluster, nextCluster;    ArrayList mergeCandidatesList = new ArrayList();    int cluster1_index, cluster2_index;    if (m_debug) {       System.out.println("\nBefore merge step there are " + m_clusters.size() +			 " clusters; m_numCurrentObjects=" + m_numCurrentObjects);    }    // find two most similar clusters     for (int i = 0; i < m_clusters.size()-1; i++){      thisCluster = (Cluster)m_clusters.get(i);      for (int j = i+1; j < m_clusters.size(); j++) {	thisDistance = clusterDistance(thisCluster, (Cluster) m_clusters.get(j));	// If there is a tie, add to the list of top distances	if (thisDistance == bestDistance) {	  mergeCandidatesList.add(new Integer(i));	  mergeCandidatesList.add(new Integer(j));	} else if (thisDistance < bestDistance) {  // this is the best distance seen this far	  mergeCandidatesList.clear();	  mergeCandidatesList.add(new Integer(i));	  mergeCandidatesList.add(new Integer(j));	  bestDistance = thisDistance;	}      }    }    // randomly pick a most similar pair from the list of candidates    int i1 = (int) (mergeCandidatesList.size() * Math.random());    int i2 = (i1 % 2 > 0) ? (i1 - 1) : (i1 + 1);    int cluster1Idx = ((Integer) mergeCandidatesList.get(i1)).intValue();    int cluster2Idx = ((Integer) mergeCandidatesList.get(i2)).intValue();    if (m_debug) {      System.out.println("\nMerging clusters " + cluster1Idx + "(" + ((Cluster)m_clusters.get(cluster1Idx)).get(0) + 			 ") and " + cluster2Idx + "(" + ((Cluster)m_clusters.get(cluster2Idx)).get(0) + 			 ");  distance=" + bestDistance +			 " actual=" + clusterDistance((Cluster)m_clusters.get(cluster1Idx),						      (Cluster)m_clusters.get(cluster2Idx)));      Instance in1 =  (Instance) m_instancesHash.get(((Cluster)m_clusters.get(cluster1Idx)).get(0));      Instance in2 =  (Instance) m_instancesHash.get(((Cluster)m_clusters.get(cluster2Idx)).get(0));      if (in1.classValue() == in2.classValue()) {	System.out.println("good: " + bestDistance + "\t" + in1 + "\tand" + in2);      } else {	System.out.println("BAD:  " + bestDistance + "\t" + in1 + "\tand" + in2);      }     }    Cluster newCluster = mergeClusters(cluster1Idx, cluster2Idx);    // have to remove in order because we're using index, argh    if (cluster1Idx > cluster2Idx) {      m_clusters.remove(cluster1Idx);      m_clusters.remove(cluster2Idx);    } else {      m_clusters.remove(cluster2Idx);      m_clusters.remove(cluster1Idx);    }    m_clusters.add(newCluster);    m_numCurrentObjects--;  }    /**   * internal method that returns the distance between two clusters   */  protected double clusterDistance(Cluster cluster1, Cluster cluster2) {    if (cluster2 == null || cluster1 == null) {      try{	printIntClusters();      } catch(Exception e){}    }	    int i1 = ((Integer) cluster1.get(0)).intValue();    int i2 = ((Integer) cluster2.get(0)).intValue();    return m_distanceMatrix[i1][i2];  }     boolean fuckedUp = false;    /** Internal method to merge two clusters and update distances   */  protected Cluster mergeClusters (int cluster1Idx, int cluster2Idx) throws Exception {    Cluster newCluster = new Cluster();    Cluster cluster1 = (Cluster) m_clusters.get(cluster1Idx);    Cluster cluster2 = (Cluster) m_clusters.get(cluster2Idx);    int cluster1FirstIdx =((Integer) cluster1.get(0)).intValue();    int cluster2FirstIdx =((Integer) cluster2.get(0)).intValue();     newCluster.copyElements(cluster1);    newCluster.copyElements(cluster2);        // Update the distance matrix depending on the linkage type    // go through all clusters and update the distance from first element    // to the first element of the new cluster    for (int i = 0; i < m_clusters.size(); i++){      if (i != cluster1Idx && i != cluster2Idx) { // skip these clusters themselves	Cluster currentCluster = (Cluster) m_clusters.get(i);	int currClusterFirstIdx = ((Integer) currentCluster.get(0)).intValue();	if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] <=	    m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx]) {	  // first cluster is closer, no need to update	  // but just in case://  	  m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] =//  	    m_distanceMatrix[currClusterFirstIdx][cluster2FirstIdx] =//  	    m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx];	} else {	  // second cluster is closer, must update distance between the first representative	  m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] =	    m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] =	    m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx];	}	// check for infinity links	if (m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) {	  m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] =	    m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] = Double.POSITIVE_INFINITY;	}	if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) {	  m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] =	    m_distanceMatrix[currClusterFirstIdx][cluster2FirstIdx] = Double.POSITIVE_INFINITY;	}      }    }        // update pair statistics    m_numTotalPairs += cluster1.size() * cluster2.size();    int newGoodPairs = numCrossClusterTruePairs(cluster1, cluster2);     m_numGoodPairs += newGoodPairs;    accumulateStatistics();    return newCluster;  }    /**   * Create the hashtable from given Instances;   * keys are numeric indeces, values are actual Instances   *   * @param data Instances   *   */  protected void hashInstances (Instances data) {    m_testInstances = data;    m_instancesHash = new HashMap();    m_reverseInstancesHash = new HashMap();    m_classValues = new double[data.numInstances()];            for (int i = 0; i < data.numInstances(); i++) {      Instance instance = data.instance(i);      m_classValues[i] = instance.classValue();      if (!m_instancesHash.containsValue(instance)) {	Integer idx = new Integer(i);	m_instancesHash.put(idx, instance);	m_reverseInstancesHash.put(instance, idx);      } else {	System.out.println("STupid fuck, dupe! " + i);      }    }  }    /**   * Fill the distance matrix with values using the metric   *   */  protected void createDistanceMatrix () throws Exception {    int n = m_instancesHash.size();    double sim;	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -