⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 basicdeduper.java

📁 wekaUT 是 University of Texas at Austin 开发的基于 Weka 的半监督学习(semi-supervised learning)分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
    m_distanceMatrix = new double[n][n];    if (m_useBlocking) {      for (int i = 0; i < n; i++) {	Arrays.fill(m_distanceMatrix[i], Double.MAX_VALUE);      }            Blocking blocker = new Blocking();      blocker.buildIndex(m_testInstances);      InstancePair[] pairs = blocker.getMostSimilarPairs(m_testInstances.numClasses() * 50);      for (int i = 0; i < pairs.length && pairs[i] != null; i++) {	int idx1 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance1)).intValue();	int idx2 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance2)).intValue();	m_distanceMatrix[idx1][idx2] = m_distanceMatrix[idx1][idx2] = pairs[i].value;      }    }        for (int i = 0; i < n; i++) {      for (int j = i+1; j < n; j++) {	if (!m_useBlocking || m_distanceMatrix[i][j] != Double.MAX_VALUE) {	  m_distanceMatrix[i][j] = m_distanceMatrix[j][i] =	    m_metric.distance((Instance) m_instancesHash.get(new Integer(i)),			      (Instance) m_instancesHash.get(new Integer(j)));	  Instance i1 = (Instance) m_instancesHash.get(new Integer(i));	  Instance j1 = (Instance) m_instancesHash.get(new Integer(j));	}      }    }  }    /** Outputs the current clustering   *   * @exception Exception if something goes wrong   */  public void printIntClusters() throws Exception {    if (m_clusters == null)      throw new Exception ("Clusters were not created");    for (int i = 0; i < m_clusters.size(); i++) {      Cluster cluster = (Cluster) m_clusters.get(i);      System.out.println ("Cluster " + i + " consists of " + cluster.size() + " elements");      for (int j = 0; j < cluster.size(); j++) {	//		Instance instance = (Instance) m_instancesHash.get((Integer) cluster.elementAt(j));	Integer idx = (Integer) cluster.get(j);	Instance instance = (Instance) m_instancesHash.get(idx);	System.out.println("\t\t" + instance);      }    }  }  /** A helper function that stratifies a training set and selects a proportion of   * true objects for training   * @param instances a set of instances from which to 
select the training data   * @return a subset of those instances   */  Instances getTrainingSet(Instances instances) {    HashMap classHash = new HashMap();    int numTotalInstances = instances.numInstances();    Random rand = new Random(numTotalInstances);    Instances trainInstances = new Instances(instances, (int) (m_trainProportion * numTotalInstances));    // hash each class     for (int i=0; i < instances.numInstances(); i++) {      Instance instance = instances.instance(i);      Double classValue = new Double(instance.classValue());      if (classHash.containsKey(classValue)) {	ArrayList list = (ArrayList) classHash.get(classValue);	list.add(instance);      } else {	// this class has not been seen before, create an entry for it	ArrayList list = new ArrayList();	list.add(instance);	classHash.put(classValue, list);      }    }    // select a desired proportion of classes    ArrayList[] classes = new ArrayList[classHash.size()];    classes = (ArrayList[]) classHash.values().toArray(classes);    int numClasses = classes.length;    int[] indeces = PairwiseSelector.randomSubset((int) (m_trainProportion * numClasses), numClasses);    for (int i = 0; i < indeces.length; i++) {      for (int j = 0; j < classes[i].size(); j++) {	Instance instance = (Instance) classes[i].get(j);	trainInstances.add(instance);      }    }     return trainInstances;  }  /** Set the amount of training   * @param trainProportion the proportion of the training set that will be used for learning   */  public void setTrainProportion(double trainProportion) {    m_trainProportion = trainProportion;  }  /** Get the amount of training   * @return the proportion of the training set that will be used for learning   */  public double getTrainProportion() {    return m_trainProportion;  }  /** Given a test set, calculate the number of true pairs   * @param instances a set of objects, class has the true object ID   * @returns the number of true same-class pairs   */  protected int 
numTruePairs(Instances instances) {    int numTruePairs = 0;    // get the class counts    HashMap classCountMap = new HashMap();    for (int i = 0; i < instances.numInstances(); i++) {      Instance instance = instances.instance(i);      Double classValue = new Double(instance.classValue());      if (classCountMap.containsKey(classValue)) {	Integer counts = (Integer) classCountMap.get(classValue);	classCountMap.put(classValue, new Integer(counts.intValue() + 1));      } else {	classCountMap.put(classValue, new Integer(1));      }    }        // calculate the number of pairs    Iterator iterator = classCountMap.values().iterator();    while (iterator.hasNext()) {      int counts = ((Integer) iterator.next()).intValue();      numTruePairs += counts * (counts - 1) / 2;    }    return numTruePairs;  }  /** Given two clusters, calculate the number of true pairs that   * will be added when the clusters are merged   * @param cluster1 the first cluster to merge   * @param cluster2 the second cluster to merge   * @returns the number of true pairs that will appear once clusters are merged   */  protected int numCrossClusterTruePairs(Cluster cluster1, Cluster cluster2) {    int numCCTruePairs = 0;    int[] classCounts1 = new int[m_numObjects];    for (int i = 0; i < cluster1.size(); i++) {      Integer instanceIdx = (Integer) cluster1.get(i);      classCounts1[(int)m_classValues[instanceIdx.intValue()]]++;    }    int[] classCounts2 = new int[m_numObjects];    for (int i = 0; i < cluster2.size(); i++) {      Integer instanceIdx = (Integer) cluster2.get(i);      classCounts2[(int)m_classValues[instanceIdx.intValue()]]++;    }    for (int i = 0; i < m_numObjects; i++) {      numCCTruePairs += classCounts1[i] * classCounts2[i];      if (classCounts1[i] != 0 || classCounts2[i] != 0) { //  	System.out.println(i + "\t" + classCounts1[i] + "\t" + classCounts2[i]);      }    }    return numCCTruePairs;  }   /** Add the current state of things to statistics */  protected void 
accumulateStatistics() {    Object[] currentStats = new Object[16];    double precision = (m_numGoodPairs+0.0)/m_numTotalPairs;    double recall = (m_numGoodPairs+0.0)/m_numTruePairs;    double fmeasure = 0;    if (precision > 0) {  // avoid divide by zero in the p=0&r=0 case      fmeasure = 2 * (precision * recall) / (precision + recall);    }    int statIdx = 0;    currentStats[statIdx++] = new Double(m_numCurrentObjects);    // Accuracy statistics    currentStats[statIdx++] = new Double(recall);    currentStats[statIdx++] = new Double(precision);    currentStats[statIdx++] = new Double(fmeasure);    // Dupe density statistics    currentStats[statIdx++] = new Double(m_numTotalPairsTrain);    currentStats[statIdx++] = new Double(m_numPotentialDupePairsTrain);    currentStats[statIdx++] = new Double(m_numActualDupePairsTrain);    currentStats[statIdx++] = new Double(m_numPotentialNonDupePairsTrain);    currentStats[statIdx++] = new Double(m_numActualNonDupePairsTrain);    currentStats[statIdx++] = new Double((m_numActualNonDupePairsTrain > 0) ?					 
((m_numActualDupePairsTrain+0.0)/m_numActualNonDupePairsTrain) : 0);    currentStats[statIdx++] = new Double((m_numPotentialDupePairsTrain+0.0)/m_numTotalPairsTrain);    currentStats[statIdx++] = new Double(m_numTotalPairsTest);        currentStats[statIdx++] = new Double(m_numTruePairs);         currentStats[statIdx++] = new Double((m_numTruePairs + 0.0)/m_numTotalPairsTest);    // Timing statistics    currentStats[statIdx++] = new Double(m_trainTime);    currentStats[statIdx++] = new Double((System.currentTimeMillis() - m_testTimeStart)/1000.0);    m_statistics.add(currentStats);  }  /** Reset the current statistics */  protected void resetStatistics() {    m_statistics = new ArrayList();    m_numGoodPairs = 0;    m_numTotalPairs = 0;    m_testTimeStart = System.currentTimeMillis();  }   /** Set the InstanceMetric that is used   * @param metric the InstanceMetric that is used to dedupe   */  public void setMetric(InstanceMetric metric) {    m_metric = metric;  }  /** Get the InstanceMetric that is used   * @return the InstanceMetric that is used to dedupe   */  public InstanceMetric getMetric() {    return m_metric;  }  /** Turn debugging output on/off   * @param debug if true, debugging info will be printed   */  public void setDebug(boolean debug) {    m_debug = debug;  }  /** See whether debugging output is on/off   * @returns if true, debugging info will be printed   */  public boolean getDebug() {    return m_debug;  }  /** Turn debugging output on/off   * @param debug if true, blocking is on   */  public void setUseBlocking(boolean useBlocking) {    m_useBlocking = useBlocking;  }  /** See whether blocking is on/off   * @returns if true, blocking is on   */  public boolean getUseBlocking() {    return m_useBlocking;  }      /**   * Returns an enumeration describing the available options   *   * @return an enumeration of all the available options   **/  public Enumeration listOptions() {    Vector newVector = new Vector(2);    newVector.addElement(new 
Option("\tMetric.\n"				    +"\t(default=ClassifierInstanceMetric)", "M", 1,"-M metric_name metric_options"));    return newVector.elements();  }  /**   * Parses a given list of options.   *   * Valid options are:<p>   *   * -M metric options <p>   * InstanceMetric used <p>   *   * @param options the list of options as an array of strings   * @exception Exception if an option is not supported   *   **/  public void setOptions(String[] options) throws Exception {    String optionString;    String metricString = Utils.getOption('M', options);    if (metricString.length() != 0) {      String[] metricSpec = Utils.splitOptions(metricString);      String metricName = metricSpec[0];       metricSpec[0] = "";      System.out.println("Metric name: " + metricName + "\nMetric parameters: " + concatStringArray(metricSpec));      setMetric(InstanceMetric.forName(metricName, metricSpec));    }  }  /**   * Gets the current settings of Greedy Agglomerative Clustering   *   * @return an array of strings suitable for passing to setOptions()   */  public String [] getOptions() {    String [] options = new String [250];    int current = 0;    if (m_useBlocking == false) {       options[current++] = "-NB";     }    options[current++] = "-T";    options[current++] = "" + m_trainProportion;    if (m_debug) {      options[current++] = "-D";    }    options[current++] = "-M";    options[current++] = Utils.removeSubstring(m_metric.getClass().getName(), "weka.deduping.metrics.");    if (m_metric instanceof OptionHandler) {      String[] metricOptions = ((OptionHandler)m_metric).getOptions();      for (int i = 0; i < metricOptions.length; i++) {	options[current++] = metricOptions[i];      }    }     while (current < options.length) {      options[current++] = "";    }    return options;  }  /** A little helper to create a single String from an array of Strings   * @param strings an array of strings   * @returns a single concatenated string   */  public static String concatStringArray(String[] 
strings) {    StringBuffer buffer = new StringBuffer();    for (int i = 0; i < strings.length; i++) {      buffer.append(strings[i]);      buffer.append(" ");    }    return buffer.toString();  } }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -