⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 classifierinstancemetric.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    ClassifierInstanceMetric.java *    Copyright (C) 2003 Mikhail Bilenko * */package weka.deduping.metrics;import java.util.ArrayList;import java.util.Vector;import java.util.Enumeration;import java.util.Date;import java.text.SimpleDateFormat;import java.io.*;import weka.deduping.*;import weka.core.*;import weka.classifiers.DistributionClassifier;import weka.classifiers.sparse.SVMlight;import weka.classifiers.Evaluation;/**  * ClassifierInstanceMetric class employs a classifier that uses * values returned by various StringMetric's on individual fields * as features and outputs a confidence value that corresponds to * similarity between records * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.5 $ */public class ClassifierInstanceMetric extends InstanceMetric implements OptionHandler, Serializable {    /** Classifier that is used for estimating similarity between records */  protected DistributionClassifier m_classifier = new SVMlight();    /** A selector object that will create training sets */  PairwiseSelector m_selector = new PairwiseSelector();  /** The desired number of training pairs */  protected int m_numPosPairs = 200;  protected int m_numNegPairs = 200;  /** StringMetric prototype that are to be used on each field */  protected StringMetric [] m_stringMetrics = new StringMetric[0];  /** The actual array of metrics */  protected StringMetric [][] m_fieldMetrics = null;  /** A temporary dataset that contains diff-instances for training the classifier */  protected Instances m_diffInstances = null;  /** A default constructor */  public ClassifierInstanceMetric() {  }         /**   * Generates a new ClassifierInstanceMetric that computes   * similarity between records using the specified attributes. Has to   * initialize all metric fields with default string metrics   *   * @param attrIdxs the indeces of attributes that the metric will use   * @exception Exception if the distance metric has not been   * generated successfully.  */  public void buildInstanceMetric(int[] attrIdxs) throws Exception {    // initialize the array of metrics for each attribute    m_attrIdxs = attrIdxs;    m_fieldMetrics = new StringMetric[m_stringMetrics.length][m_attrIdxs.length];    for (int i = 0; i < m_stringMetrics.length; i++) {      for (int j = 0; j < m_attrIdxs.length; j++) {	m_fieldMetrics[i][j] = (StringMetric) m_stringMetrics[i].clone();      }    }   }    /**   * Create a new metric for operating on specified instances   * @param trainData instances for training the metric   * @param testData instances that will be used for testing   */  public void trainInstanceMetric(Instances trainData, Instances testData) throws Exception {    m_selector.initSelector(trainData);        // if we have data-dependent or trainable metrics    // (e.g. vector-space or learnable ED), build them with available    // test/train data    ArrayList [] attrStringLists = null;    for (int i = 0; i < m_stringMetrics.length; i++) {      if (m_stringMetrics[i] instanceof DataDependentStringMetric) {	// populate the list of strings for each attribute now that we need them	if (attrStringLists == null) { 	  attrStringLists = new ArrayList[m_attrIdxs.length];	  for (int j = 0; j < m_attrIdxs.length; j++) {	    attrStringLists[j] = getStringList(trainData, testData, m_attrIdxs[j]);	  }	}	// initialize the data-dependent metric for each attribute	for (int j = 0; j < m_attrIdxs.length; j++) {	  ((DataDependentStringMetric)m_fieldMetrics[i][j]).buildMetric(attrStringLists[j]);	}      }      // if the metric is learnable, train it      if (m_stringMetrics[i] instanceof LearnableStringMetric) {    	for (int j = 0; j < m_attrIdxs.length; j++) {	  ArrayList strPairList = m_selector.getStringPairList(trainData, m_attrIdxs[j],							       m_numPosPairs, m_numNegPairs,							       m_fieldMetrics[i][j]);	  ((LearnableStringMetric)m_fieldMetrics[i][j]).trainMetric(strPairList);	}      }    }        // train the classifier    m_diffInstances = m_selector.getInstances(m_attrIdxs, m_fieldMetrics, m_numPosPairs, m_numNegPairs);    // get the stats on actual training data    AttributeStats classStats = m_diffInstances.attributeStats(m_diffInstances.classIndex());    m_numActualPosPairs = classStats.nominalCounts[0];    m_numActualNegPairs = classStats.nominalCounts[1];        // SANITY CHECK - CROSS-VALIDATION    if (false) {       // dump diff-instances into a temporary file      try {	File diffDir = new File("/tmp/diff");	diffDir.mkdir();	String diffName = trainData.relationName() + "." +	  Utils.removeSubstring(m_fieldMetrics[0].getClass().getName(), "weka.deduping.metrics.");	m_diffInstances.setRelationName(diffName);	PrintWriter writer = new PrintWriter(new BufferedOutputStream (new FileOutputStream(diffDir.getPath() + "/" +											    diffName + ".arff")));	writer.println(m_diffInstances.toString());	writer.close();	// Do a sanity check - dump out the diffInstances, and	// evaluation classification with an SVM. 	long trainTimeStart = System.currentTimeMillis();	SVMlight classifier = new SVMlight();	Evaluation eval = new Evaluation(m_diffInstances);	eval.crossValidateModel(classifier, m_diffInstances, 5);	writer = new PrintWriter(new BufferedOutputStream (new FileOutputStream(diffDir.getPath() + "/" +										diffName + ".dat", true)));	writer.println(eval.pctCorrect());	writer.close();	System.out.println("** Record Sanity:" + (System.currentTimeMillis() - trainTimeStart) + " ms; " +			   eval.pctCorrect() + "% correct\t" +			   eval.numFalseNegatives(0) + "(" + eval.falseNegativeRate(0) + "%) false negatives\t" +			   eval.numFalsePositives(0) + "(" + eval.falsePositiveRate(0) + "%) false positives\t");            } catch (Exception e) {	e.printStackTrace();	System.out.println(e.toString());       }    }    // END SANITY CHECK    System.out.println(getTimestamp() + ":  Building " + m_classifier.getClass().getName());    m_classifier.buildClassifier(m_diffInstances);    System.out.println(getTimestamp() + ":  Done building " + m_classifier.getClass().getName());  }    /** An internal method for creating a list of strings for a   * particular attribute from two sets of instances: trianing and   * test data   * @param trainData a dataset of records in the training fold   * @param testData a dataset of records in the testing fold   * @param attrIdx the index of the attribute for which strings are to be collected   * @return a list of strings that occur for this attribute; duplicates are allowed   */  protected ArrayList getStringList(Instances trainData, Instances testData, int attrIdx) {    ArrayList stringList = new ArrayList();    // go through the training data and get all string values for that attribute    if (trainData != null) {       for (int i = 0; i < trainData.numInstances(); i++) {	Instance instance = trainData.instance(i);	String value = instance.stringValue(attrIdx);	stringList.add(value);      }    }    // go through the test data and get all string values for that attribute    for (int i = 0; i < testData.numInstances(); i++) {      Instance instance = testData.instance(i);      String value = instance.stringValue(attrIdx);      stringList.add(value);    }    return stringList;  }       /**   * Returns distance between two records    * @param instance1 First record.   * @param instance2 Second record.   * @exception Exception if distance could not be calculated.   */  public double distance(Instance instance1, Instance instance2) throws Exception {    // go through all metrics collecting the values of distances for different attributes    double[] distances = new double[m_attrIdxs.length * m_stringMetrics.length + 1];    int counter = 0;     for (int i = 0; i < m_attrIdxs.length; i++) {      String str1 = instance1.stringValue(m_attrIdxs[i]);      String str2 = instance2.stringValue(m_attrIdxs[i]);      for (int j = 0; j < m_stringMetrics.length; j++) { 	if (m_stringMetrics[j].isDistanceBased()) { 	  distances[counter++] = m_fieldMetrics[j][i].distance(str1, str2);	} else {	  distances[counter++] = m_fieldMetrics[j][i].similarity(str1, str2);	}      }    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -