⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pairwiseselector.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    PairwiseSelector.java *    Copyright (C) 2003 Mikhail Bilenko * */package weka.deduping;import java.util.*;import java.io.Serializable;import weka.core.*;import weka.deduping.metrics.*;import weka.deduping.blocking.*;/**  * PairwiseSelector class.  Given a string metric and training data, * create a set of instance pairs that correspond to metric training data * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.11 $ */public class PairwiseSelector implements OptionHandler, Serializable {  /** The set of instances used for training */  protected Instances m_instances = null;    /** A hashmap where true object IDs are mapped to lists of strings of that object */  protected HashMap m_classInstanceMap = null;  /** A list of classes, each element is the double value of the class attribute */  protected ArrayList m_classValueList = null;  /** A list with all the positive examples as TrainingPair's */  protected ArrayList m_posPairList = null;  /** A list with a sufficient pool of negative examples as TrainingPair's */  protected ArrayList m_negPairList = null;    /** The number of possible same-class pairs */  protected int m_numPotentialPositives = 0;  /** The number of possible different-class pairs */  protected int m_numPotentialNegatives = 0;  /** Output debugging information */  protected boolean m_debug = false;  /** The record pair selection method */  // positives  public static final int POS_MODE_RANDOM_RECORDS = 1;  public static final int POS_MODE_RANDOM_POSITIVES = 2;  public static final int POS_MODE_STATIC_ACTIVE = 4;  public static final Tag[] TAGS_POS_MODE = {    new Tag(POS_MODE_RANDOM_RECORDS, "Random record pairs"),    new Tag(POS_MODE_RANDOM_POSITIVES, "Random positive pairs"),    new Tag(POS_MODE_STATIC_ACTIVE, "Static-active positive pairs"),  };  protected int m_positivesMode = POS_MODE_RANDOM_POSITIVES;  // should the rejected positives be spilled into the negatives set?  protected boolean m_useRejectedPositives = true;  // negatives  public static final int NEG_MODE_RANDOM_RECORDS = 1;  public static final int NEG_MODE_RANDOM_NEGATIVES = 2;  public static final int NEG_MODE_IMPLICIT_NEGATIVES = 4;  public static final Tag[] TAGS_NEG_MODE = {    new Tag(NEG_MODE_RANDOM_RECORDS, "Random record pairs"),    new Tag(NEG_MODE_RANDOM_NEGATIVES, "Random negative pairs"),    new Tag(NEG_MODE_IMPLICIT_NEGATIVES, "Implicit negative pairs")  };  protected int m_negativesMode = NEG_MODE_RANDOM_NEGATIVES;  // should falsely selected implicit negatives be spilled into the positives set?  protected boolean m_useFalseImplicitNegatives = true;  /** String pair selection method */  public static final int STRING_PAIRS_RANDOM = 1;  public static final int STRING_PAIRS_HARDEST = 2;  public static final int STRING_PAIRS_EASIEST = 4;  public static final Tag[] TAGS_STRING_PAIR_MODE = {     new Tag(STRING_PAIRS_RANDOM, "Random string pairs"),     new Tag(STRING_PAIRS_HARDEST, "Hardest string pairs"),     new Tag(STRING_PAIRS_EASIEST, "Easiest string pairs")       };  protected int m_posStringMode = STRING_PAIRS_RANDOM;  protected int m_negStringMode = STRING_PAIRS_RANDOM;       /** The maximum fraction of common tokens that instances can have to      be included as implicit negatives */  protected double m_maxImplicitCommonTokenFraction = 0.2;    /** We will need this reverse comparator class to traverse a TreeSet backwards */  public class ReverseComparator implements Comparator {    public int compare(Object o1, Object o2) {      Comparable c = (Comparable) o1;      return -1 * c.compareTo(o2);    }  }  /** A default constructor */  public PairwiseSelector() {  }  /** Initialize m_classInstanceMap and m_classValueList using a given set of instances   * @param instances a set of instances from which pair examples will be selected   */  public void initSelector(Instances instances) {    m_instances = instances;    m_classValueList = new ArrayList();    m_classInstanceMap = new HashMap();    m_numPotentialPositives = 0;    m_numPotentialNegatives = 0;    // go through all instances, hashing them into lists corresponding to each class    Enumeration enum = instances.enumerateInstances();    while (enum.hasMoreElements()) {      Instance instance = (Instance) enum.nextElement();      if (instance.classIsMissing()) {	System.err.println("Instance " + instance + " has missing class!!!");	continue;      }      Double classValue = new Double(instance.classValue());      // if this class has been seen, add instance to the class's list      if (m_classInstanceMap.containsKey(classValue)) {	ArrayList classInstanceList = (ArrayList) m_classInstanceMap.get(classValue);	classInstanceList.add(instance);      } else {  // create a new list of instances for a previously unseen class	ArrayList classInstanceList = new ArrayList();	classInstanceList.add(instance);	m_classInstanceMap.put(classValue, classInstanceList);	m_classValueList.add(classValue);      }    }    // get the number of potential positive pairs    Iterator iterator = m_classInstanceMap.values().iterator();    while (iterator.hasNext()) {      ArrayList classInstanceList = (ArrayList) iterator.next();      m_numPotentialPositives += classInstanceList.size() * (classInstanceList.size() - 1) / 2;    }    int numInstances = instances.numInstances();    m_numPotentialNegatives = numInstances * (numInstances - 1) / 2 - m_numPotentialPositives;    createPosPairList();    createNegPairList();    System.out.println("m_numPotentialPositives=" + m_numPotentialPositives + "\tm_numPotentialNegatives=" + m_numPotentialNegatives);  }  /** Generate a training set of diffInstances. initSelector must have been called earlier   * to initialize m_posPairList and m_negPairList.         * @param attrIdxs indeces of fields that should be utilized   * @param stringMetrics metrics that should be used on training pairs to generate diffInstances   * @param numPosPairs the desired number of positive (same-class) diffInstance's   * @param numNegPairs the desired number of negative (different-class) diffInstance's   */  public Instances getInstances(int [] attrIdxs, StringMetric[][] stringMetrics,				int numPosPairs, int numNegPairs) throws Exception {    int numActualPositives = 0;    int numActualNegatives = 0;    HashMap checksumMap = new HashMap();    HashSet usedPairSet = new HashSet();    int numTrainingRecords = m_instances.numInstances();    double[] checksumCoeffs = new double[stringMetrics.length * stringMetrics[0].length];    if (m_posPairList == null || m_negPairList == null) {      throw new Exception("Called PairwiseSelector.getInstances before initalization via initSelector!");    }    /*** Create the Instances dataset ***/    // first, create all the numeric attributes    FastVector attrInfoVector = new FastVector();    Random r = new Random(numPosPairs + numNegPairs);    for (int i = 0; i < stringMetrics.length; i++) {      for (int j = 0; j < stringMetrics[i].length; j++) { 	Attribute attr = new Attribute("" + i + "-" + j);	attrInfoVector.addElement(attr);	checksumCoeffs[i*stringMetrics[i].length + j] = r.nextDouble();      }    }    // create the class attribute    FastVector classValues = new FastVector();    classValues.addElement("pos");    classValues.addElement("neg");    Attribute classAttr = new Attribute("class", classValues);    attrInfoVector.addElement(classAttr);    // create the dataset and set the class attribute    Instances instances = new Instances("diffInstances", attrInfoVector, numPosPairs + numNegPairs);    instances.setClass(classAttr);        /*** Positives selection ***/    switch (m_positivesMode) {    case POS_MODE_RANDOM_RECORDS:      // just pick m_numPosPairs random record pairs      int numMisfires = 0;       for (int i = 0; i < numPosPairs && numMisfires < 1000; i++) {	InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); 	  	Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics);	if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) {	  instances.add(trainInstance);	  if (trainInstance.value(trainInstance.numValues()-1) == 0) {	    numActualPositives++;	  } else {	    numActualNegatives++;	  }	} else {	  i--; 	  numMisfires++;	}      }      break;    case POS_MODE_RANDOM_POSITIVES:      // we are sampling from all same-class pairs in the training fold      // randomize the indeces of positive examples and select the desired number      numMisfires = 0;       int [] posPairIdxs = randomSubset(m_numPotentialPositives, m_numPotentialPositives);      for (int i = 0; i < posPairIdxs.length && numActualPositives < numPosPairs && numMisfires < 500; i++) {	Instance posInstance = createInstance((InstancePair) m_posPairList.get(posPairIdxs[i]),					      attrIdxs, stringMetrics);	if (posInstance != null && isUniqueInstance(posInstance, checksumMap, checksumCoeffs)) {	  instances.add(posInstance);	  numActualPositives++;	} else {	  numMisfires++;	}      }      break;    case POS_MODE_STATIC_ACTIVE:      Blocking blocker = new Blocking();      blocker.buildIndex(m_instances);      InstancePair[] pairs = blocker.getMostSimilarPairs(numPosPairs*2);      numMisfires = 0;       for (int i = 0;	   (numActualPositives + numActualNegatives) < numPosPairs && i < pairs.length && pairs[i] != null && numMisfires < 500; i++) {	Instance trainInstance = createInstance(pairs[i], attrIdxs, stringMetrics);	if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) {	  if (pairs[i].positive == true) { 	    instances.add(trainInstance);	    numActualPositives++;	  } else {	    if (m_useRejectedPositives) {	      instances.add(trainInstance);	      numActualNegatives++;	    } 	  }	} else {	  numMisfires++;	}      }      System.out.println("After static-active:\t" + numActualPositives + " positives and " +			 numActualNegatives + " negatives");      break;    default:      throw new Exception("Unknown positive selection mode: " + m_positivesMode);    }        /*** Negatives selection ***/    switch (m_negativesMode) {    case NEG_MODE_RANDOM_RECORDS:      // just pick m_numNegPairs random record pairs      int numMisfires = 0;       for (int i = 0; i < numNegPairs && numMisfires < 1000; i++) {	InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); 	  	Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics);	if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) {	  instances.add(trainInstance);	  if (trainInstance.value(trainInstance.numValues()-1) == 0) {	    numActualPositives++;	  } else {	    numActualNegatives++;	  }	} else {	  i--;	  numMisfires++;	}      }      break;    case NEG_MODE_RANDOM_NEGATIVES:      // we are sampling from all different-class pairs in the training fold      // randomize the indeces of negative examples and select the desired number      numMisfires = 0;      int numUniqueNegatives = 0;       int [] negPairIdxs = randomSubset(m_numPotentialNegatives, m_numPotentialNegatives);      for (int i = 0; i < negPairIdxs.length && numUniqueNegatives < numNegPairs && numMisfires < 1000; i++) {	Instance negInstance = createInstance((InstancePair) m_negPairList.get(negPairIdxs[i]),					      attrIdxs, stringMetrics);	if (negInstance != null && isUniqueInstance(negInstance, checksumMap, checksumCoeffs)) {	  instances.add(negInstance);	  numActualNegatives++;	  numUniqueNegatives++;	} else { 	  numMisfires++;	}       }      break;    case NEG_MODE_IMPLICIT_NEGATIVES:      numMisfires = 0;       for (int i = 0; i < numNegPairs && numMisfires < 30000; i++) {	InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); 	  	Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics);		if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) {	  // calculate the fraction of common tokens	  StringBuffer s1 = new StringBuffer();	  StringBuffer s2 = new StringBuffer(); 	  for (int j = 0; j < pair.instance1.numAttributes(); j++) {	    s1.append(pair.instance1.stringValue(j));	    s1.append(" ");	    s2.append(pair.instance2.stringValue(j));	    s2.append(" ");	  }	    	  if (fractionCommonTokens(s1.toString(), s2.toString()) <= m_maxImplicitCommonTokenFraction) {	    // check if the negative is bogus	    if (trainInstance.value(trainInstance.numValues()-1) == 0) {	      System.out.print("False negative!\n\t" + pair.instance1 + "\n\t" + pair.instance2);	      if (m_useFalseImplicitNegatives) {		numActualPositives++; 	      } else { 		trainInstance.setValue(trainInstance.numValues()-1, 1);		numActualNegatives++; 	      }	      instances.add(trainInstance);	    } else { // true implicit negative	      numActualNegatives++; 	      instances.add(trainInstance);	    } 	  } else {  // try an extra pair if this one didn't work out due to too many positive pairs	    numMisfires++;	    i--;	  }	} else {	  // try an extra pair if this one was null or not unique	  numMisfires++;	  i--;	}        }      break;    }    System.out.println();    System.out.println("POSITIVES:  requested=" + numPosPairs + "\tpossible=" + m_numPotentialPositives +		       "\tactual=" + numActualPositives);    System.out.println("NEGATIVES:  requested=" + numNegPairs + "\tpossible=" + m_numPotentialNegatives +		       "\tactual=" + numActualNegatives);    return instances;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -