📄 pairwiseselector.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * PairwiseSelector.java * Copyright (C) 2003 Mikhail Bilenko * */package weka.deduping;import java.util.*;import java.io.Serializable;import weka.core.*;import weka.deduping.metrics.*;import weka.deduping.blocking.*;/** * PairwiseSelector class. Given a string metric and training data, * create a set of instance pairs that correspond to metric training data * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.11 $ */public class PairwiseSelector implements OptionHandler, Serializable { /** The set of instances used for training */ protected Instances m_instances = null; /** A hashmap where true object IDs are mapped to lists of strings of that object */ protected HashMap m_classInstanceMap = null; /** A list of classes, each element is the double value of the class attribute */ protected ArrayList m_classValueList = null; /** A list with all the positive examples as TrainingPair's */ protected ArrayList m_posPairList = null; /** A list with a sufficient pool of negative examples as TrainingPair's */ protected ArrayList m_negPairList = null; /** The number of possible same-class pairs */ protected int m_numPotentialPositives = 0; /** The number of possible different-class pairs */ protected int m_numPotentialNegatives = 0; /** Output debugging information */ protected boolean m_debug = false; /** The record pair selection method */ // positives public static final int POS_MODE_RANDOM_RECORDS = 1; public static final int POS_MODE_RANDOM_POSITIVES = 2; public static final int POS_MODE_STATIC_ACTIVE = 4; public static final Tag[] TAGS_POS_MODE = { new Tag(POS_MODE_RANDOM_RECORDS, "Random record pairs"), new Tag(POS_MODE_RANDOM_POSITIVES, "Random positive pairs"), new Tag(POS_MODE_STATIC_ACTIVE, "Static-active positive pairs"), }; protected int m_positivesMode = POS_MODE_RANDOM_POSITIVES; // should the rejected positives be spilled into the negatives set? protected boolean m_useRejectedPositives = true; // negatives public static final int NEG_MODE_RANDOM_RECORDS = 1; public static final int NEG_MODE_RANDOM_NEGATIVES = 2; public static final int NEG_MODE_IMPLICIT_NEGATIVES = 4; public static final Tag[] TAGS_NEG_MODE = { new Tag(NEG_MODE_RANDOM_RECORDS, "Random record pairs"), new Tag(NEG_MODE_RANDOM_NEGATIVES, "Random negative pairs"), new Tag(NEG_MODE_IMPLICIT_NEGATIVES, "Implicit negative pairs") }; protected int m_negativesMode = NEG_MODE_RANDOM_NEGATIVES; // should falsely selected implicit negatives be spilled into the positives set? protected boolean m_useFalseImplicitNegatives = true; /** String pair selection method */ public static final int STRING_PAIRS_RANDOM = 1; public static final int STRING_PAIRS_HARDEST = 2; public static final int STRING_PAIRS_EASIEST = 4; public static final Tag[] TAGS_STRING_PAIR_MODE = { new Tag(STRING_PAIRS_RANDOM, "Random string pairs"), new Tag(STRING_PAIRS_HARDEST, "Hardest string pairs"), new Tag(STRING_PAIRS_EASIEST, "Easiest string pairs") }; protected int m_posStringMode = STRING_PAIRS_RANDOM; protected int m_negStringMode = STRING_PAIRS_RANDOM; /** The maximum fraction of common tokens that instances can have to be included as implicit negatives */ protected double m_maxImplicitCommonTokenFraction = 0.2; /** We will need this reverse comparator class to traverse a TreeSet backwards */ public class ReverseComparator implements Comparator { public int compare(Object o1, Object o2) { Comparable c = (Comparable) o1; return -1 * c.compareTo(o2); } } /** A default constructor */ public PairwiseSelector() { } /** Initialize m_classInstanceMap and m_classValueList using a given set of instances * @param instances a set of instances from which pair examples will be selected */ public void initSelector(Instances instances) { m_instances = instances; m_classValueList = new ArrayList(); m_classInstanceMap = new HashMap(); m_numPotentialPositives = 0; m_numPotentialNegatives = 0; // go through all instances, hashing them into lists corresponding to each class Enumeration enum = instances.enumerateInstances(); while (enum.hasMoreElements()) { Instance instance = (Instance) enum.nextElement(); if (instance.classIsMissing()) { System.err.println("Instance " + instance + " has missing class!!!"); continue; } Double classValue = new Double(instance.classValue()); // if this class has been seen, add instance to the class's list if (m_classInstanceMap.containsKey(classValue)) { ArrayList classInstanceList = (ArrayList) m_classInstanceMap.get(classValue); classInstanceList.add(instance); } else { // create a new list of instances for a previously unseen class ArrayList classInstanceList = new ArrayList(); classInstanceList.add(instance); m_classInstanceMap.put(classValue, classInstanceList); m_classValueList.add(classValue); } } // get the number of potential positive pairs Iterator iterator = m_classInstanceMap.values().iterator(); while (iterator.hasNext()) { ArrayList classInstanceList = (ArrayList) iterator.next(); m_numPotentialPositives += classInstanceList.size() * (classInstanceList.size() - 1) / 2; } int numInstances = instances.numInstances(); m_numPotentialNegatives = numInstances * (numInstances - 1) / 2 - m_numPotentialPositives; createPosPairList(); createNegPairList(); System.out.println("m_numPotentialPositives=" + m_numPotentialPositives + "\tm_numPotentialNegatives=" + m_numPotentialNegatives); } /** Generate a training set of diffInstances. initSelector must have been called earlier * to initialize m_posPairList and m_negPairList. * @param attrIdxs indeces of fields that should be utilized * @param stringMetrics metrics that should be used on training pairs to generate diffInstances * @param numPosPairs the desired number of positive (same-class) diffInstance's * @param numNegPairs the desired number of negative (different-class) diffInstance's */ public Instances getInstances(int [] attrIdxs, StringMetric[][] stringMetrics, int numPosPairs, int numNegPairs) throws Exception { int numActualPositives = 0; int numActualNegatives = 0; HashMap checksumMap = new HashMap(); HashSet usedPairSet = new HashSet(); int numTrainingRecords = m_instances.numInstances(); double[] checksumCoeffs = new double[stringMetrics.length * stringMetrics[0].length]; if (m_posPairList == null || m_negPairList == null) { throw new Exception("Called PairwiseSelector.getInstances before initalization via initSelector!"); } /*** Create the Instances dataset ***/ // first, create all the numeric attributes FastVector attrInfoVector = new FastVector(); Random r = new Random(numPosPairs + numNegPairs); for (int i = 0; i < stringMetrics.length; i++) { for (int j = 0; j < stringMetrics[i].length; j++) { Attribute attr = new Attribute("" + i + "-" + j); attrInfoVector.addElement(attr); checksumCoeffs[i*stringMetrics[i].length + j] = r.nextDouble(); } } // create the class attribute FastVector classValues = new FastVector(); classValues.addElement("pos"); classValues.addElement("neg"); Attribute classAttr = new Attribute("class", classValues); attrInfoVector.addElement(classAttr); // create the dataset and set the class attribute Instances instances = new Instances("diffInstances", attrInfoVector, numPosPairs + numNegPairs); instances.setClass(classAttr); /*** Positives selection ***/ switch (m_positivesMode) { case POS_MODE_RANDOM_RECORDS: // just pick m_numPosPairs random record pairs int numMisfires = 0; for (int i = 0; i < numPosPairs && numMisfires < 1000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { instances.add(trainInstance); if (trainInstance.value(trainInstance.numValues()-1) == 0) { numActualPositives++; } else { numActualNegatives++; } } else { i--; numMisfires++; } } break; case POS_MODE_RANDOM_POSITIVES: // we are sampling from all same-class pairs in the training fold // randomize the indeces of positive examples and select the desired number numMisfires = 0; int [] posPairIdxs = randomSubset(m_numPotentialPositives, m_numPotentialPositives); for (int i = 0; i < posPairIdxs.length && numActualPositives < numPosPairs && numMisfires < 500; i++) { Instance posInstance = createInstance((InstancePair) m_posPairList.get(posPairIdxs[i]), attrIdxs, stringMetrics); if (posInstance != null && isUniqueInstance(posInstance, checksumMap, checksumCoeffs)) { instances.add(posInstance); numActualPositives++; } else { numMisfires++; } } break; case POS_MODE_STATIC_ACTIVE: Blocking blocker = new Blocking(); blocker.buildIndex(m_instances); InstancePair[] pairs = blocker.getMostSimilarPairs(numPosPairs*2); numMisfires = 0; for (int i = 0; (numActualPositives + numActualNegatives) < numPosPairs && i < pairs.length && pairs[i] != null && numMisfires < 500; i++) { Instance trainInstance = createInstance(pairs[i], attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { if (pairs[i].positive == true) { instances.add(trainInstance); numActualPositives++; } else { if (m_useRejectedPositives) { instances.add(trainInstance); numActualNegatives++; } } } else { numMisfires++; } } System.out.println("After static-active:\t" + numActualPositives + " positives and " + numActualNegatives + " negatives"); break; default: throw new Exception("Unknown positive selection mode: " + m_positivesMode); } /*** Negatives selection ***/ switch (m_negativesMode) { case NEG_MODE_RANDOM_RECORDS: // just pick m_numNegPairs random record pairs int numMisfires = 0; for (int i = 0; i < numNegPairs && numMisfires < 1000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { instances.add(trainInstance); if (trainInstance.value(trainInstance.numValues()-1) == 0) { numActualPositives++; } else { numActualNegatives++; } } else { i--; numMisfires++; } } break; case NEG_MODE_RANDOM_NEGATIVES: // we are sampling from all different-class pairs in the training fold // randomize the indeces of negative examples and select the desired number numMisfires = 0; int numUniqueNegatives = 0; int [] negPairIdxs = randomSubset(m_numPotentialNegatives, m_numPotentialNegatives); for (int i = 0; i < negPairIdxs.length && numUniqueNegatives < numNegPairs && numMisfires < 1000; i++) { Instance negInstance = createInstance((InstancePair) m_negPairList.get(negPairIdxs[i]), attrIdxs, stringMetrics); if (negInstance != null && isUniqueInstance(negInstance, checksumMap, checksumCoeffs)) { instances.add(negInstance); numActualNegatives++; numUniqueNegatives++; } else { numMisfires++; } } break; case NEG_MODE_IMPLICIT_NEGATIVES: numMisfires = 0; for (int i = 0; i < numNegPairs && numMisfires < 30000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { // calculate the fraction of common tokens StringBuffer s1 = new StringBuffer(); StringBuffer s2 = new StringBuffer(); for (int j = 0; j < pair.instance1.numAttributes(); j++) { s1.append(pair.instance1.stringValue(j)); s1.append(" "); s2.append(pair.instance2.stringValue(j)); s2.append(" "); } if (fractionCommonTokens(s1.toString(), s2.toString()) <= m_maxImplicitCommonTokenFraction) { // check if the negative is bogus if (trainInstance.value(trainInstance.numValues()-1) == 0) { System.out.print("False negative!\n\t" + pair.instance1 + "\n\t" + pair.instance2); if (m_useFalseImplicitNegatives) { numActualPositives++; } else { trainInstance.setValue(trainInstance.numValues()-1, 1); numActualNegatives++; } instances.add(trainInstance); } else { // true implicit negative numActualNegatives++; instances.add(trainInstance); } } else { // try an extra pair if this one didn't work out due to too many positive pairs numMisfires++; i--; } } else { // try an extra pair if this one was null or not unique numMisfires++; i--; } } break; } System.out.println(); System.out.println("POSITIVES: requested=" + numPosPairs + "\tpossible=" + m_numPotentialPositives + "\tactual=" + numActualPositives); System.out.println("NEGATIVES: requested=" + numNegPairs + "\tpossible=" + m_numPotentialNegatives + "\tactual=" + numActualNegatives); return instances;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -