📄 pairwiseselector.java
字号:
} protected InstancePair createRandomTrainInstancePair(HashSet usedPairSet, HashMap checksumMap) { int numTrainingRecords = m_instances.numInstances(); int idx1, idx2; Integer pairCode, pairCodeOrdered; int numTries = 0; int maxNumTries = 1000; InstancePair pair = null; Random r = new Random(usedPairSet.size() + checksumMap.size()); // select a random pair of instances that has not been // seen before or until we exhaust all possible pairs do { idx1 = r.nextInt(numTrainingRecords); idx2 = r.nextInt(numTrainingRecords); while (idx2 == idx1) { // prevent selecting the same instance twice idx2 = r.nextInt(numTrainingRecords); } pairCode = new Integer(idx1 * numTrainingRecords + idx2); pairCodeOrdered = new Integer(idx2 * numTrainingRecords + idx1); numTries++; } while ((usedPairSet.contains(pairCode) || usedPairSet.contains(pairCodeOrdered)) && numTries < maxNumTries); if (numTries < maxNumTries) { // create the training instance and add it usedPairSet.add(pairCode); usedPairSet.add(pairCodeOrdered); Instance instance1 = m_instances.instance(idx1); Instance instance2 = m_instances.instance(idx2); boolean positive = (instance1.classValue() == instance2.classValue());; pair = new InstancePair(instance1, instance2, positive, 0); } return pair; } /** * Create a nonsparse instance with features corresponding to the * metric values between used fields of the two given instances * @param instancePair a pair of instances that is used for creating the new diffInstance * @param attrIdxs indeces of fields that should be utilized * @param metrics the string metrics that are used to create the training instances * @return a newly created diffInstance, or null if all diff-values are 0 */ protected Instance createInstance (InstancePair pair, int[] attrIdxs, StringMetric[][] metrics ) throws Exception { int numAttributes = metrics.length * metrics[0].length + 1; int numNonNegativeValues = 0; int numValues = 0; double[] values = new double[numAttributes]; for (int i = 0; i < attrIdxs.length; i++) { String val1 = pair.instance1.stringValue(attrIdxs[i]); String val2 = pair.instance2.stringValue(attrIdxs[i]); for (int j = 0; j < metrics.length; j++) { if (metrics[j][i].isDistanceBased()) { values[numValues] = metrics[j][i].distance(val1, val2); } else { values[numValues] = metrics[j][i].similarity(val1, val2); } if (values[numValues] != 0) { numNonNegativeValues++; } numValues++; } } if (pair.positive) { values[numAttributes-1] = 0; } else { values[numAttributes-1] = 1; } // if there were non-zero attributes, return the instance, otherwise return null if (numNonNegativeValues > 0) { return new Instance(1.0, values); } else { return null; } } /** Check whether an instance is unique * @param instance instance to be checked * @param checksumMap a map where checksum values are mapped to lists of instances * @param sumCoeffs coefficients used for computing the checksum * @return true if the instance is unique, false otherwise */ protected boolean isUniqueInstance(Instance instance, HashMap checksumMap, double[] checksumCoeffs) { double checksum = 0; // compute the checksum and round off to overcome machine precision errors for (int i = 0; i < instance.numValues()-1; i++) { checksum += checksumCoeffs[i] * instance.value(i); } checksum = (float) checksum; // if this checksum was encountered before, get a list of instances // that have this checksum, and check if any of them are dupes of this one if (checksumMap.containsKey(new Double(checksum))) { ArrayList checksumList = (ArrayList) checksumMap.get(new Double(checksum)); boolean unique = true; for (int k = 0; k < checksumList.size() && unique; k++) { Instance nextDiffInstance = (Instance) checksumList.get(k); unique = false; for (int l = 0; l < nextDiffInstance.numValues()-1 && !unique; l++) { if (((float)nextDiffInstance.value(l)) != ((float)instance.value(l))) { unique = true; } } if (unique == false) { return false; } } checksumList.add(instance); return true; // no dupes were found among instances with the same checksum } else { // this checksum has not been encountered before ArrayList checksumList = new ArrayList(); checksumMap.put(new Double(checksum), checksumList); checksumList.add(instance); return true; } } /** * Provide an array of string pairs metric using given training instances * * @param metric the metric to train * @param instances data to train the metric on * @exception Exception if training has gone bad. * @return a list of StringPair's that is training data for a particular field */ public ArrayList getStringPairList(Instances instances, int attrIdx, int numPosPairs, int numNegPairs, StringMetric metric) throws Exception { System.out.println("Selecting strings out of " + instances.numInstances() + " instances, first is \n" + instances.instance(0)); ArrayList pairList = new ArrayList(); TreeSet posPairSet = null; TreeSet negPairSet = null; double [] posPairDistances = null; double [] negPairDistances = null; Iterator iterator = null; int numPossiblePosStrPairs = 0, numPossibleNegStrPairs = 0; int numActualPositives = 0, numActualNegatives = 0; // SELECT POSITIVE PAIRS switch (m_posStringMode) { case STRING_PAIRS_EASIEST: posPairSet = new TreeSet(new ReverseComparator()); posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx); numPossiblePosStrPairs = posPairSet.size(); // select numPositives iterator = posPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) { StringPair posPair = (StringPair) iterator.next(); pairList.add(posPair); } break; case STRING_PAIRS_HARDEST: posPairSet = new TreeSet(); posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx); numPossiblePosStrPairs = posPairSet.size(); // select numPositives examples iterator = posPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) { StringPair posPair = (StringPair) iterator.next(); pairList.add(posPair); } break; case STRING_PAIRS_RANDOM: // Get string pairs for a given attribute ArrayList strPairList = new ArrayList(); for (int i = 0; i < m_posPairList.size(); i++) { InstancePair pair = (InstancePair) m_posPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); if (!str1.equals(str2) && haveCommonTokens(str1, str2)) { StringPair strPair = new StringPair(str1, str2, true, 0); strPairList.add(strPair); } else { System.out.println("Equal strings, or no common tokens - NOT adding: " + str1 + "\t" + str2); } } numPossiblePosStrPairs = strPairList.size(); // if we have fewer pairs available than requested, return all the ones that were created if (strPairList.size() <= numPosPairs) { System.out.println("INSUFFICIENT available POSITIVE examples, using all " + strPairList.size()); pairList = strPairList; } else { // if we have more than enough potential pairs, sample randomly with replacement int[] indexes = randomSubset(numPosPairs, strPairList.size()); System.out.println("SUFFICIENT available POSITIVE examples, randomly selected " + indexes.length + " of " + strPairList.size()); for (int i = 0; i < indexes.length; i++) { pairList.add(strPairList.get(indexes[i])); } } // for (int i =0 ; i < strPairList.size(); i++) { // StringPair pair = (StringPair) strPairList.get(i); // System.out.println(pair.str1 + "\t\t\t" + pair.str2); // } break; default: throw new Exception("Unknown method for selecting positive pairs: " + m_posStringMode); } numActualPositives = pairList.size(); // we don't need negative string pairs for AffineProbMetric if (!metric.getClass().getName().equals("weka.deduping.metrics.AffineProbMetric")) { // SELECT NEGATIVE PAIRS unless this is AffineProbMetric - it doesn't need negatives switch (m_negStringMode) { case STRING_PAIRS_EASIEST: // Create a map with *all* negatives negPairSet = new TreeSet(); negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx); numPossibleNegStrPairs = negPairSet.size(); iterator = negPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) { StringPair negPair = (StringPair) iterator.next(); pairList.add(negPair); System.out.println("EASY: " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2); } break; case STRING_PAIRS_HARDEST: negPairSet = new TreeSet(new ReverseComparator()); negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx); numPossibleNegStrPairs = negPairSet.size(); // We will hash each pair of classes that was used so that we don't end up with // too many pairs from the same combination of two classes HashSet usedComboSet = new HashSet(); iterator = negPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) { StringPair negPair = (StringPair) iterator.next(); Double class1class2HashValue = new Double(negPair.class1 * 100000 + negPair.class2); if (!usedComboSet.contains(class1class2HashValue)) { // kludge - comment out for cora1 pairList.add(negPair); // System.out.println("HARD: " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2); usedComboSet.add(class1class2HashValue); // add reverse combo (or allow two per class if commented out // usedComboSet.add(new Double(negPair.class2 * 1000 + negPair.class1)); <- reverse combo } } break; case STRING_PAIRS_RANDOM: // Get string pairs for a given attribute ArrayList strPairList = new ArrayList(); for (int i = 0; i < m_negPairList.size(); i++) { InstancePair pair = (InstancePair) m_negPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, false, 0); strPairList.add(strPair); } } numPossibleNegStrPairs = strPairList.size(); // if we have fewer pairs available than requested, return all the ones that were created if (strPairList.size() <= numNegPairs) { System.out.println("INSUFFICIENT available NEGATIVE examples, using all " + strPairList.size()); pairList.addAll(strPairList); } else { // if we have enough potential pairs, randomly sample with replacement int[] indexes = randomSubset(numNegPairs, strPairList.size()); System.out.println("SUFFICIENT available NEGATIVE examples, randomly selected " + indexes.length + " of " + strPairList.size()); for (int i = 0; i < indexes.length; i++) { pairList.add(strPairList.get(indexes[i])); } } break; default: throw new Exception("Unknown method for selecting negative pairs: " + m_negStringMode); } } numActualNegatives = pairList.size() - numActualPositives; System.out.println(); System.out.println("**POSITIVES: requested=" + numPosPairs + "\tpossible=" + numPossiblePosStrPairs + "\tactual=" + numActualPositives); System.out.println("**NEGATIVES: requested=" + numNegPairs + "\tpossible=" + numPossibleNegStrPairs + "\tactual=" + numActualNegatives); return pairList; } /** Add a pair to a TreeSet so that there are no collisions, and no values are erased * @param set a set to which a new pair should be added * @param pair a new pair of strings that is to be added; value * fields holds the distance between the strings * @return the unique value of the distance (possibly perturbed) * with which the pair was added */ protected double addUniquePair(TreeSet set, StringPair pair) { Random random = new Random(); double epsilon = 0.0000001; int counter = 0; while (set.contains(pair)) { double perturbation; if (pair.value == 0) { perturbation = Double.MIN_VALUE * random.nextInt(m_numPotentialPositives); } else { perturbation = pair.value * epsilon * ((random.nextDouble() > 0.5) ? 1 : -1); } pair.value += perturbation; counter++; if (counter % 10 == 0) { // increase perturbations if "nearby" values have been exhausted epsilon *= 10; } } set.add(pair); return pair.value; } /** Populate a provided treeset with all positive StringPair's * @param metric a metric that will be used to calculate distance * @param pairSet an empty TreeSet that will be populated * @param attrIdx the index of the attribute for which positive * string pairs are being accumulated * @return an array with distance values of the created pairs */ protected double[] populatePosStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception { double [] posPairDistances = new double[m_numPotentialPositives]; Arrays.fill(posPairDistances, Double.MIN_VALUE); int posCounter = 0; for (int i = 0; i < m_posPairList.size(); i++) { InstancePair pair = (InstancePair) m_posPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); // unless the two fields are exact duplicates, create a new pair if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, true, metric.similarity(str1, str2)); posPairDistances[posCounter++] = addUniquePair(strPairSet, strPair); } } return posPairDistances; } /** Populate a provided treeset with a sufficient population of negative StringPair's * @param metric a metric that will be used to calculate distance between strings * @param pairSet an empty TreeSet that will be populated * @param attrIdx the index of the attribute for which positive * string pairs are being accumulated * @return an array with distance values of the created pairs */ protected double[] populateNegStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception { // Create a map with *all* positives double [] negPairDistances = new double[m_numPotentialNegatives]; Arrays.fill(negPairDistances, Double.MIN_VALUE); int negCounter = 0; int[] negPairIdxs; // get a random sample if we have too many possible negatives TODO - are we limiting ourselves here??? negPairIdxs = randomSubset(20000, m_numPotentialNegatives); for (int i = 0; i < negPairIdxs.length; i++) { InstancePair pair = (InstancePair) m_negPairList.get(negPairIdxs[i]); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); // unless the two fields are exact duplicates, create a new pair if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, false, metric.similarity(str1, str2));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -