⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pairwiseselector.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
  }  protected InstancePair createRandomTrainInstancePair(HashSet usedPairSet, HashMap checksumMap) {    int numTrainingRecords = m_instances.numInstances();    int idx1, idx2;    Integer pairCode, pairCodeOrdered;    int numTries = 0;    int maxNumTries = 1000;    InstancePair pair = null;    Random r = new Random(usedPairSet.size() + checksumMap.size());    // select a random pair of instances that has not been    // seen before or until we exhaust all possible pairs    do {      idx1 = r.nextInt(numTrainingRecords);      idx2 = r.nextInt(numTrainingRecords);      while (idx2 == idx1) { // prevent selecting the same instance twice	idx2 = r.nextInt(numTrainingRecords);      }      pairCode = new Integer(idx1 * numTrainingRecords + idx2);      pairCodeOrdered = new Integer(idx2 * numTrainingRecords + idx1);      numTries++;    } while ((usedPairSet.contains(pairCode) || usedPairSet.contains(pairCodeOrdered))	     && numTries < maxNumTries);    if (numTries < maxNumTries) {       // create the training instance and add it      usedPairSet.add(pairCode);      usedPairSet.add(pairCodeOrdered);      Instance instance1 = m_instances.instance(idx1);      Instance instance2 = m_instances.instance(idx2);      boolean positive = (instance1.classValue() == instance2.classValue());;      pair = new InstancePair(instance1, instance2, positive, 0);    }    return pair; 	    }   /**   * Create a nonsparse instance with features corresponding to the   * metric values between used fields of the two given instances   * @param instancePair a pair of instances that is used for creating the new diffInstance   * @param attrIdxs indeces of fields that should be utilized   * @param metrics the string metrics that are used to create the training instances   * @return a newly created diffInstance, or null if all diff-values are 0   */  protected Instance createInstance (InstancePair pair, int[] attrIdxs, StringMetric[][] metrics ) throws Exception {    int numAttributes = metrics.length * metrics[0].length + 1;    int numNonNegativeValues = 0;    int numValues = 0;    double[] values = new double[numAttributes];      for (int i = 0; i < attrIdxs.length; i++) {      String val1 = pair.instance1.stringValue(attrIdxs[i]);      String val2 = pair.instance2.stringValue(attrIdxs[i]);      for (int j = 0; j < metrics.length; j++) { 	if (metrics[j][i].isDistanceBased()) { 	  values[numValues] = metrics[j][i].distance(val1, val2);	} else {	  values[numValues] = metrics[j][i].similarity(val1, val2);	}	if (values[numValues] != 0) {	  numNonNegativeValues++;	}	numValues++;      }    }        if (pair.positive) {      values[numAttributes-1] = 0;    } else {      values[numAttributes-1] = 1;    }    // if there were non-zero attributes, return the instance, otherwise return null    if (numNonNegativeValues > 0) {       return new Instance(1.0, values);    } else {      return null;    }  }  /** Check whether an instance is unique   * @param instance instance to be checked   * @param checksumMap a map where checksum values are mapped to lists of instances   * @param sumCoeffs coefficients used for computing the checksum   * @return true if the instance is unique, false otherwise   */  protected boolean isUniqueInstance(Instance instance, HashMap checksumMap, double[] checksumCoeffs) {    double checksum = 0;    // compute the checksum and round off to overcome machine precision errors    for (int i = 0; i < instance.numValues()-1; i++) {      checksum += checksumCoeffs[i] * instance.value(i);    }    checksum = (float) checksum;        // if this checksum was encountered before, get a list of instances    // that have this checksum, and check if any of them are dupes of this one    if (checksumMap.containsKey(new Double(checksum))) {      ArrayList checksumList = (ArrayList) checksumMap.get(new Double(checksum));      boolean unique = true;      for (int k = 0; k < checksumList.size() && unique; k++) {	Instance nextDiffInstance = (Instance) checksumList.get(k);	unique = false;	for (int l = 0; l < nextDiffInstance.numValues()-1 && !unique; l++) {	  if (((float)nextDiffInstance.value(l)) != ((float)instance.value(l))) {	    unique = true;	  } 	}	if (unique == false) {	  return false;	}      }      checksumList.add(instance);      return true;  // no dupes were found among instances with the same checksum    } else {  // this checksum has not been encountered before      ArrayList checksumList = new ArrayList();      checksumMap.put(new Double(checksum), checksumList);      checksumList.add(instance);      return true;    }  }      /**   * Provide an array of string pairs metric using given training instances   *   * @param metric the metric to train   * @param instances data to train the metric on   * @exception Exception if training has gone bad.   * @return a list of StringPair's that is training data for a particular field   */  public ArrayList getStringPairList(Instances instances, int attrIdx,				     int numPosPairs, int numNegPairs,				     StringMetric metric) throws Exception {    System.out.println("Selecting strings out of " + instances.numInstances() + " instances, first is \n" + instances.instance(0));    ArrayList pairList = new ArrayList();    TreeSet posPairSet = null;    TreeSet negPairSet = null;    double [] posPairDistances = null;    double [] negPairDistances = null;    Iterator iterator = null;    int numPossiblePosStrPairs = 0, numPossibleNegStrPairs = 0;    int numActualPositives = 0, numActualNegatives = 0;    // SELECT POSITIVE PAIRS    switch (m_posStringMode) {    case STRING_PAIRS_EASIEST:      posPairSet = new TreeSet(new ReverseComparator());      posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx);      numPossiblePosStrPairs = posPairSet.size();            // select numPositives       iterator = posPairSet.iterator();      for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) {	StringPair posPair = (StringPair) iterator.next();	pairList.add(posPair);      }      break;    case STRING_PAIRS_HARDEST:      posPairSet = new TreeSet();      posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx);      numPossiblePosStrPairs = posPairSet.size();      // select numPositives examples      iterator = posPairSet.iterator();      for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) {	StringPair posPair = (StringPair) iterator.next();	pairList.add(posPair);      }      break;    case STRING_PAIRS_RANDOM:      // Get string pairs for a given attribute      ArrayList strPairList = new ArrayList();      for (int i = 0; i < m_posPairList.size(); i++) {	InstancePair pair = (InstancePair) m_posPairList.get(i);	String str1 = pair.instance1.stringValue(attrIdx);	String str2 = pair.instance2.stringValue(attrIdx);	if (!str1.equals(str2) && haveCommonTokens(str1, str2)) {	  StringPair strPair = new StringPair(str1, str2, true, 0);	  strPairList.add(strPair);	} else {	  System.out.println("Equal strings, or no common tokens - NOT adding: " + str1 + "\t" + str2);	}      }       numPossiblePosStrPairs = strPairList.size();            // if we have fewer pairs available than requested, return all the ones that were created      if (strPairList.size() <= numPosPairs) {	System.out.println("INSUFFICIENT available POSITIVE examples, using all " + strPairList.size());	pairList = strPairList;      } else {	// if we have more than enough potential pairs, sample randomly with replacement	int[] indexes = randomSubset(numPosPairs, strPairList.size());	System.out.println("SUFFICIENT available POSITIVE examples, randomly selected " + indexes.length + " of " + strPairList.size());	for (int i = 0; i < indexes.length; i++) {	  pairList.add(strPairList.get(indexes[i]));	}      }      //  	for (int i =0 ; i < strPairList.size(); i++) {      //  	  StringPair pair = (StringPair) strPairList.get(i);      //  	  System.out.println(pair.str1 + "\t\t\t" + pair.str2);      //  	}            break;    default:      throw new Exception("Unknown method for selecting positive pairs: " + m_posStringMode);    }    numActualPositives = pairList.size();    // we don't need negative string pairs for AffineProbMetric    if (!metric.getClass().getName().equals("weka.deduping.metrics.AffineProbMetric")) {            // SELECT NEGATIVE PAIRS unless this is AffineProbMetric - it doesn't need negatives      switch (m_negStringMode) {      case STRING_PAIRS_EASIEST:	// Create a map with *all* negatives	negPairSet = new TreeSet();	negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx);	numPossibleNegStrPairs = negPairSet.size();      	iterator = negPairSet.iterator();	for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) {	  StringPair negPair = (StringPair) iterator.next();	  pairList.add(negPair);	  System.out.println("EASY:   " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2);	}	break;      case STRING_PAIRS_HARDEST:	negPairSet = new TreeSet(new ReverseComparator());	negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx);	numPossibleNegStrPairs = negPairSet.size();	// We will hash each pair of classes that was used so that we don't end up with	// too many pairs from the same combination of two classes	HashSet usedComboSet = new HashSet();          	iterator = negPairSet.iterator();	for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) {	  StringPair negPair = (StringPair) iterator.next();	  Double class1class2HashValue = new Double(negPair.class1 * 100000 + negPair.class2);	  if (!usedComboSet.contains(class1class2HashValue)) { //  kludge - comment out for cora1	    pairList.add(negPair);	    //	  System.out.println("HARD:   " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2);	    usedComboSet.add(class1class2HashValue);	    // add reverse combo (or allow two per class if commented out	    //	  usedComboSet.add(new Double(negPair.class2 * 1000 + negPair.class1));  <- reverse combo	  } 	  	}	break;      case STRING_PAIRS_RANDOM:	// Get string pairs for a given attribute	ArrayList strPairList = new ArrayList();	for (int i = 0; i < m_negPairList.size(); i++) {	  InstancePair pair = (InstancePair) m_negPairList.get(i);	  String str1 = pair.instance1.stringValue(attrIdx);	  String str2 = pair.instance2.stringValue(attrIdx);	  if (!str1.equals(str2)) {	    StringPair strPair = new StringPair(str1, str2, false, 0);	    strPairList.add(strPair);	  }	}	numPossibleNegStrPairs = strPairList.size();            	// if we have fewer pairs available than requested, return all the ones that were created	if (strPairList.size() <= numNegPairs) {	  System.out.println("INSUFFICIENT available NEGATIVE examples, using all " + strPairList.size());	  pairList.addAll(strPairList);	} else { // if we have enough potential pairs, randomly sample with replacement	  int[] indexes = randomSubset(numNegPairs, strPairList.size());	  System.out.println("SUFFICIENT available NEGATIVE examples, randomly selected " + indexes.length + " of " + strPairList.size());	  for (int i = 0; i < indexes.length; i++) {	    pairList.add(strPairList.get(indexes[i]));	  }	}	break;	      default:	throw new Exception("Unknown method for selecting negative pairs: " + m_negStringMode);      }    }    numActualNegatives = pairList.size() - numActualPositives;    System.out.println();    System.out.println("**POSITIVES:  requested=" + numPosPairs + "\tpossible=" + numPossiblePosStrPairs +		       "\tactual=" + numActualPositives);    System.out.println("**NEGATIVES:  requested=" + numNegPairs + "\tpossible=" + numPossibleNegStrPairs +		       "\tactual=" + numActualNegatives);    return pairList;  }  /** Add a pair to a TreeSet so that there are no collisions, and no values are erased   * @param set a set to which a new pair should be added   * @param pair a new pair of strings that is to be added; value   * fields holds the distance between the strings   * @return the unique value of the distance (possibly perturbed)   * with which the pair was added   */  protected double addUniquePair(TreeSet set, StringPair pair) {    Random random = new Random();    double epsilon = 0.0000001;    int counter = 0;    while (set.contains(pair)) {      double perturbation;      if (pair.value == 0) {	perturbation = Double.MIN_VALUE * random.nextInt(m_numPotentialPositives);      } else {	perturbation = pair.value * epsilon * ((random.nextDouble() > 0.5) ? 1 : -1);      }      pair.value += perturbation;      counter++;      if (counter % 10 == 0) {  // increase perturbations if "nearby" values have been exhausted	epsilon *= 10;      }    }    set.add(pair);    return pair.value;  }    /** Populate a provided treeset with all positive StringPair's   * @param metric a metric that will be used to calculate distance   * @param pairSet an empty TreeSet that will be populated   * @param attrIdx the index of the attribute for which positive   * string pairs are being accumulated   * @return an array with distance values of the created pairs   */  protected double[] populatePosStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception {    double [] posPairDistances = new double[m_numPotentialPositives];    Arrays.fill(posPairDistances, Double.MIN_VALUE);    int posCounter = 0;    for (int i = 0; i < m_posPairList.size(); i++) {      InstancePair pair = (InstancePair) m_posPairList.get(i);      String str1 = pair.instance1.stringValue(attrIdx);      String str2 = pair.instance2.stringValue(attrIdx);      // unless the two fields are exact duplicates, create a new pair      if (!str1.equals(str2)) {	StringPair strPair = new StringPair(str1, str2, true, metric.similarity(str1, str2));	posPairDistances[posCounter++] = addUniquePair(strPairSet, strPair);      }          }     return posPairDistances;  }  /** Populate a provided treeset with a sufficient population of negative StringPair's   * @param metric a metric that will be used to calculate distance between strings   * @param pairSet an empty TreeSet that will be populated   * @param attrIdx the index of the attribute for which positive   * string pairs are being accumulated   * @return an array with distance values of the created pairs   */  protected double[] populateNegStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception {    // Create a map with *all* positives    double [] negPairDistances = new double[m_numPotentialNegatives];    Arrays.fill(negPairDistances, Double.MIN_VALUE);    int negCounter = 0;    int[] negPairIdxs;    // get a random sample if we have too many possible negatives  TODO - are we limiting ourselves here???    negPairIdxs = randomSubset(20000, m_numPotentialNegatives);    for (int i = 0; i < negPairIdxs.length; i++) {      InstancePair pair = (InstancePair) m_negPairList.get(negPairIdxs[i]);      String str1 = pair.instance1.stringValue(attrIdx);      String str2 = pair.instance2.stringValue(attrIdx);      // unless the two fields are exact duplicates, create a new pair      if (!str1.equals(str2)) {	StringPair strPair = new StringPair(str1, str2, false, metric.similarity(str1, str2));

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -