⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 instancepair.java

📁 wekaUT是 University of Texas at Austin 开发的基于weka的半监督学习(semi-supervised learning)的分类器
💻 JAVA
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    InstancePair.java
 *    Copyright (C) 2002 Sugato Basu
 *
 */

package weka.clusterers;

import java.util.*;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Class for handling a pair of instances, in terms of indices of instances
 * in an Instances set.
 *
 * Two pairs are equal when their indices and link type match; cost and
 * activeScore do not take part in equals/hashCode. The natural ordering
 * (compareTo) is by descending activeScore and is therefore NOT consistent
 * with equals.
 */
public class InstancePair implements Comparable {

  /** first instance index */
  public int first;

  /**
   * second instance index. Pairs produced by getPairs always satisfy
   * first <= second; the constructors store the indices as given and do
   * not reorder them.
   */
  public int second;

  /** MUST_LINK, CANNOT_LINK or DONT_CARE_LINK */
  public int linkType;

  /** cost of violating constraint */
  public double cost;

  /**
   * score from active learning algorithm.
   * DEPRECATED: ACTIVE SCORE NO LONGER USED IN PCKMEANS
   */
  public double activeScore;

  /** must-link */
  public final static int MUST_LINK = 29;

  /** cannot-link */
  public final static int CANNOT_LINK = 31;

  /** don't care */
  public final static int DONT_CARE_LINK = 37;

  /** set by getPairs: whether the class attribute of the last processed dataset is a string */
  public static boolean m_isClassAttributeString = false;

  /** constructor */
  public InstancePair() {
  }

  /** constructor */
  public InstancePair(int a, int b) {
    first = a;
    second = b;
  }

  /** constructor */
  public InstancePair(int a, int b, int l) {
    first = a;
    second = b;
    linkType = l;
  }

  /** constructor */
  public InstancePair(int a, int b, int l, double c) {
    first = a;
    second = b;
    linkType = l;
    cost = c;
  }

  /** Compare function
   * @param a object to compare against
   * @return result of compareTo(InstancePair) if a is an InstancePair,
   * 0 for any other object (including null)
   * Note: Reverse of conventional compareTo, to force sort in descending order
   */
  public int compareTo(Object a) {
    if (a instanceof InstancePair) {
      return compareTo((InstancePair) a);
    }
    return 0;
  }

  /** Compare function
   * @param a pair to compare against
   * @return 0 if equal, -1 if this.activeScore > a.activeScore, +1 else
   * Note: Reverse of conventional compareTo, to force sort in descending order.
   * A NaN score sorts after every real score and two NaN scores tie, so
   * that x.compareTo(y) and y.compareTo(x) always have opposite signs.
   */
  public int compareTo(InstancePair a) {
    if (this.activeScore > a.activeScore) {
      return -1;
    }
    if (this.activeScore < a.activeScore) {
      return +1;
    }
    if (this.activeScore == a.activeScore) {
      return 0;
    }
    // Reaching here means at least one of the scores is NaN.
    boolean thisIsNaN = Double.isNaN(this.activeScore);
    boolean otherIsNaN = Double.isNaN(a.activeScore);
    if (thisIsNaN && otherIsNaN) {
      return 0;
    }
    return thisIsNaN ? +1 : -1;
  }

  /** Equals function
   * @param a object to compare against
   * @return true if a is an InstancePair with the same first, second and
   * linkType; false else (cost and activeScore are ignored)
   */
  public boolean equals(Object a) {
    if (!(a instanceof InstancePair)) {
      return false;
    }
    InstancePair b = (InstancePair) a;
    return this.first == b.first
      && this.second == b.second
      && this.linkType == b.linkType;
  }

  /** hashCode, combining exactly the fields compared by equals.
   * A plain product of the fields would map every pair that contains
   * index 0 to the same bucket, so the fields are mixed instead.
   */
  public int hashCode() {
    int result = 17;
    result = 31 * result + first;
    result = 31 * result + second;
    result = 31 * result + linkType;
    return result;
  }

  /** Finds whether index is in pair */
  boolean contains(int num) {
    return (first == num || second == num);
  }

  /** Returns an arraylist of random (both positive and negative) pair objects created from the input
   *  @param instances list of instances
   *  @param size number of pairs to return
   *  @return arraylist of pairs
   */
  public static ArrayList getPairs(Instances instances, int size) {
    return getPairs(instances, size, -1);
  }

  /** Returns an arraylist of pair objects created from the input set of instances.
   *  Sampling uses a fixed random seed, so the result is reproducible for
   *  a given input. As a side effect m_isClassAttributeString is updated.
   *  @param instances list of instances
   *  @param size number of pairs to return
   *  @param fractionMustLinks proportion of Must-Links; if -1 - sample randomly
   *  @return arraylist of pairs
   *  @throws IllegalArgumentException if more distinct pairs are requested
   *  than the data can provide (previously this looped forever). For string
   *  class attributes only the upper bound n*(n-1)/2 can be checked for
   *  must-links and random pairs.
   */
  public static ArrayList getPairs(Instances instances, int size, double fractionMustLinks) {
    ArrayList pairs = new ArrayList(size);
    // Mirrors the contents of 'pairs' so duplicate checks do not scan the list.
    HashSet seen = new HashSet();
    Random rand = new Random(42);
    int numInstances = instances.numInstances();
    boolean classIsString = instances.instance(0).classAttribute().isString();
    m_isClassAttributeString = classIsString;
    // Number of distinct unordered pairs of different instances.
    long totalPairs = ((long) numInstances) * (numInstances - 1) / 2;

    if (fractionMustLinks != -1) {
      int numMustLinks = (int) (fractionMustLinks * size);
      int numCannotLinks = size - numMustLinks;

      // stratify instances into lists for each class, counting on the way
      // how many pairs share a (non-missing) class value
      HashMap classListMap = new HashMap();
      long sameClassPairs = 0;
      for (int i = 0; i < numInstances; i++) {
        double value = instances.instance(i).classValue();
        Double classValue = new Double(value);
        ArrayList classList = (ArrayList) classListMap.get(classValue);
        if (classList == null) { // previously unseen class
          classList = new ArrayList();
          classListMap.put(classValue, classList);
        }
        // NaN (missing) class values never compare equal with ==, so they
        // cannot form must-links and are left out of the count.
        if (!Double.isNaN(value)) {
          sameClassPairs += classList.size();
        }
        classList.add(new Integer(i));
      }

      long maxMustLinks = classIsString ? totalPairs : sameClassPairs;
      long maxCannotLinks = totalPairs - sameClassPairs;
      if (numMustLinks > maxMustLinks) {
        throw new IllegalArgumentException("Requested " + numMustLinks
          + " must-links, but at most " + maxMustLinks + " can be created");
      }
      if (numCannotLinks > maxCannotLinks) {
        throw new IllegalArgumentException("Requested " + numCannotLinks
          + " cannot-links, but at most " + maxCannotLinks + " can be created");
      }

      // select must-links first
      int num = 0;
      while (num < numMustLinks) {
        int first = rand.nextInt(numInstances);
        int second = 0;
        if (!classIsString) {
          Double classValue = new Double(instances.instance(first).classValue());
          ArrayList classList = (ArrayList) classListMap.get(classValue);
          // skip classes with a single instance
          if (classList.size() < 2) {
            continue;
          }
          // select a random instance from the same class
          int idx = rand.nextInt(classList.size());
          second = ((Integer) classList.get(idx)).intValue();
        } else { // phylo profile case
          second = rand.nextInt(numInstances);
          while (second == first) {
            second = rand.nextInt(numInstances);
          }
        }
        if (first > second) { // flip if out of order
          int tmp = first;
          first = second;
          second = tmp;
        }

        InstancePair pair;
        boolean accept;
        if (classIsString) {
          // for handling string valued class attributes corr. to
          // multi-class phylogenetic profiles
          pair = stringClassPair(first, second,
                                 instances.instance(first), instances.instance(second));
          // cost < 1.0 is to filter homologs
          accept = pair.linkType == InstancePair.MUST_LINK && pair.cost < 1.0;
        } else {
          pair = new InstancePair(first, second, classValueLinkType(instances, first, second));
          accept = pair.linkType == InstancePair.MUST_LINK;
        }
        if (first != second && accept && !seen.contains(pair)) {
          seen.add(pair);
          pairs.add(pair);
          num++;
        }
      }

      // now add cannot-links - NB: for now not dealing with string attributes;
      // TODO: handle m_isClassAttributeString
      num = 0;
      while (num < numCannotLinks) {
        // we just sample randomly - arguably less time-efficient, but we don't need to
        // create another hash this way.
        int first = rand.nextInt(numInstances);
        int second = rand.nextInt(numInstances);
        while (instances.instance(first).classValue() == instances.instance(second).classValue()) {
          second = rand.nextInt(numInstances);
        }
        if (first > second) { // flip if out of order
          int tmp = first;
          first = second;
          second = tmp;
        }
        InstancePair pair = new InstancePair(first, second, InstancePair.CANNOT_LINK);
        // first == second can only get here when the class value is missing
        // (NaN != NaN); an instance must never be constrained against itself.
        if (first != second && !seen.contains(pair)) {
          seen.add(pair);
          pairs.add(pair);
          num++;
        }
      }
      System.out.println("Created " + numMustLinks + " must-links and " + numCannotLinks + " cannot-links.");
    } else { // just collect the requested number of instance pairs by sampling randomly
      if (size > totalPairs) {
        throw new IllegalArgumentException("Requested " + size
          + " pairs, but at most " + totalPairs + " can be created");
      }
      int num = 0;
      while (num < size) {
        int i = rand.nextInt(numInstances);
        int j = rand.nextInt(numInstances);
        int first = (i < j) ? i : j;
        int second = (i >= j) ? i : j;

        InstancePair pair;
        boolean accept;
        if (classIsString) {
          // for handling string valued class attributes corr. to
          // multi-class phylogenetic profiles
          pair = stringClassPair(first, second,
                                 instances.instance(first), instances.instance(second));
          accept = pair.linkType != InstancePair.DONT_CARE_LINK;
        } else {
          pair = new InstancePair(first, second, classValueLinkType(instances, first, second));
          accept = true;
        }
        if (first != second && accept && !seen.contains(pair)) {
          seen.add(pair);
          pairs.add(pair);
          num++;
        }
      }
    }
    return pairs;
  }

  /** Builds the pair for two instances with a string class attribute:
   *  must-link (cost = similarity) if the Jaccard similarity is positive,
   *  cannot-link (cost 1.0) if it is zero, don't-care (cost -1.0) else.
   */
  private static InstancePair stringClassPair(int first, int second,
                                              Instance firstInstance, Instance secondInstance) {
    double jaccardSim = jaccardSimilarityOfClassStrings(firstInstance, secondInstance);
    if (jaccardSim > 0) {
      return new InstancePair(first, second, InstancePair.MUST_LINK, jaccardSim);
    }
    if (jaccardSim == 0) {
      return new InstancePair(first, second, InstancePair.CANNOT_LINK, 1.0);
    }
    // jaccardSim < 0 => don't care link
    return new InstancePair(first, second, InstancePair.DONT_CARE_LINK, -1.0);
  }

  /** Returns MUST_LINK if both instances have the same class value, CANNOT_LINK else. */
  private static int classValueLinkType(Instances instances, int first, int second) {
    return (instances.instance(first).classValue() == instances.instance(second).classValue())
      ? InstancePair.MUST_LINK : InstancePair.CANNOT_LINK;
  }

  /** Computes the Jaccard similarity of the class strings of two instances,
   *  treating each string as a set of '_'-separated tokens. Repeated tokens
   *  count once, so the result is symmetric and never exceeds 1.
   *  @param a first instance
   *  @param b second instance
   *  @return |intersection| / |union| of the two token sets, or -1 if
   *  either string contains no tokens (to indicate DONT_CARE_LINK)
   */
  public static double jaccardSimilarityOfClassStrings(Instance a, Instance b) {
    String s1 = a.classAttribute().value((int) a.classValue());
    String s2 = b.classAttribute().value((int) b.classValue());

    HashSet set1 = new HashSet();
    StringTokenizer tokenizer = new StringTokenizer(s1, "_");
    while (tokenizer.hasMoreTokens()) {
      set1.add(tokenizer.nextToken());
    }

    HashSet set2 = new HashSet();
    int numCommonTokens = 0;
    tokenizer = new StringTokenizer(s2, "_");
    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      // add() is false for a repeated token, so each common token counts once
      if (set2.add(token) && set1.contains(token)) {
        numCommonTokens++;
      }
    }

    if (set1.isEmpty() || set2.isEmpty()) {
      return -1; // to indicate DONT_CARE_LINK
    }
    int unionSize = set1.size() + set2.size() - numCommonTokens;
    return ((double) numCommonTokens) / unionSize;
  }

  /** returns string representation of InstancePair, in the form
   *  [first,second,TYPE,cost] where TYPE is MUST, CANNOT or DONTCARE
   *  (and is omitted for any other linkType value)
   */
  public String toString() {
    StringBuffer buffer = new StringBuffer();
    buffer.append("[").append(first).append(",").append(second).append(",");
    if (linkType == MUST_LINK) {
      buffer.append("MUST,");
    } else if (linkType == CANNOT_LINK) {
      buffer.append("CANNOT,");
    } else if (linkType == DONT_CARE_LINK) {
      buffer.append("DONTCARE,");
    }
    buffer.append(cost).append("]");
    return buffer.toString();
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -