📄 GainCluster.java
/**
 * An algorithm that does clustering based on the gain that is to be
 * received on using these particular centroids. It does a random
 * search.
 *
 * @author Waleed Kadous
 * @version $Id: GainCluster.java,v 1.3 2002/08/02 05:05:34 waleed Exp $
 * $Log: GainCluster.java,v $
 * Revision 1.3  2002/08/02 05:05:34  waleed
 * Hmmm ... fixed the sd's, but then forgot that the distance measure
 * itself has to change.
 *
 * Revision 1.2  2002/08/02 05:02:21  waleed
 * Added better handling of discrete parameters of metafeatures.
 * Standard deviation is no longer applied.
 *
 * Yuuuuk. This thing is getting really messy and ugly, but too much effort
 * has gone into it. Needs to be rewritten.
 */

package tclass.clusteralg;

import tclass.*;
import tclass.util.*;
import java.util.*;
// import weka.core.Statistics;

public class GainCluster implements ClusterAlgI {

    static final float MIN_SD = 1e-9f;

    // Search algorithms.
    static final int RANDOM = 1;
    static final int GENETIC = 2;

    // Evaluation metrics.
    static final int GAINRATIO = 1;
    static final int GAIN = 2;
    static final int CHISQUARE = 3;

    // Distance metrics.
    static final int DISTRATIO = 1;
    static final int DISTANCE = 2;

    String baseName = "directed";
    String description =
        "Implements clustering using a random search for high-gain centroids";

    EventDescVecI edvi = null;
    DomDesc domDesc = null;
    int numTrials = 1000;
    int clustBias = 1;
    int minCent = 2;
    int maxCent = 8;
    int pepIndex = 0;
    int searchAlg = RANDOM;
    int evalMetric = GAINRATIO;
    int distMetric = DISTRATIO;

    // These are required by the genetic algorithm.
    int numRounds = 10;
    float survivalRate = 0.05f;  // The ratio of cases that will survive to the next round.
    float crossoverRate = 0.4f;  // Percentage of instances that will be "crossed over".
    float mutationRate = 0.05f;  // Random rate of mutation.

    // These are used for calculations.
    float[][] points;
    int[] classes;               // The class of each instance, as an int.
    int[] clusters;              // The cluster of each instance, as an int.
    int numInstances;
    int numParams;
    int numClasses;
    boolean[] isDiscrete;
    float[] sds;
    float[][] clustSDs;          // The individual standard deviation of each cluster.
    float[] avgs;
    EventI[] origEvents;
    ClassHistogram allHist;      // The histogram of all classes.
    ClassHistogram[] chs;        // Made global to simplify debugging.
    int[] clusterMem;
    float allInfo;

    /**
     * Name of this clustering algorithm.
     */
    public String name(){
        return baseName;
    }

    /**
     * Clone the current object.
     */
    public Object clone() {
        try {
            return super.clone();
        }
        catch (CloneNotSupportedException e){
            // Can't happen, or so the Java programming book says.
            throw new InternalError(e.toString());
        }
    }

    public void setDomDesc(DomDesc dd){
        domDesc = dd;
    }

    /**
     * Set the description of the incoming Class Stream Events Vector.
     * Note that we need this first, for the parsing of values,
     * before we do any actual processing. I expect that the
     * ClusterAlgMgr should have a copy of it and passes it through in
     * the constructor, pretty much like the Domain description for the
     * GlobalExtrMgr.
     */
    public void setEventDescVec(EventDescVecI events){
        edvi = events;
    }

    /**
     * Provides a description of the clustering algorithm.
     * This description explains what the basic idea of the clustering
     * algorithm is (i.e. the sort of shapes it tries to find). It
     * should also explain any potential configuration options that
     * may be used to configure the object, using the configure
     * option.
     *
     * @return The description of this class.
     */
    public String description(){
        return description;
    }

    /**
     * Configures this instance so that parameter <i>p</i> has
     * value <i>v</i>.
     *
     * @param p the parameter to set.
     * @param v the value of the parameter.
     * @throws InvalidParameterException if the parameter or value is not recognised.
     */
    public void setParam(String p, String v) throws InvalidParameterException {
        if(p.equals("metafeature")){
            // They want to use a particular metafeature; try to find it.
            pepIndex = edvi.elIndex(v);
            if(pepIndex == -1){
                throw new InvalidParameterException(p, v, "Unknown metafeature " + v);
            }
        }
        else if(p.equals("numTrials")){
            try {
                numTrials = Integer.parseInt(v);
            }
            catch(NumberFormatException nfe){
                throw new InvalidParameterException(p, v, "Could not understand number of trials.");
            }
        }
        else if(p.equals("clustBias")){
            try {
                clustBias = Integer.parseInt(v);
            }
            catch(NumberFormatException nfe){
                throw new InvalidParameterException(p, v, "Could not understand bias exponent.");
            }
        }
        else if(p.equals("minCent")){
            try {
                minCent = Integer.parseInt(v);
            }
            catch(NumberFormatException nfe){
                throw new InvalidParameterException(p, v, "Could not understand minimum number of centroids.");
            }
        }
        else if(p.equals("maxCent")){
            try {
                maxCent = Integer.parseInt(v);
            }
            catch(NumberFormatException nfe){
                throw new InvalidParameterException(p, v, "Could not understand maximum number of centroids.");
            }
        }
        else if(p.equals("searchAlg")){
            if(v.equals("random")){
                searchAlg = RANDOM;
            }
            else if(v.equals("genetic")){
                searchAlg = GENETIC;
            }
            else {
                throw new InvalidParameterException(p, v, "Algorithm is random or genetic");
            }
        }
        else if(p.equals("evalMetric") || p.equals("dispMeasure")){
            // "dispMeasure" is kept as an alias; getParamList() advertises "evalMetric".
            if(v.equals("gainratio")){
                evalMetric = GAINRATIO;
            }
            else if(v.equals("gain")){
                evalMetric = GAIN;
            }
            else if(v.equals("chisquare")){
                evalMetric = CHISQUARE;
            }
            else {
                throw new InvalidParameterException(p, v, "Evaluation metric is gainratio, gain or chisquare");
            }
        }
        else if(p.equals("distMetric")){
            if(v.equals("distratio")){
                distMetric = DISTRATIO;
            }
            else if(v.equals("distance")){
                distMetric = DISTANCE;
            }
            else {
                throw new InvalidParameterException(p, v, "Distance is distratio or distance");
            }
        }
        else {
            throw new InvalidParameterException(p, v, "Unknown parameter " + p);
        }
    }

    /**
     * Describes any parameters used by this clustering algorithm,
     * to suit a particular domain.
     *
     * @return A vector of parameters.
     */
    public ParamVec getParamList(){
        ParamVec pv = new ParamVec();
        pv.add(new Param("metafeature", "Name of metafeature to operate on", "First metafeature"));
        pv.add(new Param("numTrials", "Number of trials to run", "1000"));
        pv.add(new Param("minCent", "Minimum number of centroids", "2"));
        pv.add(new Param("maxCent", "Maximum number of centroids", "8"));
        pv.add(new Param("clustBias", "Bias exponent towards more clusters", "1"));
        pv.add(new Param("searchAlg", "Search algorithm: random or genetic", "random"));
        pv.add(new Param("evalMetric", "Evaluation metric: gainratio, gain or chisquare", "gainratio"));
        pv.add(new Param("distMetric", "Distance metric: distratio or distance", "distratio"));
        return pv;
    }

    public ClusterVecI cluster(ClassStreamEventsVecI csvi){
        // int oldDebug = Debug.getDebugLevel();
        // Debug.setDebugLevel(Debug.EVERYTHING);
        // First, flatten the data and get it into a nice form.
        Debug.dp(Debug.EVERYTHING, "");
        if(!createData(csvi)){
            // Uh oh ... no instances ;-(. Return an empty ClusterVec.
            return new ClusterVec();
        }
        findSDs();
        Debug.dp(Debug.EVERYTHING, "Preprocessing complete");
        if(allInfo == 0){
            return new ClusterVec();
        }
        // Then, normalise the data.
        // Then pick some random points.
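        // Orientation note (added comment, summarising only what the loops below do):
        // the method searches for the centroid set with the best score, either by
        // pure random search over numTrials candidate sets, or by a simple genetic
        // search that runs numRounds generations, keeping roughly the top
        // survivalRate fraction of candidates as parents for breed().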
        int[] bestCentroids = new int[0];
        float bestGR = -Float.MAX_VALUE;
        int[] bestClusterMem = new int[0];
        int[] currentCentroids;
        float currentGR;
        ClassHistogram[] bestchs = new ClassHistogram[0];
        System.out.println("Search Alg = " + searchAlg);
        if(searchAlg == RANDOM){
            for(int i=0; i < numTrials; i++){
                currentCentroids = randomCentroids();
                currentGR = gainRatio(currentCentroids);
                Debug.dp(Debug.EVERYTHING, "Test: " + i + " Current: " + currentGR
                         + " Best: " + bestGR);
                if(currentGR > bestGR){
                    bestCentroids = currentCentroids;
                    bestGR = currentGR;
                    bestchs = chs;
                    bestClusterMem = clusterMem;
                }
            }
        }
        else {
            // Ok, this is the new thing.
            TopVector parents = new TopVector((int) (survivalRate*numTrials));
            TopVector children = new TopVector((int) (survivalRate*numTrials));
            int countSoFar = 0;
            // Start out with a normal run; pretty much the same as
            // the random search, except we insert results into the TopVector.
            for(int i=0; i < numTrials; i++){
                currentCentroids = randomCentroids();
                currentGR = gainRatio(currentCentroids);
                parents.add(currentCentroids, currentGR);
                if(currentGR > bestGR){
                    bestCentroids = currentCentroids;
                    bestGR = currentGR;
                    bestchs = chs;
                    bestClusterMem = clusterMem;
                }
                Debug.dp(Debug.EVERYTHING, "GTest: " + i + " Current: " + currentGR
                         + " Best: " + bestGR);
                countSoFar++;
            }
            for(int i=0; i < (numRounds-1); i++){
                children = new TopVector((int) (survivalRate*numTrials));
                for(int j=0; j < numTrials; j++){
                    currentCentroids = breed(parents); // Breeds a bunch of centroids.
                    currentGR = gainRatio(currentCentroids);
                    children.add(currentCentroids, currentGR);
                    if(currentGR > bestGR){
                        bestCentroids = currentCentroids;
                        bestGR = currentGR;
                        bestchs = chs;
                        bestClusterMem = clusterMem;
                    }
                    Debug.dp(Debug.EVERYTHING, "GTest: " + countSoFar + " Current: " + currentGR
                             + " Best: " + bestGR);
                    countSoFar++;
                }
                parents = children;
            }
        }
        if(Debug.getDebugLevel() >= Debug.PROGRESS){
            System.err.println("The BEST division (gr = " + bestGR + ", n = "
                               + bestCentroids.length + ") is: ");
            for(int i=0; i < bestCentroids.length; i++){
                System.err.println(bestchs[i]);
            }
        }
        findClustSDs(bestCentroids, bestClusterMem);
        ClusterVecI cvi = makeClusters(bestCentroids);
        // Debug.setDebugLevel(oldDebug);
        return cvi;
    }

    public int[] breed(TopVector genes){
        Debug.dp(Debug.EVERYTHING, "Breeding ...");
        int numGenes = genes.size();
        int[] retval;
        Random R = new Random(); // Why use Random here? Because it didn't look random.
        // Crossover stage.
        if(R.nextFloat() < crossoverRate){
            // Check if there is a crossover;
            // make a set out of the instances.
            int mumIndex = (int) Math.abs(R.nextInt() % numGenes);
            int dadIndex = (int) Math.abs(R.nextInt() % numGenes);
            while(dadIndex == mumIndex){
                // [listing truncated here in the source]
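A minimal usage sketch (not part of the file above). It assumes the surrounding TClass pipeline already provides a DomDesc, an EventDescVecI and a ClassStreamEventsVecI; only the GainCluster calls shown here (setDomDesc, setEventDescVec, setParam, cluster) appear in the listing, and the parameter values are illustrative.

// Usage sketch only -- dd, edv and data are assumed to come from the
// rest of the TClass pipeline; they are not constructed here.
import tclass.*;
import tclass.clusteralg.GainCluster;

public class GainClusterExample {
    public static ClusterVecI run(DomDesc dd, EventDescVecI edv,
                                  ClassStreamEventsVecI data)
            throws InvalidParameterException {
        GainCluster gc = new GainCluster();
        gc.setDomDesc(dd);            // domain description, used when parsing values
        gc.setEventDescVec(edv);      // must be set before setParam("metafeature", ...)
        gc.setParam("numTrials", "500");      // fewer random trials than the default 1000
        gc.setParam("searchAlg", "genetic");  // or "random"
        gc.setParam("minCent", "2");
        gc.setParam("maxCent", "6");
        gc.setParam("evalMetric", "gainratio");
        return gc.cluster(data);      // returns an empty ClusterVec if there is nothing to cluster
    }
}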