📄 gaincluster.java~
字号:
dadIndex = (int) Math.abs(R.nextInt() % numGenes); } Debug.dp(Debug.EVERYTHING, "Mother is: " + printArr((int[]) genes.elAt(mumIndex))); Debug.dp(Debug.EVERYTHING, "Father is: " + printArr((int []) genes.elAt(dadIndex))); SortedSet s = new SortedSet(); s.add((int [] ) genes.elAt(mumIndex)); s.add((int [] ) genes.elAt(dadIndex)); int numCentroids = minCent + (int) Math.floor(Math.pow(Math.random(), 1.0/clustBias)*(maxCent-minCent+1)); retval = s.randomSubset(numCentroids); } else { retval = (int[]) genes.elAt((int) Math.abs(R.nextInt() % numGenes)); } // Mutation stage int currentCent; for(int i=0; i < retval.length; i++){ if(R.nextFloat() < mutationRate){ currentCent = (int) (R.nextFloat()*numInstances); while(foundIn(currentCent, retval, retval.length)){ currentCent = (int) (R.nextFloat()*numInstances); } retval[i] = currentCent; } } Debug.dp(Debug.EVERYTHING, "Child is: " + printArr(retval)); return retval; } // Note: Sets the following class variables: // 1. Number of instances // 2. Number of streams // 3. Classes // 4. allHist // 5. allInfo. // 6. 
numClasses String printArr(int[] array){ StringBuffer retval = new StringBuffer("[ "); for(int i=0; i < array.length; i++){ retval.append(array[i] + " "); } retval.append("]"); return retval.toString(); } boolean createData(ClassStreamEventsVecI csevi){ StreamEventsVecI sevi = csevi.getStreamEventsVec(); ClassificationVecI cvi = csevi.getClassVec(); numClasses = cvi.getClassDescVec().size(); allHist = new ClassHistogram(numClasses); int numStreams = sevi.size(); numParams = edvi.elAt(pepIndex).numParams(); isDiscrete = new boolean[numParams]; EventDescI edi = edvi.elAt(pepIndex); for(int i=0; i < numParams; i++){ if(edi.getDataType(i).getName().equals("discrete")){ isDiscrete[i] = true; Debug.dp(Debug.EVERYTHING, "Param " + i + " is discrete."); } else { isDiscrete[i] = false; } } Debug.dp(Debug.EVERYTHING, "NumParams = " + numParams); // Assume there is at least one StreamEventI numInstances = 0; for(int i=0; i < numStreams; i++){ numInstances += sevi.elAt(i).getEvents(pepIndex).size(); } if(numInstances == 0){ return false; } classes = new int[numInstances]; points = new float[numInstances][numParams]; origEvents = new EventI[numInstances]; int currentPosition = 0; EventVecI events; int currSize; EventI currEvent; for(int i=0; i < numStreams; i++){ events = sevi.elAt(i).getEvents(pepIndex); currSize = events.size(); for(int j=0; j < currSize; j++){ currEvent = events.elAt(j); for(int k=0; k < numParams; k++){ // Loop to copy the values points[currentPosition][k] = currEvent.valOf(k); } // Now set up the class vector classes[currentPosition] = cvi.elAt(i).getRealClass(); allHist.inc(cvi.elAt(i).getRealClass()); origEvents[currentPosition] = currEvent; currentPosition++; } } Debug.dp(Debug.EVERYTHING, "Total instances: " + numInstances); Debug.dp(Debug.EVERYTHING, allHist.toString()); allInfo = allHist.info(); return true; } EventI[] getOrigEvents(){ return origEvents; } void findSDs(){ float[] sumx= new float[numParams]; float[] sumx2= new float[numParams]; avgs = 
new float[numParams]; sds = new float[numParams]; float currentVal; // First compute the sums ... for(int i=0; i < points.length; i++){ for(int j=0; j < numParams; j++){ currentVal = points[i][j]; sumx[j] += currentVal; sumx2[j] += currentVal*currentVal; } } // And now compute the averages and standard deviations. for(int i=0; i < numParams; i++){ // Debug.dp(Debug.EVERYTHING, "i = " + i); // Debug.dp(Debug.EVERYTHING, "sum = " + sumx[i]); // Debug.dp(Debug.EVERYTHING, "sum2 = " + sumx2[i]); if(!isDiscrete[i]){ avgs[i] = sumx[i]/numInstances; // Var(X) = E(X*X) - E(X)*E(X) // sd = sqrt(var) sds[i] = (float) Math.sqrt(sumx2[i]/numInstances-avgs[i]*avgs[i]); if(sds[i] == 0){ sds[i] = MIN_SD; } } else { sds[i] = 1; } } } void findClustSDs(int[] bestCentroids, int[] bestClustMem){ float[][] sumx= new float[bestCentroids.length][numParams]; float[][] sumx2= new float[bestCentroids.length][numParams]; // avgs = new float[bestCentroids.length][numParams]; float avg; clustSDs = new float[bestCentroids.length][numParams]; int[] numPerCluster = new int[bestCentroids.length]; float currentVal; // First compute the sums ... for(int i=0; i < points.length; i++){ for(int j=0; j < numParams; j++){ currentVal = points[i][j]; sumx[bestClustMem[i]][j] += currentVal; sumx2[bestClustMem[i]][j] += currentVal*currentVal; numPerCluster[bestClustMem[i]]++; } } // And now compute the averages and standard deviations. for(int j=0; j < bestCentroids.length; j++){ for(int i=0; i < numParams; i++){ // Debug.dp(Debug.EVERYTHING, "i = " + i); // Debug.dp(Debug.EVERYTHING, "sum = " + sumx[i]); // Debug.dp(Debug.EVERYTHING, "sum2 = " + sumx2[i]); if(!isDiscrete[i]){ avg = sumx[j][i]/numPerCluster[j]; // Var(X) = E(X*X) - E(X)*E(X) // sd = sqrt(var) clustSDs[j][i] = (float) Math.sqrt(sumx2[j][i]/numPerCluster[j]-avg*avg); if(clustSDs[j][i] == 0){ clustSDs[j][i] = MIN_SD; } } else { clustSDs[j][i] = 1; } } } } // This function generates a random selection of indexes // for centroids. 
// Note: no point can be repeated.
    //
    // The number of centroids is drawn between minCent and maxCent
    // inclusive, skewed by clustBias (the uniform random value is raised to
    // the power 1/clustBias), and capped at numInstances. Each centroid is
    // an index into points.
    int[] randomCentroids(){
        int numCentroids = minCent + (int) Math.floor(Math.pow(Math.random(), 1.0/clustBias)*(maxCent-minCent+1));
        // Perhaps choice of centroids based on number of
        // possibilities? There seems to be a bias to smaller cluster
        // sizes.
        // Let's see if we can come up with another way ... what if we take
        // the 3rd root?
        if(numCentroids > numInstances){
            // Only warn when the value really had to be reduced; asking for
            // exactly numInstances centroids is satisfiable as it stands.
            numCentroids = numInstances;
            Debug.dp(Debug.EMERGENCY, "WARNING: numCentroids was > numInstances ");
        }

        int[] retval = new int[numCentroids];
        for(int i=0; i < numCentroids; i++){
            // Redraw until the index differs from all centroids chosen so far.
            int currentCent = (int) Math.floor(Math.random()*numInstances);
            while(foundIn(currentCent, retval, i)){
                currentCent = (int) Math.floor(Math.random()*numInstances);
            }
            retval[i] = currentCent;
        }
        return retval;
    }

    // Linear search function. Sees if point is found in points, up to
    // (but not including) maximum index maxIndex.
    // Note: the points parameter is an index array and shadows the
    // class-level points data inside this method.
    boolean foundIn(int point, int[] points, int maxIndex){
        for(int i=0; i < maxIndex; i++){
            if(points[i] == point){
                return true;
            }
        }
        return false;
    }

    // Scores a particular group of centroids.
    //
    // Despite the name, the value returned depends on evalMetric:
    //   GAINRATIO - the gain ratio (gain / split info), or 0 when the split
    //               info is 0 because every instance fell into one cluster;
    //   CHISQUARE - the negated result of FastMath.lnChiSqProb() for the
    //               class-by-cluster contingency table;
    //   otherwise - 0.
    //
    // Side effects: rebuilds chs (one class histogram per centroid) and
    // replaces clusterMem with the nearest-centroid index of every instance.
    float gainRatio(int[] centroids){
        // For each centroid, there is a class histogram.
        chs = new ClassHistogram[centroids.length];
        for(int i=0; i < centroids.length; i++){
            chs[i] = new ClassHistogram(numClasses);
        }

        int[] thisClusterMem = new int[numInstances];
        for(int i=0; i < numInstances; i++){
            int nearest = nearestCentroid(i, centroids);
            thisClusterMem[i] = nearest;
            // Increment the entry in the histogram of the centroid closest
            // to the current instance.
            chs[nearest].inc(classes[i]);
        }
        clusterMem = thisClusterMem;

        // The part above is the same for all evaluation metrics. The next
        // part is specific.
        if(evalMetric == GAINRATIO){
            // First compute the gain. Quinlan P. 22
            // newinfo   =  sum{i=1..n} T_i/T*info(T_i)
            // splitinfo = -sum{i=1..n} T_i/T*log2(T_i/T)
            float newInfo = 0;
            float splitInfo = 0;
            for(int i=0; i < centroids.length; i++){
                float frac = ((float) chs[i].getCount())/numInstances;
                newInfo += frac*chs[i].info();
                if(frac != 0){
                    splitInfo -= frac*ClassHistogram.log2(frac);
                }
            }
            float gain = allInfo - newInfo;
            if(splitInfo == 0){
                // A single non-empty cluster is not a split at all; dividing
                // would hand back NaN as a fitness value.
                return 0;
            }
            // Gain ratio = gain/splitInfo
            return(gain/splitInfo);
        } else if(evalMetric == CHISQUARE){
            // Notes:
            // - There is one row for each class, so row totals come from
            //   allHist.getCount(class).
            // - There is one column for each cluster, so column totals come
            //   from chs[cluster].getCount().
            // - The total count of instances is numInstances.
            // The outer loop is by classes. The inner is by cluster.
            // (An earlier check that abandoned the test when an expected
            // cell count fell below 3 is disabled.)
            float chisquaretot = 0;
            int df = (numClasses-1)*(centroids.length-1);
            for(int i=0; i < numClasses; i++){
                for(int j=0; j < centroids.length; j++){
                    float expected = ((float) allHist.getCount(i)*chs[j].getCount())/numInstances;
                    float observed = chs[j].getCount(i);
                    // The zero check handles the case where no instances
                    // belonging to a class have events.
                    if(expected != 0){
                        chisquaretot += (observed-expected)*(observed-expected)/expected;
                    }
                }
            }
            double probability = FastMath.lnChiSqProb(chisquaretot, df);
            Debug.dp(Debug.EVERYTHING, "Chisquare tot = " + chisquaretot + " df = " + df + " logprob = " + (-probability));
            // To avoid precision issues. The smaller the number, the higher
            // the negative log.
            return((float) -probability);
        }
        // This should stop things from being broken outside.
        return 0;
    }

    // Returns the position within centroids (not the point index) of the
    // centroid closest to the given point according to distance().
    // Ties go to the earliest centroid. Returns -1 when no centroid
    // compares closer than Float.MAX_VALUE, e.g. when centroids is empty.
    int nearestCentroid(int point, int[] centroids){
        float smallestDist = Float.MAX_VALUE;
        int bestIndex = -1;
        for(int i=0; i < centroids.length; i++){
            float currentDistance = distance(point, centroids[i]);
            if(currentDistance < smallestDist){
                smallestDist = currentDistance;
                bestIndex = i;
            }
        }
        return bestIndex;
    }

    // Computes the Euclidean distance between two rows of points,
    // normalised by SD. Discrete parameters contribute 0 if the two values
    // are equal and 1 if they differ.
    float distance(int point1, int point2){
        float sumDist2 = 0;
        for(int i=0; i < numParams; i++){
            float rawDist;
            if(isDiscrete[i]){
                rawDist = points[point1][i] == points[point2][i] ? 0 : 1;
            } else {
                rawDist = (points[point1][i]-points[point2][i])/sds[i];
            }
            sumDist2 += rawDist*rawDist;
        }
        return (float) Math.sqrt(sumDist2);
    }

    // Builds a GClustVec holding one GClust per centroid. Each GClust is
    // created from the original event at that centroid's index and inserted
    // at the centroid's position.
    ClusterVecI makeClusters(int[] centroids){
        GClustVec gcv = new GClustVec(edvi.elAt(pepIndex), centroids.length, sds, pepIndex, clustSDs, distMetric, origEvents);
        for(int i=0; i < centroids.length; i++){
            GClust gc = new GClust(gcv, origEvents[centroids[i]], i);
            gcv.insert(gc, i);
        }
        return gcv;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -