📄 gaincluster.java~

📁 用于multivariate时间序列分类
💻 JAVA~
📖 第 1 页 / 共 2 页
字号:
上一页 12
                dadIndex = (int) Math.abs(R.nextInt() % numGenes);             }            Debug.dp(Debug.EVERYTHING, "Mother is: " + printArr((int[]) genes.elAt(mumIndex)));             Debug.dp(Debug.EVERYTHING, "Father is: " + printArr((int []) genes.elAt(dadIndex)));             SortedSet s = new SortedSet();              s.add((int [] ) genes.elAt(mumIndex));             s.add((int [] ) genes.elAt(dadIndex));             int numCentroids = minCent + (int) Math.floor(Math.pow(Math.random(), 1.0/clustBias)*(maxCent-minCent+1));             retval =  s.randomSubset(numCentroids);         }        else {            retval = (int[]) genes.elAt((int) Math.abs(R.nextInt() % numGenes));         }        // Mutation stage        int currentCent;         for(int i=0; i < retval.length; i++){            if(R.nextFloat() < mutationRate){                currentCent = (int) (R.nextFloat()*numInstances);                while(foundIn(currentCent, retval, retval.length)){                    currentCent = (int) (R.nextFloat()*numInstances);                }                retval[i] = currentCent;             }        }        Debug.dp(Debug.EVERYTHING, "Child is: " + printArr(retval));         return retval;     }        // Note: Sets the following class variables:     // 1. Number of instances    // 2. Number of streams    // 3. Classes    // 4. allHist    // 5. allInfo.     // 6. numClasses    String printArr(int[] array){        StringBuffer retval = new StringBuffer("[ ");         for(int i=0; i < array.length; i++){            retval.append(array[i] + " ");         }        retval.append("]");         return retval.toString();     }    boolean createData(ClassStreamEventsVecI csevi){        StreamEventsVecI sevi = csevi.getStreamEventsVec();         ClassificationVecI cvi = csevi.getClassVec();        numClasses = cvi.getClassDescVec().size();         allHist = new ClassHistogram(numClasses);         int numStreams = sevi.size();         numParams = edvi.elAt(pepIndex).numParams();         isDiscrete = new boolean[numParams]; 	EventDescI edi = edvi.elAt(pepIndex); 	for(int i=0; i < numParams; i++){		if(edi.getDataType(i).getName().equals("discrete")){			isDiscrete[i] = true; 		        Debug.dp(Debug.EVERYTHING, "Param " + i + " is discrete."); 		}		else {			isDiscrete[i] = false; 		}			}        Debug.dp(Debug.EVERYTHING, "NumParams = " + numParams);         // Assume there is at least one StreamEventI        numInstances = 0;         for(int i=0; i < numStreams; i++){            numInstances += sevi.elAt(i).getEvents(pepIndex).size();         }        if(numInstances == 0){            return false;         }        classes = new int[numInstances]; 	        points = new float[numInstances][numParams];         origEvents = new EventI[numInstances];         int currentPosition = 0;         EventVecI events;         int currSize;         EventI currEvent;         for(int i=0; i < numStreams; i++){            events = sevi.elAt(i).getEvents(pepIndex);             currSize = events.size();             for(int j=0; j < currSize; j++){                currEvent = events.elAt(j);                 for(int k=0; k < numParams; k++){ // Loop to copy the values                    points[currentPosition][k] = currEvent.valOf(k);                 }                // Now set up the class vector                classes[currentPosition] = cvi.elAt(i).getRealClass();                 allHist.inc(cvi.elAt(i).getRealClass());                 origEvents[currentPosition] = currEvent;                 currentPosition++;             }        }        Debug.dp(Debug.EVERYTHING, "Total instances: " + numInstances);         Debug.dp(Debug.EVERYTHING, allHist.toString());         allInfo = allHist.info();         return true;     }    EventI[] getOrigEvents(){        return origEvents;     }    void findSDs(){        float[] sumx= new float[numParams];         float[] sumx2= new float[numParams];        avgs = new float[numParams];         sds = new float[numParams];                 float currentVal;         // First compute the sums ...        for(int i=0; i < points.length; i++){						for(int j=0; j < numParams; j++){								currentVal = points[i][j];								sumx[j] += currentVal; 								sumx2[j] += currentVal*currentVal; 						}        }        // And now compute the averages and standard deviations.         for(int i=0; i < numParams; i++){            // Debug.dp(Debug.EVERYTHING, "i = " + i);             // Debug.dp(Debug.EVERYTHING, "sum = " + sumx[i]);             // Debug.dp(Debug.EVERYTHING, "sum2 = " + sumx2[i]); 						if(!isDiscrete[i]){     								avgs[i] = sumx[i]/numInstances; 								// Var(X) = E(X*X) - E(X)*E(X)								// sd = sqrt(var)								sds[i] = (float) Math.sqrt(sumx2[i]/numInstances-avgs[i]*avgs[i]); 								if(sds[i] == 0){										sds[i] = MIN_SD; 								}						}						else {								sds[i] = 1; 						}        }    }    void findClustSDs(int[] bestCentroids, int[] bestClustMem){         	float[][] sumx= new float[bestCentroids.length][numParams]; 	float[][] sumx2= new float[bestCentroids.length][numParams];	// avgs = new float[bestCentroids.length][numParams]; 	float avg; 	clustSDs = new float[bestCentroids.length][numParams]; 	int[] numPerCluster = new int[bestCentroids.length]; 	float currentVal; 	// First compute the sums ...	for(int i=0; i < points.length; i++){	    for(int j=0; j < numParams; j++){                currentVal = points[i][j];                sumx[bestClustMem[i]][j] += currentVal;                 sumx2[bestClustMem[i]][j] += currentVal*currentVal;                 numPerCluster[bestClustMem[i]]++;             }        }        // And now compute the averages and standard deviations. 	for(int j=0; j < bestCentroids.length; j++){	    for(int i=0; i < numParams; i++){		// Debug.dp(Debug.EVERYTHING, "i = " + i); 		// Debug.dp(Debug.EVERYTHING, "sum = " + sumx[i]); 		// Debug.dp(Debug.EVERYTHING, "sum2 = " + sumx2[i]); 					if(!isDiscrete[i]){							avg = sumx[j][i]/numPerCluster[j]; 							// Var(X) = E(X*X) - E(X)*E(X)							// sd = sqrt(var)							clustSDs[j][i] = (float) Math.sqrt(sumx2[j][i]/numPerCluster[j]-avg*avg); 							if(clustSDs[j][i] == 0){									clustSDs[j][i] = MIN_SD; 							}					}					else {							clustSDs[j][i] = 1; 					}	    }	}    }    // This function generates a random selection of indexes    // for centroids. Note: no point can be repeated.     int[] randomCentroids(){        int numCentroids = minCent + (int) Math.floor(Math.pow(Math.random(), 1.0/clustBias)*(maxCent-minCent+1));         // Perhaps choice of centroids based on number of        // possibilities? There seems to be a bias to smaller cluster        // sizes.        // Let's see if we can come up with another way ... what if we take the 3rd        // root?                 // Debug.dp(Debug.EVERYTHING, "number of centroids = " + numCentroids);         if(numCentroids >= numInstances){            numCentroids = numInstances;             Debug.dp(Debug.EMERGENCY, "WARNING: numCentroids was > numInstances ");         }        // Selects a random number between minCent and maxCent inclusive.         int[] retval = new int[numCentroids];         int currentCent;         for(int i=0; i < numCentroids; i++){            currentCent = (int) Math.floor(Math.random()*numInstances) ;            while(foundIn(currentCent, retval, i)){                currentCent = (int) Math.floor(Math.random()*numInstances);             }            retval[i] = currentCent;         }        return retval;     }    // Linear search function. Sees if point is found in points, up to    // (but not including) maximum index maxIndex.     boolean foundIn(int point, int[] points, int maxIndex){        for(int i=0; i < maxIndex; i++){            if(points[i] == point){                return true;             }        }        return false;     }        // Computes the gain ratio of a particular group of centroids.     // This is now in the absurd, as it will also, depending on the setting of the evalMetric    // calculate the chi'squared or the gain.     float gainRatio(int[] centroids){        chs = new ClassHistogram[centroids.length];         // For each centroid, there is a class histogram.         int nearest;         float newInfo = 0;         float splitInfo = 0;         float gain = 0;         float frac = 0;         int[] thisClusterMem  = new int[numInstances];                 for(int i=0; i < centroids.length; i++){            chs[i] = new ClassHistogram(numClasses);         }        for(int i=0; i < numInstances; i++){            nearest = nearestCentroid(i, centroids);             thisClusterMem[i] = nearest;             chs[nearest].inc(classes[i]);             // Increment the entry in the histogram of the centroid closest             // to the current instance        }        clusterMem = thisClusterMem; 	// The part above is the same for all evaluation metrics. The next         // part is specific.         if(evalMetric == GAINRATIO){            // First compute the gain. Quinlan P. 22            // newinfo = sum{i=1..n} T_i/T*info(T_i)            // splitinfo -sum{i=1..n} T_i/T*log2(T_i/T)            for(int i=0; i < centroids.length; i++){                frac = ((float) chs[i].getCount())/numInstances;                 newInfo += frac*chs[i].info();                 if(frac!= 0){                    splitInfo -= frac*ClassHistogram.log2(frac);                 }            }            gain = allInfo - newInfo;             // Gain ratio = gain/splitInfo	    return(gain/splitInfo);         }        else if(evalMetric == CHISQUARE){            // Alright ... here we go.             // Notes:             // - There is one row for each class.             // - Thus we can retrieve row totals from allHist.             // - There is one column for each class.             // - Thus we can retrieve the column totals using getCount            // - The total count of instances is numInstances. .             // The outer loop is by classes. The inner is by cluster.             float chisquaretot=0;            int df;             df = (numClasses-1)*(centroids.length-1);             for(int i=0; i < numClasses; i++){                for(int j=0; j < centroids.length; j++){                    float expected = ((float) allHist.getCount(i)*chs[j].getCount())/numInstances;                     //if((expected) < 3.0 && (numInstances > 2*numClasses)){                    //    Debug.dp(Debug.EMERGENCY, "WARNING: Not enough examples in test! Abandoning this test. Expected = " + expected + " for " + allHist.getCount(i) + " and " +  chs[j].getCount() );                     //   return(0);                     //}                    float observed = chs[j].getCount(i);                     float tmp;                     //Below needed to handle case where no instances belonging to a                    // class have events.                     if(expected != 0){                        tmp = (observed-expected)*(observed-expected)/expected;                     }                    else {                        tmp = 0;                     }                    chisquaretot += tmp;                 }            }            double probability = FastMath.lnChiSqProb(chisquaretot, df);             Debug.dp(Debug.EVERYTHING, "Chisquare tot = " + chisquaretot + " df = " + df + " logprob = " + (-probability));             // To avoid precision issues. The smaller the number, the higher the negative            // log.             return((float) -probability); // This should stop things from being broken outside.         }        return 0;     }    int nearestCentroid(int point, int[] centroids){        float smallestDist = Float.MAX_VALUE;         int bestIndex = -1;         float currentDistance;         for(int i=0; i < centroids.length; i++){            currentDistance= distance(point, centroids[i]);             if(currentDistance < smallestDist){                smallestDist = currentDistance;                 bestIndex = i;             }        }        return bestIndex;     }        // Computes the Euclidean distance, normalised by SD.     float distance(int point1, int point2){        float sumDist2=0;         //         Debug.dp(Debug.EMERGENCY, "p1 = " + point1 + " p2 = " + point2);         for(int i=0; i < numParams; i++){	    float rawDist; 	    if(!isDiscrete[i]){		rawDist = (points[point1][i]-points[point2][i])/sds[i]; 	    }	    else {		// Use a distance measure of "0" if equal, 		// 1 if different. 		rawDist = points[point1][i] == points[point2][i] ? 0: 1; 	    }            sumDist2 += rawDist*rawDist;         }        float retval = (float) Math.sqrt(sumDist2);        return retval;     }    ClusterVecI makeClusters(int[] centroids){        GClustVec gcv = new GClustVec(edvi.elAt(pepIndex), centroids.length, sds, pepIndex, clustSDs, distMetric, origEvents);        for(int i=0; i < centroids.length; i++){            GClust gc = new GClust(gcv, origEvents[centroids[i]], i);             gcv.insert(gc,i);         }        return gcv;     }   }
上一页 12
💿 文件大小 2441 K
👤 上传用户 laoniu
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#multivariate #时间序列 #分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -