📄 kmeans.java
字号:
* @param v the value of the parameter. * @return true if the operation succeeded. * */ public void setParam(String p, String v) throws InvalidParameterException { // Let's just use a simple parameter as an example. if(p.equals("metafeature")){ // So they want to use a particular metafeature. //Try to find the metafeature. pepIndex = edvi.elIndex(v); if(pepIndex == -1){ throw new InvalidParameterException(p, v, "Unknown metafeature " + v); } } else if(p.equals("numclusters")){ if(v.equals("auto")){ autoNumClusters = true; } else { try { numClusters = Integer.parseInt(v); } catch(NumberFormatException nfe){ throw new InvalidParameterException(p, v, "Could not understand number of exceptions."); } } } else if(p.equals("ignoreclass")){ if(v.equals("true")){ ignoreClasses = true; } else if(v.equals("false")){ ignoreClasses = false; } else { throw new InvalidParameterException(p, v, "Must be either true or false"); } } else if(p.equals("useonly")){ classToCluster = domDesc.getClassDescVec().getId(v); if(classToCluster == -1){ throw new InvalidParameterException(p, v, "Unknown class " + v); } } else if(p.equals("initialdist")){ if(v.equals("random")){ orderedAllocate = false; } else if(v.equals("ordered")){ orderedAllocate = true; } else { throw new InvalidParameterException(p, v, "Must be either random or ordered"); } } else if(p.equals("closeness")){ if(v.equals("centroid")){ useCentroid = true; } else if(v.equals("closest")){ useCentroid = false; } else { throw new InvalidParameterException(p, v, "Must be either centroid or closest"); } } else { throw new InvalidParameterException(p, v, "Unknown parameter "+p); } } /** * * Describes any parameters used by this global extractor, * to suit a particular domain. * * @return A vector of parameters. */ public ParamVec getParamList(){ ParamVec pv = new ParamVec(); pv.add(new Param("metafeature", "The name of the metafeature to apply this to", "First")); pv.add(new Param("numclusters", "Number of clusters. Possible values: auto or a number", "auto")); pv.add(new Param("ignoreclass", "Completely ignore class information. Can be true or false", "false")); pv.add(new Param("useonly", "Only cluster instances of one class. Parameter is the class to cluster.", "First")); pv.add(new Param("initialdist", "Can be either: random, (random cluster distribution), ordered (distribute by order in sequence", "ordered")); pv.add(new Param("closeness", "Either centroid or closest", "centroid")); return pv; } public ClusterVecI cluster(ClassStreamEventsVecI csevi){ // Ok. First let's pull out our data. EventVecI[] data = pullData(csevi); //Now we need to perform an initial clustering. //Create the clusters KMCluster[] clusters = new KMCluster[numClusters]; EventDescI edi = edvi.elAt(pepIndex); float[] sds = findSD(data, edi); for(int i=0; i < clusters.length; i++){ clusters[i] = new KMCluster(edi, pepIndex, useCentroid, sds); } // Now do an initial distribution. distribute(data, clusters); //Now the main loop int numLoops = 0 ; while(redistribute(clusters) && numLoops < 40){ Debug.dp(Debug.PROGRESS, "Redistribution " + numLoops); numLoops++; } //Redistribute returns true while ever there is a change in //cluster membership. return(new KMClusterVec(clusters)); } float[] findSD(EventVecI[] evi, EventDescI edi){ int numParams = edi.numParams(); float[] retval = new float[numParams]; float[] avg = new float[numParams]; // Go through all the data. for(int i=0; i < numParams; i++){ float sumx=0; float sumx2=0; int count = 0; for(int j=0; j < evi.length; j++){ for(int k=0; k < evi[j].size(); k++){ float currentVal = evi[j].elAt(k).valOf(i); sumx += currentVal; sumx2 += currentVal*currentVal; count++; } } // Var(X) = E(X^2)-E(X)^2 float e_x2 = sumx2/count; avg[i] = sumx/count; float ex_2 = avg[i]*avg[i]; float varx = e_x2-ex_2; retval[i] = (float) Math.sqrt(varx); if(retval[i] == 0){ retval[i] = (float) 0.0001; } } Debug.dp(Debug.EVERYTHING, "SD's are: "); for(int i=0; i < retval.length; i++){ Debug.dp(Debug.EVERYTHING, edi.paramName(i) + " avg = " + avg[i] + " sd = " + retval[i]); } return retval; }// From here on in, internal methods. boolean redistribute(KMCluster[] clusters){ // int oldDL = Debug.getDebugLevel(); // Debug.setDebugLevel(Debug.EVERYTHING); Debug.dp(Debug.FN_CALLS, "Redistribute called ..."); boolean hasChanged = false; int numClusters = clusters.length; if(useCentroid){ for(int i=0; i < numClusters; i++){ clusters[i].computeMean(); } } for(int i=0; i < numClusters; i++){ int numEvents = clusters[i].size(); for(int j=0; j < numEvents; j++){ int oldCluster = i; int newCluster = -1; EventI currentEvent = clusters[i].elAt(j); if(useCentroid){ newCluster = closestCentroid(clusters, currentEvent); } else { // So we use closestPoint instead. newCluster = closestPoint(clusters, currentEvent); } if(newCluster != oldCluster){ Debug.dp(Debug.FN_PARAMS, "old = " + oldCluster + " new = " + newCluster); hasChanged = true; clusters[oldCluster].removeEvent(currentEvent); clusters[newCluster].addEvent(currentEvent); numEvents--; j--; } } } // Debug.setDebugLevel(oldDL); return hasChanged; } int closestCentroid(KMCluster[] clusters, EventI ev){ float minDistance = Float.MAX_VALUE; int minCluster = 0; for(int i=0; i < clusters.length; i++){ float dist = clusters[i].distFromCentroid(ev); if(dist <= minDistance){ minDistance = dist; minCluster = i; } } return minCluster; } int closestPoint(KMCluster[] clusters, EventI ev){ float minDistance = Float.MAX_VALUE; int minCluster = 0; for(int i=0; i < clusters.length; i++){ float dist = clusters[i].findClosest(ev); if(dist <= minDistance){ minDistance = dist; minCluster = i; } } return minCluster; } void distribute(EventVecI[] data, KMCluster[] clusters){ // Now let's check. if(orderedAllocate){ // So now we have to allocate on the assumption of locality. int numClusters = clusters.length; for(int i=0; i < data.length; i++){ EventVecI currentEvents = data[i]; // Now we have to allocate these instances to the clusters. // Algorithm is as follows: // Say we have 3 events and 5 clusters. // Then we want to put 0->0 1->2 and 2->4 int numEvents = currentEvents.size(); for(int j=0; j < currentEvents.size(); j++){ int clusterPos = j*numClusters/numEvents; clusters[clusterPos].addEvent(currentEvents.elAt(j)); } } } else { //randomAllocate int numClusters = clusters.length; for(int i=0; i < data.length; i++){ EventVecI currentEvents = data[i]; for(int j=0; j < currentEvents.size(); j++){ int randomCluster = (int) (Math.random()*numClusters); clusters[randomCluster].addEvent(currentEvents.elAt(j)); } } } } // Pulls the data based on the variables for this object being set. EventVecI[] pullData(ClassStreamEventsVecI csevi){ if(ignoreClasses){ int numStreams = csevi.size(); EventVecI[] retval = new EventVecI[numStreams]; StreamEventsVecI sevi = csevi.getStreamEventsVec(); ClassificationVecI cvi = csevi.getClassVec(); int totalEvents = 0; for(int i=0; i < numStreams; i++){ EventVecI events = sevi.elAt(i).getEvents(pepIndex); totalEvents += events.size(); retval[i] = events; } if(autoNumClusters){ numClusters = (int) Math.ceil(((float) totalEvents)/numStreams); } return retval; } else { // The class we want to cluster on is StreamEventsVecI sevi = csevi.getStreamEventsVec(); ClassificationVecI cvi = csevi.getClassVec(); int numStreams = csevi.size(); Vector evs = new Vector(); int totalEvents = 0; int streamCount = 0; for(int i=0; i < numStreams; i++){ if(cvi.elAt(i).getRealClass() == classToCluster){ EventVecI events = sevi.elAt(i).getEvents(pepIndex); totalEvents += events.size(); evs.addElement(events); streamCount++; } } if(autoNumClusters){ numClusters = (int) Math.ceil(((float) totalEvents)/streamCount); } EventVecI[] retval = new EventVecI[evs.size()]; for(int i=0; i < retval.length; i++){ retval[i] = (EventVecI) evs.elementAt(i); } return retval; } } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -