📄 bftree.java
      }

      for (int j=0; j<nonEmpty; j++) {
        double distSum = Utils.sum(valDist[j]);
        if (distSum==0) pClass0[j] = 0;
        else pClass0[j] = valDist[j][0]/distSum;
      }

      // sort categories according to the probability of class 0
      String[] sortedValues = new String[nonEmpty];
      for (int j=0; j<nonEmpty; j++) {
        sortedValues[j] = nonEmptyValues[Utils.minIndex(pClass0)];
        pClass0[Utils.minIndex(pClass0)] = Double.MAX_VALUE;
      }

      // Find the subset of attribute values that maximizes the impurity decrease,
      // considering only attribute values whose class frequency is not 0
      String tempStr = "";
      for (int j=0; j<nonEmpty-1; j++) {
        currDist = new double[2][numClasses];
        if (tempStr.equals("")) tempStr = "(" + sortedValues[j] + ")";
        else tempStr += "|" + "(" + sortedValues[j] + ")";
        //System.out.println(sortedValues[j]);
        for (int i=0; i<sortedIndices.length; i++) {
          Instance inst = data.instance(sortedIndices[i]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[i];
          } else currDist[1][(int)inst.classValue()] += weights[i];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int kk=0; kk<2; kk++) {
          tempDist[kk] = currDist[kk];
        }

        double[] tempProps = new double[2];
        for (int kk=0; kk<2; kk++) {
          tempProps[kk] = Utils.sum(tempDist[kk]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int mstart = missingStart;
        while (mstart < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[mstart]);
          for (int jj = 0; jj < 2; jj++) {
            tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
          }
          mstart++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int jj = 0; jj < 2; jj++) {
            System.arraycopy(tempDist[jj], 0, dist[jj], 0, dist[jj].length);
          }
        }
      }
    }

    // multi-class problems (exhaustive search over all binary partitions)
    else if (!useHeuristic || nonEmpty<=4) {
      //else if (!useHeuristic || nonEmpty==2) {
      // First, consider only attribute values whose class frequency is not zero
      for (int i=0; i<(int)Math.pow(2, nonEmpty-1); i++) {
        String tempStr = "";
        currDist = new double[2][numClasses];
        int mod;
        int bit10 = i;
        for (int j=nonEmpty-1; j>=0; j--) {
          mod = bit10 % 2;   // convert from decimal to binary: bit j selects value j
          if (mod==1) {
            if (tempStr.equals("")) tempStr = "(" + nonEmptyValues[j] + ")";
            else tempStr += "|" + "(" + nonEmptyValues[j] + ")";
          }
          bit10 = bit10 / 2;
        }

        for (int j=0; j<sortedIndices.length; j++) {
          Instance inst = data.instance(sortedIndices[j]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[j];
          } else currDist[1][(int)inst.classValue()] += weights[j];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int k=0; k<2; k++) {
          tempDist[k] = currDist[k];
        }

        double[] tempProps = new double[2];
        for (int k=0; k<2; k++) {
          tempProps[k] = Utils.sum(tempDist[k]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int index = missingStart;
        while (index < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[index]);
          for (int j = 0; j < 2; j++) {
            tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
          }
          index++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int j = 0; j < 2; j++) {
            //dist[jj] = new double[currDist[jj].length];
            System.arraycopy(tempDist[j], 0, dist[j], 0, dist[j].length);
          }
        }
      }
    }

    // heuristic method for multi-class problems
    else {
      // First, consider only attribute values whose class frequency is not zero
      int n = nonEmpty;
      int k = data.numClasses();                // number of classes of the data
      double[][] P = new double[n][k];          // class probability matrix
      int[] numInstancesValue = new int[n];     // number of instances for an attribute value
      double[] meanClass = new double[k];       // vector of mean class probability
      int numInstances = data.numInstances();   // total number of instances

      // initialize the vector of mean class probability
      for (int j=0; j<meanClass.length; j++) meanClass[j] = 0;

      for (int j=0; j<numInstances; j++) {
        Instance inst = (Instance)data.instance(j);
        int valueIndex = 0;                     // attribute value index in nonEmptyValues
        for (int i=0; i<nonEmpty; i++) {
          if (att.value((int)inst.value(att)).compareToIgnoreCase(nonEmptyValues[i])==0) {
            valueIndex = i;
            break;
          }
        }
        P[valueIndex][(int)inst.classValue()]++;
        numInstancesValue[valueIndex]++;
        meanClass[(int)inst.classValue()]++;
      }

      // calculate the class probability matrix
      for (int i=0; i<P.length; i++) {
        for (int j=0; j<P[0].length; j++) {
          if (numInstancesValue[i]==0) P[i][j] = 0;
          else P[i][j] /= numInstancesValue[i];
        }
      }

      // calculate the vector of mean class probability
      for (int i=0; i<meanClass.length; i++) {
        meanClass[i] /= numInstances;
      }

      // calculate the covariance matrix
      double[][] covariance = new double[k][k];
      for (int i1=0; i1<k; i1++) {
        for (int i2=0; i2<k; i2++) {
          double element = 0;
          for (int j=0; j<n; j++) {
            element += (P[j][i2]-meanClass[i2])*(P[j][i1]-meanClass[i1])
              *numInstancesValue[j];
          }
          covariance[i1][i2] = element;
        }
      }

      Matrix matrix = new Matrix(covariance);
      weka.core.matrix.EigenvalueDecomposition eigen =
        new weka.core.matrix.EigenvalueDecomposition(matrix);
      double[] eigenValues = eigen.getRealEigenvalues();

      // find the index of the largest eigenvalue
      int index = 0;
      double largest = eigenValues[0];
      for (int i=1; i<eigenValues.length; i++) {
        if (eigenValues[i]>largest) {
          index = i;
          largest = eigenValues[i];
        }
      }

      // calculate the first principal component
      double[] FPC = new double[k];
      Matrix eigenVector = eigen.getV();
      double[][] vectorArray = eigenVector.getArray();
      for (int i=0; i<FPC.length; i++) {
        FPC[i] = vectorArray[i][index];
      }

      // calculate the first principal component scores
      double[] Sa = new double[n];
      for (int i=0; i<Sa.length; i++) {
        Sa[i] = 0;
        for (int j=0; j<k; j++) {
          Sa[i] += FPC[j]*P[i][j];
        }
      }

      // sort categories according to Sa(s)
      double[] pCopy = new double[n];
      System.arraycopy(Sa, 0, pCopy, 0, n);
      String[] sortedValues = new String[n];
      Arrays.sort(Sa);

      for (int j=0; j<n; j++) {
        sortedValues[j] = nonEmptyValues[Utils.minIndex(pCopy)];
        pCopy[Utils.minIndex(pCopy)] = Double.MAX_VALUE;
      }

      // for the attribute values whose class frequency is not 0
      String tempStr = "";
      for (int j=0; j<nonEmpty-1; j++) {
        currDist = new double[2][numClasses];
        if (tempStr.equals("")) tempStr = "(" + sortedValues[j] + ")";
        else tempStr += "|" + "(" + sortedValues[j] + ")";
        for (int i=0; i<sortedIndices.length; i++) {
          Instance inst = data.instance(sortedIndices[i]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[i];
          } else currDist[1][(int)inst.classValue()] += weights[i];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int kk=0; kk<2; kk++) {
          tempDist[kk] = currDist[kk];
        }

        double[] tempProps = new double[2];
        for (int kk=0; kk<2; kk++) {
          tempProps[kk] = Utils.sum(tempDist[kk]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int mstart = missingStart;
        while (mstart < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[mstart]);
          for (int jj = 0; jj < 2; jj++) {
            tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
          }
          mstart++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int jj = 0; jj < 2; jj++) {
            //dist[jj] = new double[currDist[jj].length];
            System.arraycopy(tempDist[jj], 0, dist[jj], 0, dist[jj].length);
          }
        }
      }
    }

    // Compute weights
    int attIndex = att.index();
    props[attIndex] = new double[2];
    for (int k = 0; k < 2; k++) {
      props[attIndex][k] = Utils.sum(dist[k]);
    }
    if (!(Utils.sum(props[attIndex]) > 0)) {
      for (int k = 0; k < props[attIndex].length; k++) {
        props[attIndex][k] = 1.0 / (double)props[attIndex].length;
      }
    } else {
      Utils.normalize(props[attIndex]);
    }

    // Compute subset weights
    subsetWeights[attIndex] = new double[2];
    for (int j = 0; j < 2; j++) {
      subsetWeights[attIndex][j] += Utils.sum(dist[j]);
    }

    // Then, attribute values whose class frequency is 0 are sent down the
    // more frequent branch
    for (int j=0; j<empty; j++) {
      if (props[attIndex][0] >= props[attIndex][1]) {
        if (bestSplitString.equals("")) bestSplitString = "(" + emptyValues[j] + ")";
        else bestSplitString += "|" + "(" + emptyValues[j] + ")";
      }
    }

    // clean gain
    gains[attIndex] = Math.rint(bestGain*10000000)/10000000.0;

    dists[attIndex] = dist;
    return bestSplitString;
  }

  /**
   * Split data into two subsets and store sorted indices and weights for the two
   * successor nodes.
   *
   * @param subsetIndices sorted indices of instances for each attribute for the two successor nodes
   * @param subsetWeights weights of instances for each attribute for the two successor nodes
   * @param att           attribute the split is based on
   * @param splitPoint    split point the split is based on if att is numeric
   * @param splitStr      split subset the split is based on if att is nominal
   * @param sortedIndices sorted indices of the instances to be split
   * @param weights       weights of the instances to be split
   * @param data          training data
   * @throws Exception if something goes wrong
   */
  protected void splitData(int[][][] subsetIndices, double[][][] subsetWeights,
      Attribute att, double splitPoint, String splitStr, int[][] sortedIndices,
      double[][] weights, Instances data) throws Exception {

    int j;
    // For each attribute
    for (int i = 0; i < data.numAttributes(); i++) {
      if (i==data.classIndex()) continue;
      int[] num = new int[2];
      for (int k = 0; k < 2; k++) {
        subsetIndices[k][i] = new int[sortedIndices[i].length];
        subsetWeights[k][i] = new double[weights[i].length];
      }

      for (j = 0; j < sortedIndices[i].length; j++) {
        Instance inst = data.instance(sortedIndices[i][j]);
        if (inst.isMissing(att)) {
          // Split instance up
          for (int k = 0; k < 2; k++) {
            if (m_Props[k] > 0) {
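The excerpt above appears to come from Weka's BFTree (best-first decision tree) learner: it searches for the best binary split on a nominal attribute, ordering values by P(class 0) in the two-class case, enumerating all 2^(nonEmpty-1) value partitions when that is cheap, and otherwise ordering values by their score on the first principal component of the class-probability matrix and scanning only the prefix subsets of that ordering. The winning subset is returned as a string such as "(a)|(c)", and membership is tested with indexOf("(" + value + ")"). Below is a minimal, hedged usage sketch showing how the surrounding classifier is typically driven through the standard Weka API; it assumes the BFTree class is on the classpath (e.g. Weka 3.6 or the bestFirstTree package) and uses a placeholder dataset path "my-data.arff".

import java.util.Random;

import weka.classifiers.Evaluation;
import weka.classifiers.trees.BFTree;                      // assumption: BFTree available (Weka 3.6 / bestFirstTree package)
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class BFTreeDemo {

  public static void main(String[] args) throws Exception {
    // Load a dataset; "my-data.arff" is a placeholder path, not from the original listing.
    Instances data = DataSource.read("my-data.arff");
    data.setClassIndex(data.numAttributes() - 1);          // treat the last attribute as the class

    BFTree tree = new BFTree();
    // -M 2: require at least 2 instances at a leaf; pruning strategy, Gini vs. info gain,
    // and the heuristic nominal-attribute search keep their defaults.
    tree.setOptions(weka.core.Utils.splitOptions("-M 2"));

    // 10-fold cross-validation of the best-first tree.
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(tree, data, 10, new Random(1));
    System.out.println(eval.toSummaryString());

    // Build on the full data and print the tree; nominal splits appear as the
    // value-subset strings (e.g. "(a)|(c)") produced by the search shown above.
    tree.buildClassifier(data);
    System.out.println(tree);
  }
}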