📄 bftree.java
      }

      for (int j=0; j<nonEmpty; j++) {
        double distSum = Utils.sum(valDist[j]);
        if (distSum==0) pClass0[j] = 0;
        else pClass0[j] = valDist[j][0]/distSum;
      }

      // sort categories according to the probability of class 0
      String[] sortedValues = new String[nonEmpty];
      for (int j=0; j<nonEmpty; j++) {
        sortedValues[j] = nonEmptyValues[Utils.minIndex(pClass0)];
        pClass0[Utils.minIndex(pClass0)] = Double.MAX_VALUE;
      }

      // Find the subset of attribute values that maximizes the impurity decrease,
      // considering only attribute values whose class frequency is not 0
      String tempStr = "";
      for (int j=0; j<nonEmpty-1; j++) {
        currDist = new double[2][numClasses];
        if (tempStr.equals("")) tempStr = "(" + sortedValues[j] + ")";
        else tempStr += "|" + "(" + sortedValues[j] + ")";
        //System.out.println(sortedValues[j]);
        for (int i=0; i<sortedIndices.length; i++) {
          Instance inst = data.instance(sortedIndices[i]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[i];
          } else currDist[1][(int)inst.classValue()] += weights[i];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int kk=0; kk<2; kk++) {
          tempDist[kk] = currDist[kk];
        }

        double[] tempProps = new double[2];
        for (int kk=0; kk<2; kk++) {
          tempProps[kk] = Utils.sum(tempDist[kk]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int mstart = missingStart;
        while (mstart < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[mstart]);
          for (int jj = 0; jj < 2; jj++) {
            tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
          }
          mstart++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int jj = 0; jj < 2; jj++) {
            System.arraycopy(tempDist[jj], 0, dist[jj], 0, dist[jj].length);
          }
        }
      }
    }

    // multi-class problems (exhaustive search over all binary partitions)
    else if (!useHeuristic || nonEmpty<=4) {
      //else if (!useHeuristic || nonEmpty==2) {
      // First, consider only attribute values whose class frequency is not zero
      for (int i=0; i<(int)Math.pow(2, nonEmpty-1); i++) {
        String tempStr = "";
        currDist = new double[2][numClasses];
        int mod;
        int bit10 = i;
        for (int j=nonEmpty-1; j>=0; j--) {
          mod = bit10 % 2;   // convert from decimal to binary: bit j selects value j
          if (mod==1) {
            if (tempStr.equals("")) tempStr = "(" + nonEmptyValues[j] + ")";
            else tempStr += "|" + "(" + nonEmptyValues[j] + ")";
          }
          bit10 = bit10 / 2;
        }

        for (int j=0; j<sortedIndices.length; j++) {
          Instance inst = data.instance(sortedIndices[j]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[j];
          } else currDist[1][(int)inst.classValue()] += weights[j];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int k=0; k<2; k++) {
          tempDist[k] = currDist[k];
        }

        double[] tempProps = new double[2];
        for (int k=0; k<2; k++) {
          tempProps[k] = Utils.sum(tempDist[k]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int index = missingStart;
        while (index < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[index]);
          for (int j = 0; j < 2; j++) {
            tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
          }
          index++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int j = 0; j < 2; j++) {
            //dist[jj] = new double[currDist[jj].length];
            System.arraycopy(tempDist[j], 0, dist[j], 0, dist[j].length);
          }
        }
      }
    }

    // heuristic method for multi-class problems
    else {
      // First, consider only attribute values whose class frequency is not zero
      int n = nonEmpty;
      int k = data.numClasses();                // number of classes of the data
      double[][] P = new double[n][k];          // class probability matrix
      int[] numInstancesValue = new int[n];     // number of instances for an attribute value
      double[] meanClass = new double[k];       // vector of mean class probability
      int numInstances = data.numInstances();   // total number of instances

      // initialize the vector of mean class probability
      for (int j=0; j<meanClass.length; j++) meanClass[j] = 0;

      for (int j=0; j<numInstances; j++) {
        Instance inst = (Instance)data.instance(j);
        int valueIndex = 0;                     // attribute value index in nonEmptyValues
        for (int i=0; i<nonEmpty; i++) {
          if (att.value((int)inst.value(att)).compareToIgnoreCase(nonEmptyValues[i])==0) {
            valueIndex = i;
            break;
          }
        }
        P[valueIndex][(int)inst.classValue()]++;
        numInstancesValue[valueIndex]++;
        meanClass[(int)inst.classValue()]++;
      }

      // calculate the class probability matrix
      for (int i=0; i<P.length; i++) {
        for (int j=0; j<P[0].length; j++) {
          if (numInstancesValue[i]==0) P[i][j] = 0;
          else P[i][j] /= numInstancesValue[i];
        }
      }

      // calculate the vector of mean class probability
      for (int i=0; i<meanClass.length; i++) {
        meanClass[i] /= numInstances;
      }

      // calculate the covariance matrix
      double[][] covariance = new double[k][k];
      for (int i1=0; i1<k; i1++) {
        for (int i2=0; i2<k; i2++) {
          double element = 0;
          for (int j=0; j<n; j++) {
            element += (P[j][i2]-meanClass[i2])*(P[j][i1]-meanClass[i1])
              *numInstancesValue[j];
          }
          covariance[i1][i2] = element;
        }
      }

      Matrix matrix = new Matrix(covariance);
      weka.core.matrix.EigenvalueDecomposition eigen =
        new weka.core.matrix.EigenvalueDecomposition(matrix);
      double[] eigenValues = eigen.getRealEigenvalues();

      // find the index of the largest eigenvalue
      int index = 0;
      double largest = eigenValues[0];
      for (int i=1; i<eigenValues.length; i++) {
        if (eigenValues[i]>largest) {
          index = i;
          largest = eigenValues[i];
        }
      }

      // calculate the first principal component
      double[] FPC = new double[k];
      Matrix eigenVector = eigen.getV();
      double[][] vectorArray = eigenVector.getArray();
      for (int i=0; i<FPC.length; i++) {
        FPC[i] = vectorArray[i][index];
      }

      // calculate the first principal component scores
      double[] Sa = new double[n];
      for (int i=0; i<Sa.length; i++) {
        Sa[i] = 0;
        for (int j=0; j<k; j++) {
          Sa[i] += FPC[j]*P[i][j];
        }
      }

      // sort categories according to Sa(s)
      double[] pCopy = new double[n];
      System.arraycopy(Sa, 0, pCopy, 0, n);
      String[] sortedValues = new String[n];
      Arrays.sort(Sa);

      for (int j=0; j<n; j++) {
        sortedValues[j] = nonEmptyValues[Utils.minIndex(pCopy)];
        pCopy[Utils.minIndex(pCopy)] = Double.MAX_VALUE;
      }

      // for the attribute values whose class frequency is not 0
      String tempStr = "";
      for (int j=0; j<nonEmpty-1; j++) {
        currDist = new double[2][numClasses];
        if (tempStr.equals("")) tempStr = "(" + sortedValues[j] + ")";
        else tempStr += "|" + "(" + sortedValues[j] + ")";
        for (int i=0; i<sortedIndices.length; i++) {
          Instance inst = data.instance(sortedIndices[i]);
          if (inst.isMissing(att)) {
            break;
          }
          if (tempStr.indexOf("(" + att.value((int)inst.value(att)) + ")") != -1) {
            currDist[0][(int)inst.classValue()] += weights[i];
          } else currDist[1][(int)inst.classValue()] += weights[i];
        }

        double[][] tempDist = new double[2][numClasses];
        for (int kk=0; kk<2; kk++) {
          tempDist[kk] = currDist[kk];
        }

        double[] tempProps = new double[2];
        for (int kk=0; kk<2; kk++) {
          tempProps[kk] = Utils.sum(tempDist[kk]);
        }

        if (Utils.sum(tempProps) != 0) Utils.normalize(tempProps);

        // split missing values proportionally to the branch weights
        int mstart = missingStart;
        while (mstart < sortedIndices.length) {
          Instance insta = data.instance(sortedIndices[mstart]);
          for (int jj = 0; jj < 2; jj++) {
            tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
          }
          mstart++;
        }

        double currGain;
        if (useGini) currGain = computeGiniGain(parentDist, tempDist);
        else currGain = computeInfoGain(parentDist, tempDist);

        if (currGain > bestGain) {
          bestGain = currGain;
          bestSplitString = tempStr;
          for (int jj = 0; jj < 2; jj++) {
            //dist[jj] = new double[currDist[jj].length];
            System.arraycopy(tempDist[jj], 0, dist[jj], 0, dist[jj].length);
          }
        }
      }
    }

    // Compute weights
    int attIndex = att.index();
    props[attIndex] = new double[2];
    for (int k = 0; k < 2; k++) {
      props[attIndex][k] = Utils.sum(dist[k]);
    }
    if (!(Utils.sum(props[attIndex]) > 0)) {
      for (int k = 0; k < props[attIndex].length; k++) {
        props[attIndex][k] = 1.0 / (double)props[attIndex].length;
      }
    } else {
      Utils.normalize(props[attIndex]);
    }

    // Compute subset weights
    subsetWeights[attIndex] = new double[2];
    for (int j = 0; j < 2; j++) {
      subsetWeights[attIndex][j] += Utils.sum(dist[j]);
    }

    // Then, attribute values whose class frequency is 0 are sent down the
    // more frequent branch
    for (int j=0; j<empty; j++) {
      if (props[attIndex][0] >= props[attIndex][1]) {
        if (bestSplitString.equals("")) bestSplitString = "(" + emptyValues[j] + ")";
        else bestSplitString += "|" + "(" + emptyValues[j] + ")";
      }
    }

    // clean gain
    gains[attIndex] = Math.rint(bestGain*10000000)/10000000.0;

    dists[attIndex] = dist;
    return bestSplitString;
  }

  /**
   * Split data into two subsets and store sorted indices and weights for the two
   * successor nodes.
   *
   * @param subsetIndices sorted indices of instances for each attribute for the two successor nodes
   * @param subsetWeights weights of instances for each attribute for the two successor nodes
   * @param att           attribute the split is based on
   * @param splitPoint    split point the split is based on if att is numeric
   * @param splitStr      split subset the split is based on if att is nominal
   * @param sortedIndices sorted indices of the instances to be split
   * @param weights       weights of the instances to be split
   * @param data          training data
   * @throws Exception if something goes wrong
   */
  protected void splitData(int[][][] subsetIndices, double[][][] subsetWeights,
      Attribute att, double splitPoint, String splitStr, int[][] sortedIndices,
      double[][] weights, Instances data) throws Exception {

    int j;
    // For each attribute
    for (int i = 0; i < data.numAttributes(); i++) {
      if (i==data.classIndex()) continue;
      int[] num = new int[2];
      for (int k = 0; k < 2; k++) {
        subsetIndices[k][i] = new int[sortedIndices[i].length];
        subsetWeights[k][i] = new double[weights[i].length];
      }

      for (j = 0; j < sortedIndices[i].length; j++) {
        Instance inst = data.instance(sortedIndices[i][j]);
        if (inst.isMissing(att)) {
          // Split instance up
          for (int k = 0; k < 2; k++) {
            if (m_Props[k] > 0) {
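The excerpt above appears to come from Weka's BFTree (best-first decision tree) learner: it searches for the best binary split on a nominal attribute, ordering values by P(class 0) in the two-class case, enumerating all 2^(nonEmpty-1) value partitions when that is cheap, and otherwise ordering values by their score on the first principal component of the class-probability matrix and scanning only the prefix subsets of that ordering. The winning subset is returned as a string such as "(a)|(c)", and membership is tested with indexOf("(" + value + ")"). Below is a minimal, hedged usage sketch showing how the surrounding classifier is typically driven through the standard Weka API; it assumes the BFTree class is on the classpath (e.g. Weka 3.6 or the bestFirstTree package) and uses a placeholder dataset path "my-data.arff".

import java.util.Random;

import weka.classifiers.Evaluation;
import weka.classifiers.trees.BFTree;                      // assumption: BFTree available (Weka 3.6 / bestFirstTree package)
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class BFTreeDemo {

  public static void main(String[] args) throws Exception {
    // Load a dataset; "my-data.arff" is a placeholder path, not from the original listing.
    Instances data = DataSource.read("my-data.arff");
    data.setClassIndex(data.numAttributes() - 1);          // treat the last attribute as the class

    BFTree tree = new BFTree();
    // -M 2: require at least 2 instances at a leaf; pruning strategy, Gini vs. info gain,
    // and the heuristic nominal-attribute search keep their defaults.
    tree.setOptions(weka.core.Utils.splitOptions("-M 2"));

    // 10-fold cross-validation of the best-first tree.
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(tree, data, 10, new Random(1));
    System.out.println(eval.toSummaryString());

    // Build on the full data and print the tree; nominal splits appear as the
    // value-subset strings (e.g. "(a)|(c)") produced by the search shown above.
    tree.buildClassifier(data);
    System.out.println(tree);
  }
}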