📄 racesearch.java
      // keep an eye on how many test instances have been randomly sampled
      int sampleCount = 0;

      // run the current set of races
      while (!won) {
        // generate a random binary string
        for (int i = 0; i < m_numAttribs; i++) {
          if (i != m_classIndex) {
            if (!attributeConstraints[i]) {
              if (r.nextDouble() < 0.5) {
                randomB.set(i);
              } else {
                randomB.clear(i);
              }
            } else {
              // this position has been decided from previous races
              if (base[i] == '1') {
                randomB.set(i);
              } else {
                randomB.clear(i);
              }
            }
          }
        }

        // randomly select an instance to test on
        int testIndex = Math.abs(r.nextInt() % numInstances);
        trainCV = data.trainCV(numInstances, testIndex);
        testCV = data.testCV(numInstances, testIndex);
        testInstance = testCV.instance(0);
        sampleCount++;

        /* if (sampleCount > numInstances) {
             throw new Exception("raceSchemata: No clear winner after sampling "
                                 + sampleCount + " instances.");
           } */

        m_theEvaluator.buildEvaluator(trainCV);
        // the evaluator must retrain for every test point
        error = -((HoldOutSubsetEvaluator)m_theEvaluator).
          evaluateSubset(randomB, testInstance, true);
        evaluationCount++;

        // see which racers match this random subset
        for (int i = 0; i < m_numAttribs; i++) {
          if (randomB.get(i)) {
            randomBC[i] = '1';
          } else {
            randomBC[i] = '0';
          }
        }
        // System.err.println("Random subset: " + (new String(randomBC)));

        checkRaces: for (int i = 0; i < numRaces; i++) {
          // if a pair of racers has evaluated more than num instances
          // then bail out---unlikely that having any more atts is any
          // better than the current base set.
          if (((raceStats[i][0].count + raceStats[i][1].count) / 2) > (numInstances)) {
            break raceSet;
          }

          for (int j = 0; j < 2; j++) {
            boolean matched = true;
            for (int k = 0; k < m_numAttribs; k++) {
              if (parallelRaces[i][j][k] != '*') {
                if (parallelRaces[i][j][k] != randomBC[k]) {
                  matched = false;
                  break;
                }
              }
            }

            if (matched) {
              // update the stats for this racer
              // System.err.println("Matched " + i + " " + j);
              raceStats[i][j].add(error);

              // does this race have a clear winner, meaning we can
              // terminate the whole set of parallel races?
              if (raceStats[i][0].count > m_samples
                  && raceStats[i][1].count > m_samples) {
                raceStats[i][0].calculateDerived();
                raceStats[i][1].calculateDerived();
                // System.err.println(j + " : " + (new String(parallelRaces[i][j])));
                // System.err.println(raceStats[i][0]);
                // System.err.println(raceStats[i][1]);

                // check the ttest
                double prob;
                // first check for difference in variance
                /* if ((prob = ftest(raceStats[i][0], raceStats[i][1])) < m_sigLevel) {
                     System.err.println("Ftest prob (diff var) : " + prob);
                     prob = tutest(raceStats[i][0], raceStats[i][1]);
                   } else { */
                // use ttest for same variance
                // System.err.println("Ftest prob : " + prob);
                prob = ttest(raceStats[i][0], raceStats[i][1]);
                // }
                // System.err.println("Prob :" + prob);

                if (prob < m_sigLevel) {
                  // stop the races we have a winner!
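                  // the pooled t-test says the two mean hold-out errors differ
                  // at the requested significance level, so pick the racer with
                  // the lower mean error; it becomes the base set for the next
                  // round of races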
                  if (raceStats[i][0].mean < raceStats[i][1].mean) {
                    base = (char [])parallelRaces[i][0].clone();
                    m_bestMerit = raceStats[i][0].mean;
                    if (m_debug) {
                      System.err.println("contender 0 won ");
                    }
                  } else {
                    base = (char [])parallelRaces[i][1].clone();
                    m_bestMerit = raceStats[i][1].mean;
                    if (m_debug) {
                      System.err.println("contender 1 won");
                    }
                  }

                  if (m_debug) {
                    System.err.println((new String(parallelRaces[i][0]))
                                       + " " + (new String(parallelRaces[i][1])));
                    System.err.println("Means : " + raceStats[i][0].mean
                                       + " vs " + raceStats[i][1].mean);
                    System.err.println("Evaluations so far : " + evaluationCount);
                  }
                  won = true;
                  break checkRaces;
                }
              }
            }
          }
        }
      }

      numRaces--;

      // set up the next set of races if necessary
      if (numRaces > 0 && won) {
        parallelRaces = new char [numRaces][2][m_numAttribs-1];
        raceStats = new Stats[numRaces][2];

        // update the attribute constraints
        for (int i = 0; i < m_numAttribs; i++) {
          if (i != m_classIndex && !attributeConstraints[i] && base[i] != '*') {
            attributeConstraints[i] = true;
            break;
          }
        }

        count = 0;
        for (int i = 0; i < numRaces; i++) {
          parallelRaces[i][0] = (char [])base.clone();
          parallelRaces[i][1] = (char [])base.clone();
          for (int j = count; j < m_numAttribs; j++) {
            if (j != m_classIndex && parallelRaces[i][0][j] == '*') {
              parallelRaces[i][0][j] = '1';
              parallelRaces[i][1][j] = '0';
              count = j + 1;
              break;
            }
          }
        }

        if (m_debug) {
          System.err.println("Next sets:\n");
          for (int i = 0; i < numRaces; i++) {
            System.err.print(printSets(parallelRaces[i]) + "--------------\n");
          }
        }
      }
    }

    if (m_debug) {
      System.err.println("Total evaluations : " + evaluationCount);
    }
    return attributeList(base);
  }

  // Adapted from numerical recipes
  private double gammln(double xx) {
    double x, y, tmp, ser;
    final double [] cof = {76.18009172947146, -86.50532032941677,
                           24.01409824083091, -1.231739572450155,
                           0.1208650973866179e-2, -0.5395239384953e-5};
    int j;

    y = x = xx;
    tmp = x + 5.5;
    tmp -= (x + 0.5) * Math.log(tmp);
    ser = 1.000000000190015;
    for (j = 0; j <= 5; j++) ser += cof[j] / ++y;
    return -tmp + Math.log(2.5066282746310005 * ser / x);
  }

  // Adapted from numerical recipes
  private double betacf(double a, double b, double x) throws Exception {
    final int maxit = 100;
    final double eps = 3.0e-7;
    final double fpmin = 1.0e-30;
    int m, m2;
    double aa, c, d, del, h, qab, qam, qap;

    // System.err.println("a " + a + " b " + b + " x " + x);
    qab = a + b;
    qap = a + 1.0;
    qam = a - 1.0;
    c = 1.0;
    d = 1.0 - qab * x / qap;
    if (Math.abs(d) < fpmin) d = fpmin;
    d = 1.0 / d;
    h = d;
    for (m = 1; m <= maxit; m++) {
      m2 = 2 * m;
      aa = m * (b - m) * x / ((qam + m2) * (a + m2));
      d = 1.0 + aa * d;
      if (Math.abs(d) < fpmin) d = fpmin;
      c = 1.0 + aa / c;
      if (Math.abs(c) < fpmin) c = fpmin;
      d = 1.0 / d;
      h *= d * c;
      aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
      d = 1.0 + aa * d;
      if (Math.abs(d) < fpmin) d = fpmin;
      c = 1.0 + aa / c;
      if (Math.abs(c) < fpmin) c = fpmin;
      d = 1.0 / d;
      del = d * c;
      h *= del;
      if (Math.abs(del - 1.0) < eps) break;
    }
    if (m > maxit) {
      throw new Exception("a or b too big, or maxit too small in betacf");
    }
    return h;
  }

  // Adapted from numerical recipes
  private double betai(double a, double b, double x) throws Exception {
    double bt;

    // System.err.println("***a " + a + " b " + b + " x " + x);
    if (x < 0.0 || x > 1.0) {
      throw new Exception("Bad x in routine betai");
    }
    if (x == 0.0 || x == 1.0) bt = 0.0;
    else
      bt = Math.exp(gammln(a + b) - gammln(a) - gammln(b)
                    + a * Math.log(x) + b * Math.log(1.0 - x));
    if (x < (a + 1.0) / (a + b + 2.0)) return bt * betacf(a, b, x) / a;
    else return 1.0 - bt * betacf(b, a, 1.0 - x) / b;
  }
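  /*
   * Illustrative sketch (not part of the original file): betai above is the
   * regularized incomplete beta function I_x(a, b), which is all that is
   * needed to turn a t statistic into a p-value.  For |t| with df degrees of
   * freedom the two-tailed probability is I_{df/(df + t^2)}(df/2, 1/2), the
   * same call made by ttest() below.  The method name and the numbers used
   * here are hypothetical and purely for illustration.
   */
  private double twoTailedTProbExample() throws Exception {
    double t = 2.0;    // hypothetical t statistic
    double df = 10.0;  // hypothetical degrees of freedom
    // identical in form to the call made in ttest() below
    return betai(0.5 * df, 0.5, df / (df + t * t));
  }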
  // ftest for differences in variance. Returns probability that difference
  // in variance is due to chance. Adapted from numerical recipes
  private double ftest(Stats c1, Stats c2) throws Exception {
    double n1 = c1.count; double n2 = c2.count;
    double var1 = c1.stdDev * c1.stdDev;
    double var2 = c2.stdDev * c2.stdDev;
    double ave1 = c1.mean; double ave2 = c2.mean;
    double f, df1, df2, prob;

    if (var1 > var2) {
      f = var1 / var2; df1 = n1 - 1; df2 = n2 - 1;
    } else {
      f = var2 / var1; df1 = n2 - 1; df2 = n1 - 1;
    }
    prob = 2.0 * betai(0.5 * df2, 0.5 * df1, df2 / (df2 + df1 * f));
    if (prob > 1.0) prob = 2.0 - prob;
    return prob;
  }

  // t-test for unequal sample sizes and same variance. Returns probability
  // that observed difference in means is due to chance. Adapted from
  // numerical recipes
  private double ttest(Stats c1, Stats c2) throws Exception {
    double n1 = c1.count; double n2 = c2.count;
    double var1 = c1.stdDev * c1.stdDev;
    double var2 = c2.stdDev * c2.stdDev;
    double ave1 = c1.mean; double ave2 = c2.mean;

    double df = n1 + n2 - 2;
    double svar = ((n1 - 1) * var1 + (n2 - 1) * var2) / df;
    double t = (ave1 - ave2) / Math.sqrt(svar * (1.0 / n1 + 1.0 / n2));
    t = Math.abs(t);
    // System.err.println("t : " + t);
    double prob = betai(0.5 * df, 0.5, df / (df + t * t));
    return prob;
  }

  // t-test for unequal sample sizes and different variances. Returns the
  // probability. Adapted from numerical recipes
  private double tutest(Stats c1, Stats c2) throws Exception {
    double n1 = c1.count; double n2 = c2.count;
    double var1 = c1.stdDev * c1.stdDev;
    double var2 = c2.stdDev * c2.stdDev;
    double ave1 = c1.mean; double ave2 = c2.mean;

    double t = (ave1 - ave2) / Math.sqrt(var1 / n1 + var2 / n2);
    t = Math.abs(t);
    // Welch-Satterthwaite degrees of freedom: the numerical recipes formula
    // squares the per-sample variance terms (not their square roots)
    double df = ((var1 / n1 + var2 / n2) * (var1 / n1 + var2 / n2))
      / ((var1 / n1) * (var1 / n1) / (n1 - 1)
         + (var2 / n2) * (var2 / n2) / (n2 - 1));
    // System.err.println("t : " + t);
    // System.err.println("df : " + df);
    double prob = betai(0.5 * df, 0.5, df / (df + t * t));
    return prob;
  }

  /**
   * Performs a rank race---race consisting of no attributes, the top
   * ranked attribute, the top two attributes etc. The initial ranking
   * is determined by an attribute evaluator.
   * @param data the instances to estimate accuracy over
   * @return an array of selected attribute indices.
   */
  private int [] rankRace(Instances data) throws Exception {
    char [] baseSet = new char [m_numAttribs];
    char [] bestSet;
    double bestSetError;
    for (int i = 0; i < m_numAttribs; i++) {
      if (i == m_classIndex) {
        baseSet[i] = '-';
      } else {
        baseSet[i] = '0';
      }
    }

    int numCompetitors = m_numAttribs - 1;
    char [][] raceSets = new char [numCompetitors + 1][m_numAttribs];
    int winner;

    if (m_ASEval instanceof AttributeEvaluator) {
      // generate the attribute ranking first
      Ranker ranker = new Ranker();
      ((AttributeEvaluator)m_ASEval).buildEvaluator(data);
      m_Ranking = ranker.search((AttributeEvaluator)m_ASEval, data);
    } else {
      ForwardSelection fs = new ForwardSelection();
      double [][] rankres;
      fs.setGenerateRanking(true);
      ((SubsetEvaluator)m_ASEval).buildEvaluator(data);
      fs.search(m_ASEval, data);
      rankres = fs.rankedAttributes();
      m_Ranking = new int[rankres.length];
      for (int i = 0; i < rankres.length; i++) {
        m_Ranking[i] = (int)rankres[i][0];
      }
    }

    // set up the race
    raceSets[0] = (char [])baseSet.clone();
    for (int i = 0; i < m_Ranking.length; i++) {
      raceSets[i + 1] = (char [])raceSets[i].clone();
      raceSets[i + 1][m_Ranking[i]] = '1';
    }

    if (m_debug) {
      System.err.println("Initial sets:\n" + printSets(raceSets));
    }

    // run the race
    double [] winnerInfo = raceSubsets(raceSets, data, true);
    bestSetError = winnerInfo[1];
    bestSet = (char [])raceSets[(int)winnerInfo[0]].clone();
    m_bestMerit = bestSetError;
    return attributeList(bestSet);
  }
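  /*
   * Worked example (not in the original source): for a hypothetical ranking
   * {2, 0, 3} over five attributes with the class at index 4, the loop above
   * builds one competitor per prefix of the ranking ('-' marks the class
   * position):
   *
   *   0000-   no attributes (the base set)
   *   0010-   top ranked attribute only
   *   1010-   top two attributes
   *   1011-   top three attributes
   *
   * raceSubsets() then races all of these candidates against each other in
   * parallel and returns the index and error of the winning subset.
   */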
  /**
   * Performs a hill climbing race---all single attribute changes to a
   * base subset are raced in parallel. The winner is chosen and becomes
   * the new base subset and the process is repeated until there is no
   * improvement in error over the base subset.
   * @param data the instances to estimate accuracy over
   * @return an array of selected attribute indices.
   */
  private int [] hillclimbRace(Instances data) throws Exception {
    double baseSetError;
    char [] baseSet = new char [m_numAttribs];
    int rankCount = 0;

    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (m_raceType == FORWARD_RACE) {
          baseSet[i] = '0';
        } else {
          baseSet[i] = '1';
        }
      } else {
        baseSet[i] = '-';
      }
    }

    int numCompetitors = m_numAttribs - 1;
    char [][] raceSets = new char [numCompetitors + 1][m_numAttribs];
    int winner;

    raceSets[0] = (char [])baseSet.clone();
    int count = 1;
    // initialize each race set to 1 attribute
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        raceSets[count] = (char [])baseSet.clone();
        if (m_raceType == BACKWARD_RACE) {
          raceSets[count++][i] = '0';
        } else {
          raceSets[count++][i] = '1';
        }
      }
    }

    if (m_debug) {
      System.err.println("Initial sets:\n" + printSets(raceSets));
    }

    // race the initial sets (base set either no or all features)
    double [] winnerInfo = raceSubsets(raceSets, data, true);
    baseSetError = winnerInfo[1];
    m_bestMerit = baseSetError;
    baseSet = (char [])raceSets[(int)winnerInfo[0]].clone();
    if (m_rankingRequested) {
      m_rankedAtts[m_rankedSoFar][0] = (int)(winnerInfo[0] - 1);
      m_rankedAtts[m_rankedSoFar][1] = winnerInfo[1];
      m_rankedSoFar++;
    }

    boolean improved = true;
    int j;
    // now race until there is no improvement over the base set or only
    // one competitor remains
    while (improved) {
      // generate the next set of competitors
      numCompetitors--;
      if (numCompetitors == 0) {
        // race finished!
        break;
      }
      j = 0;
      // +1. we'll race against the base set---might be able to bail out
      // of the race if none from the new set are statistically better
      // than the base set. Base set is stored in loc 0.
      raceSets = new char [numCompetitors + 1][m_numAttribs];
      for (int i = 0; i < numCompetitors + 1; i++) {
        raceSets[i] = (char [])baseSet.clone();
        if (i > 0) {
          for (int k = j; k < m_numAttribs; k++) {
            if (m_raceType == 1) {
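  /*
   * Illustrative usage sketch (not part of RaceSearch itself, and separate
   * from the listing above): a minimal driver showing how a race search is
   * normally run through WEKA's attribute selection API.  It assumes the
   * standard weka.attributeSelection classes named below; the ARFF path is a
   * placeholder.  ClassifierSubsetEval is used because the races above need a
   * HoldOutSubsetEvaluator.
   */
  class RaceSearchExample {
    public static void main(String [] args) throws Exception {
      // load a dataset and use the last attribute as the class
      weka.core.Instances data = new weka.core.Instances(
        new java.io.BufferedReader(new java.io.FileReader("data.arff")));
      data.setClassIndex(data.numAttributes() - 1);

      // hold-out subset evaluator required by the race (wraps a classifier)
      weka.attributeSelection.ClassifierSubsetEval eval =
        new weka.attributeSelection.ClassifierSubsetEval();

      // race search left at its default settings; race type, significance
      // level etc. are configurable through the search's options
      weka.attributeSelection.RaceSearch search =
        new weka.attributeSelection.RaceSearch();

      weka.attributeSelection.AttributeSelection selector =
        new weka.attributeSelection.AttributeSelection();
      selector.setEvaluator(eval);
      selector.setSearch(search);
      selector.SelectAttributes(data);

      // indices of the selected attributes (the class index is included)
      int [] selected = selector.selectedAttributes();
      System.out.println(java.util.Arrays.toString(selected));
    }
  }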