svm_al.c
      if (astd->train_dim_sat_vect) {
	for (i=0; i<astd->ndim_sat; i++) {
	  astd->train_dim_sat_vect[i][nloop] = astd->train_dim_sat_vect[i][nloop-1];
	}
      }
      if (astd->sv_dim_sat_vect) {
	for (i=0; i<astd->ndim_sat; i++) {
	  astd->sv_dim_sat_vect[i][nloop] = astd->sv_dim_sat_vect[i][nloop-1];
	}
      }
      if (astd->test_scores) {
	for (i=0; i<astd->ntest; i++) {
	  astd->test_scores[nloop][i] = astd->test_scores[nloop-1][i];
	}
      }
      /* END LOGGING CODE */

      /* this code doesn't get touched until after examples were added */
      nleft -= dec;
      cur_hyp_yvect = &(cur_hyp_yvect[dec]);
      train_cscores = &(train_cscores[dec]);
    }

    /* sanity check: no indices < sub_ndocs should remain in the score array */
    for (i=0; i<nplabeled-sub_ndocs; i++) {
      assert(train_cscores[i].i >= sub_ndocs);
    }

    if (sub_ndocs == nplabeled) {
      break;
    }

    /* now use the scores (& possibly other things) to choose the next
     * examples to learn */
    if (nleft < qsize) {
      dec = nleft;
    } else {
      dec = qsize;
    }

    /* do this even if nleft<qsize to find the min... */
    if (!do_random_learning) {
      get_top_n(train_cscores, nleft, dec);
    }

    /* this is where the termination criterion goes - right now it's pretty dumb...
     * (it would be a fn, but since bookkeeping & setting up need to go on in
     * here anyway, i'm just computing it here) */
    if ((train_cscores[0].d > 1) && (0)) {	/* the "&& (0)" keeps this disabled */
      break;
    }

    /* this only matters when transduction is being used (otherwise it's harmless) */
    changed = 0;

    /* BEGIN LOGGING CODE */
    if (astd->query_anvect && astd->query_apvect) {
      astd->query_anvect[nloop] = astd->query_apvect[nloop] = 0;
    }
    /* END LOGGING CODE */

    /* query the "oracle" - swap each chosen example into the labeled region */
    for (j=0; j<dec; j++) {
      int t, tj;
      bow_wv *twv;

      tj = train_cscores[j].i;
      t = tdocs[sub_ndocs+j];
      tdocs[sub_ndocs+j] = tdocs[tj];
      tdocs[tj] = t;
      twv = train_docs[sub_ndocs+j];
      train_docs[sub_ndocs+j] = train_docs[tj];
      train_docs[tj] = twv;
      t = train_yvect[sub_ndocs+j];
      train_yvect[sub_ndocs+j] = train_yvect[tj];
      train_yvect[tj] = t;

      if (svm_al_do_trans) {
	if ((train_yvect[sub_ndocs+j] != cur_hyp_yvect[j])
	    || (weights[sub_ndocs+j] >= svm_trans_cstar - svm_epsilon_a)) {
	  changed = 1;
	}
      }

      /* BEGIN LOGGING CODE */
      if (astd->query_anvect && astd->query_apvect) {
	double out = 0.0;	/* stays 0 (counted as wrong) if neither case below applies */
	/* note: the original evaluated train_docs[i] here, but i is stale from
	 * an earlier loop; the document just queried is train_docs[sub_ndocs+j] */
	if (svm_kernel_type == 0) {
	  out = evaluate_model_hyperplane(*W, tb, train_docs[sub_ndocs+j]);
	} else if (svm_al_do_trans) {
	  out = evaluate_model_cache(train_docs, weights, hyp_yvect, tb,
				     train_docs[sub_ndocs+j], nsv);
	}
	if (train_yvect[sub_ndocs+j]*out > 0) {
	  if (train_yvect[sub_ndocs+j] > 0) {
	    astd->query_apvect[nloop]++;
	  } else {
	    astd->query_anvect[nloop]++;
	  }
	}
      }
      /* END LOGGING CODE */

      /* also need to swap the scores - since they will be used if the output
       * doesn't change; scan until the entry for the swapped-in doc is found */
      for (i=0; ; i++) {
	if (train_cscores[i].i == sub_ndocs+j) {
	  train_cscores[i].i = tj;
	  break;
	}
      }
      train_cscores[j].i = sub_ndocs+j;
      if (astd->scores_added)
	astd->scores_added[sub_ndocs+j] = train_cscores[j].d;
    }

    for (j=0; j<dec; j++) {
      hyp_yvect[sub_ndocs+j] = train_yvect[sub_ndocs+j];
    }

    if (!changed) {
      n_trans_correct++;
    }

    last_subndocs = sub_ndocs;

    /* calculate the tvals that are necessary */
    if (svm_use_smo) {
      /* the original loop bound was "j<dec", which skips the newly queried
       * examples once sub_ndocs grows past dec; they occupy slots
       * sub_ndocs..sub_ndocs+dec-1 */
      for (j=sub_ndocs; j<sub_ndocs+dec; j++) {
	weights[j] = 0.0;	/* tvals[j] doesn't matter */
      }
      sub_ndocs += dec;
    } else {
      int n;
      for (n=0; n<dec; n++) {
	/* accumulate the kernel expansion for the newly added doc; j walks
	 * until all nsv nonzero-weight (support vector) entries are seen */
	for (j=k=0; k<nsv; j++) {
	  if (weights[j] != 0.0) {
	    tvals[sub_ndocs] += weights[j] * train_yvect[j]
	      * svm_kernel_cache(train_docs[sub_ndocs], train_docs[j]);
	    k++;
	  }
	}
	sub_ndocs++;
      }
    }

    /* if we no longer need W, let's ditch it (note - the loop never exits here
     * so a valid W is still in place for the calling fn.)
     */
    if (!svm_use_smo && svm_kernel_type == 0) {
      free(*W);
      *W = NULL;
    }
  }

  if (svm_al_do_trans) {
    printf("Queried for a total of %d labels.\nSkipped %d loops w/ transduction.\n",
	   sub_ndocs, n_trans_correct);
  }

  free(hyp_yvect);
  free(train_scores);
  free(tvals);
  if (sv_sat_vect) {
    free(sv_sat_vect);
    free(old_svbitmap);
  }
  if (train_sat_vect) {
    free(train_sat_vect);
  }

  /* fill everything back in - depermute everything: tdocs[i] holds the
   * original index of the document now sitting at slot i; keep swapping
   * until each slot is home */
  for (i=0; i<sub_ndocs; ) {
    int t, j;
    double td;
    bow_wv *twv;
    j = tdocs[i];
    if (j == i) {
      i++;
      continue;
    }
    twv = train_docs[j];
    train_docs[j] = train_docs[i];
    train_docs[i] = twv;
    t = train_yvect[j];
    train_yvect[j] = train_yvect[i];
    train_yvect[i] = t;
    td = weights[j];
    weights[j] = weights[i];
    weights[i] = td;
    tdocs[i] = tdocs[j];
    tdocs[j] = j;
  }
  free(tdocs);

  *b = tb;
  return nsv;
}

/* this cuts up the training set into training & validation */
/* the data coming in has already been permuted */
/* the first docs become the test docs
 * (to prevent us from having to move everything) */
int al_svm_test_wrapper(bow_wv **docs, int *yvect, double *weights, double *b,
			double **W, int ntrans, int ndocs, int do_ts,
			int do_random_learning, int *permute_table) {
  struct al_test_data altd;
  int max_iter;
  int nlabeled;
  int ntrain;
  int nsv;
  int ntest;
  int tp, tn;
  bow_wv **train_docs;
  int *train_y;
  int i, j, k;

  ntrain = altd.ntest = 0;
  nlabeled = ndocs - ntrans;
  ntrain = nlabeled/2;
  ntest = nlabeled - ntrain;
  altd.ntest = ntest;
  train_docs = &(docs[ntest]);
  train_y = &(yvect[ntest]);
  altd.test_docs = docs;
  altd.test_yvect = yvect;

  max_iter = ((ntrain+svm_al_qsize-1) / svm_al_qsize) + 1;

  altd.apvect = (int *) malloc(sizeof(int)*max_iter);
  altd.anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.nsv_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.nbsv_vect = (int *) malloc(sizeof(int)*ntrain);
  altd.prb = (double *) malloc(sizeof(double)*max_iter);
  altd.nkce_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.time_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.query_anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.query_apvect = (int *) malloc(sizeof(int)*max_iter);
  altd.train_anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.train_apvect = (int *) malloc(sizeof(int)*max_iter);
  if (do_ts) {
    altd.test_scores = (double **) malloc(sizeof(double *)*max_iter);
    for (i=0; i<max_iter; i++) {
      altd.test_scores[i] = (double *) malloc(sizeof(double)*altd.ntest);
    }
  } else {
    altd.test_scores = NULL;
  }
  /* note: the original wrote sizeof(int)*max_iter+1 here, which (by operator
   * precedence) adds 1 byte rather than 1 element */
  altd.npos_added = (int *) malloc(sizeof(int)*(max_iter+1));
  altd.nneg_added = (int *) malloc(sizeof(int)*(max_iter+1));
  altd.docs_added = (int *) malloc(sizeof(int)*ntrain);
  altd.scores_added = (double *) malloc(sizeof(double)*ntrain);
  for (i=0; i<ntrain; i++) {
    altd.scores_added[i] = 0.0;
  }
  memset(altd.apvect, -1, max_iter*sizeof(int));
  memset(altd.anvect, -1, max_iter*sizeof(int));
  altd.ndim_sat = NDIM_INSPECTED;
  altd.sv_dim_sat_vect = (int **) malloc(NDIM_INSPECTED*sizeof(int *));
  altd.train_dim_sat_vect = (int **) malloc(NDIM_INSPECTED*sizeof(int *));
  for (i=0; i<NDIM_INSPECTED; i++) {
    altd.sv_dim_sat_vect[i] = (int *) malloc(sizeof(int)*max_iter);
    altd.train_dim_sat_vect[i] = (int *) malloc(sizeof(int)*max_iter);
  }

  nsv = al_svm_guts(train_docs, train_y, weights, b, W, ntrans, ntrain,
		    &altd, do_random_learning);

  for (i=tp=tn=0; i<altd.ntest; i++) {
    if (altd.test_yvect[i] == 1) {
      tp++;
    } else {
      tn++;
    }
  }

  printf("%d positive test documents, %d negative test documents.\npositive accuracy vector: ",
	 tp, tn);
  /* bounds check first, so apvect[max_iter] is never read */
  for (i=0; i < max_iter && altd.apvect[i] >= 0; i++) {
    printf(" %d", altd.apvect[i]);
  }
  printf("\nnegative accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.anvect[j]);
  }
  printf("\nprecision/recall breakeven vector: ");
  for (j=0; j<i; j++) {
    printf(" %f", altd.prb[j]);
  }
  printf("\nquery positive accuracy vector: ");
  for (j=0; j<i-1; j++) {
    printf(" %d", altd.query_apvect[j]);
  }
  printf("\nquery negative accuracy vector: ");
  for (j=0; j<i-1; j++) {
    printf(" %d", altd.query_anvect[j]);
  }
  printf("\ntrain positive accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.train_apvect[j]);
  }
  printf("\ntrain negative accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.train_anvect[j]);
  }
  printf("\nnumber of positive documents inspected: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.npos_added[j]);
  }
  printf("\nnumber of negative documents inspected: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nneg_added[j]);
  }
  printf("\nnumber of support vectors: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nsv_vect[j]);
  }
  printf("\nnumber of bounded support vectors: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nbsv_vect[j]);
  }
  {
    int k;
    int start_index = MIN(ntrain, svm_init_al_tset);
    printf("\n\"Real\" document indices when added: ");
    printf("0(%d", permute_table[altd.docs_added[0]]);
    for (k=1; k<start_index; k++) {
      printf(",%d", permute_table[altd.docs_added[k]]);
    }
    printf(") ");
    for (j=0; j<i-1; j++) {
      printf("%d(%d", j+1, permute_table[altd.docs_added[j*svm_al_qsize+start_index]]);
      for (k=1; k<svm_al_qsize && k+j*svm_al_qsize+start_index<ntrain; k++) {
	printf(",%d", permute_table[altd.docs_added[j*svm_al_qsize+start_index+k]]);
      }
      printf(") ");
    }
    printf("\nminimum scores of documents when added: ");
    for (j=0; j<i-1; j++) {
      printf(" %f", altd.scores_added[j*svm_al_qsize+svm_init_al_tset]);
    }
    printf("\naverage scores of documents when added: ");
    for (j=0; j<i-1; j++) {
      double avg = 0.0;
      for (k=0; k<svm_al_qsize && k+j*svm_al_qsize+svm_init_al_tset<ntrain; k++) {
	avg += altd.scores_added[j*svm_al_qsize+k+svm_init_al_tset];
      }
      printf(" %f", avg/k);
    }
  }
  printf("\nrunning times: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.time_vect[j]);
  }
  printf("\nkernel_cache calls: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nkce_vect[j]);
  }
  for (k=0; k<NDIM_INSPECTED; k++) {
    /* the following is only good if the 0'th # of dimensions == 1 */
    int num_words = altd.train_dim_sat_vect[0][i-1];
    printf("\nnumber of SV dimensions with more than %d elements (%d total dimensions): ",
	   dim_map(k), num_words);
    for (j=0; j<i; j++) {
      printf(" %d", altd.sv_dim_sat_vect[k][j]);
    }
  }
  for (k=0; k<NDIM_INSPECTED; k++) {
    int num_words = altd.train_dim_sat_vect[0][i-1];
    printf("\nnumber of train dimensions with more than %d elements (%d total dimensions): ",
	   dim_map(k), num_words);
    for (j=0; j<i; j++) {
      printf(" %d", altd.train_dim_sat_vect[k][j]);
    }
  }
  if (do_ts) {
    printf("\nbegin score matrix:");
    for (j=0; j<i; j++) {
      int k;
      printf("\n");
      for (k=0; k<altd.ntest; k++) {
	printf(" %.3f", altd.test_scores[j][k]);
      }
    }
    printf("\nend score matrix\n");
    for (i=0; i<max_iter; i++) {
      free(altd.test_scores[i]);
    }
    free(altd.test_scores);
  } else {
    printf("\n");
  }

  for (i=0; i<NDIM_INSPECTED; i++) {
    free(altd.sv_dim_sat_vect[i]);
    free(altd.train_dim_sat_vect[i]);
  }
  free(altd.docs_added);
  free(altd.scores_added);
  free(altd.apvect);
  free(altd.anvect);
  free(altd.prb);
  free(altd.nsv_vect);
  free(altd.nbsv_vect);
  free(altd.time_vect);
  free(altd.sv_dim_sat_vect);
  free(altd.train_dim_sat_vect);
  free(altd.nkce_vect);
  free(altd.npos_added);
  free(altd.nneg_added);
  free(altd.query_anvect);
  free(altd.query_apvect);
  free(altd.train_anvect);
  free(altd.train_apvect);

  return nsv;
}
int al_svm(bow_wv **docs, int *yvect, double *weights, double *b, double **W,
	   int ntrans, int ndocs, int do_rlearn) {
  struct al_test_data altd;
  bzero(&altd, sizeof(struct al_test_data));
  return (al_svm_guts(docs, yvect, weights, b, W, ntrans, ndocs, &altd, do_rlearn));
}
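
For orientation, here is a minimal sketch of how a caller might drive the al_svm entry point above. It is an illustration under stated assumptions, not code from the library: run_al_svm is a hypothetical helper, and the zero-initialized weights buffer and the 0 passed for do_rlearn are guesses consistent with the signatures in this file; only bow_wv, al_svm, and the parameter meanings come from the source.

/* Hypothetical driver (not from the original source): train an
 * active-learning SVM over ndocs word vectors, the last ntrans of which
 * are the unlabeled pool for transduction.  Assumes the bow declarations
 * (bow_wv, al_svm) above are in scope. */
#include <stdio.h>
#include <stdlib.h>

int run_al_svm(bow_wv **docs, int *yvect, int ndocs, int ntrans)
{
  double *weights = (double *) calloc(ndocs, sizeof(double)); /* alpha per doc */
  double *W = NULL;	/* hyperplane weights; filled in for the linear kernel */
  double b = 0.0;	/* bias term, set by al_svm */
  int nsv;

  /* final argument 0 = margin-based querying rather than random selection */
  nsv = al_svm(docs, yvect, weights, &b, &W, ntrans, ndocs, 0);
  printf("trained model: %d support vectors, b = %f\n", nsv, b);

  if (W)
    free(W);
  free(weights);
  return nsv;
}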