/* svm_learn.c */
      if(learn_parm->rho*a_fullset[heldout]*r_delta_sq+xi_fullset[heldout]
         < 1.0) {
        /* guaranteed to not produce a leave-one-out error */
        if(verbosity==1) {
          printf("+"); fflush(stdout);
        }
      }
      else if(xi_fullset[heldout] > 1.0) {
        /* guaranteed to produce a leave-one-out error */
        loo_count++;
        if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
        if(verbosity==1) {
          printf("-"); fflush(stdout);
        }
      }
      else {
        loocomputed++;
        heldout_c=learn_parm->svm_cost[heldout]; /* save upper bound */
        learn_parm->svm_cost[heldout]=0;         /* and set it to zero */
        /* make sure heldout example is not currently */
        /* shrunk away. Assumes that lin is up to date! */
        shrink_state.active[heldout]=1;
        if(verbosity>=2)
          printf("\nLeave-One-Out test on example %ld\n",heldout);
        if(verbosity>=1) {
          printf("(?[%ld]",heldout); fflush(stdout);
        }
        optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                                kernel_parm,
                                kernel_cache,&shrink_state,model,
                                inconsistent,unlabeled,
                                a,lin,c,&timing_profile,
                                &maxdiff,heldout,(long)2);
        /* printf("%.20f\n",(lin[heldout]-model->b)*(double)label[heldout]); */
        if(((lin[heldout]-model->b)*(double)label[heldout]) <= 0.0) {
          loo_count++; /* there was a loo-error */
          if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
          if(verbosity>=1) {
            printf("-)"); fflush(stdout);
          }
        }
        else {
          if(verbosity>=1) {
            printf("+)"); fflush(stdout);
          }
        }
        /* now we need to restore the original data set */
        learn_parm->svm_cost[heldout]=heldout_c; /* restore upper bound */
      }
    } /* end of leave-one-out loop */
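    /* The three branches above implement Joachims' xi-alpha criterion:
       if rho*alpha[i]*R^2 + xi[i] < 1 on the full sample, example i can
       never become a leave-one-out error, and if xi[i] > 1 it always
       will; only the undecided middle cases require an actual
       retraining with the example's upper bound clamped to zero. */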
    if(verbosity>=1) {
      printf("\nRetrain on full problem"); fflush(stdout);
    }
    optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                            kernel_parm,
                            kernel_cache,&shrink_state,model,
                            inconsistent,unlabeled,
                            a,lin,c,&timing_profile,
                            &maxdiff,(long)-1,(long)1);
    if(verbosity >= 1)
      printf("done.\n");

    /* after all leave-one-out computed */
    model->loo_error=100.0*loo_count/(double)totdoc;
    model->loo_recall=(1.0-(double)loo_count_pos/(double)trainpos)*100.0;
    model->loo_precision=(trainpos-loo_count_pos)/
      (double)(trainpos-loo_count_pos+loo_count_neg)*100.0;
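    /* These are the standard definitions applied to the leave-one-out
       counts: with TP = trainpos-loo_count_pos, FN = loo_count_pos and
       FP = loo_count_neg, recall = TP/(TP+FN) and
       precision = TP/(TP+FP), each scaled to a percentage. */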
    if(verbosity >= 1) {
      fprintf(stdout,"Leave-one-out estimate of the error: error=%.2f%%\n",
              model->loo_error);
      fprintf(stdout,"Leave-one-out estimate of the recall: recall=%.2f%%\n",
              model->loo_recall);
      fprintf(stdout,"Leave-one-out estimate of the precision: precision=%.2f%%\n",
              model->loo_precision);
      fprintf(stdout,"Actual leave-one-outs computed: %ld (rho=%.2f)\n",
              loocomputed,learn_parm->rho);
      printf("Runtime for leave-one-out in cpu-seconds: %.2f\n",
             (double)(get_runtime()-runtime_start_loo)/100.0);
    }
  }

  if(learn_parm->alphafile[0])
    write_alphas(learn_parm->alphafile,a,label,totdoc);

  shrink_state_cleanup(&shrink_state);
  free(label);
  free(inconsistent);
  free(unlabeled);
  free(c);
  free(a);
  free(a_fullset);
  free(xi_fullset);
  free(lin);
  free(learn_parm->svm_cost);
}
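/* A minimal illustrative sketch of the xi-alpha rule used in the
   leave-one-out loop above. The helper name is hypothetical (it is not
   part of SVMlight's API), and alpha[], xi[] are assumed to come from
   training on the full sample. */
static long xi_alpha_loo_upper_bound(double *alpha, double *xi, long n,
                                     double rho, double r_delta_sq)
{
  long i,errors=0;
  for(i=0;i<n;i++)
    if(rho*alpha[i]*r_delta_sq+xi[i] >= 1.0) /* can be a loo error */
      errors++;
  return(errors); /* upper bound on the number of leave-one-out errors */
}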
/* Learns an SVM regression model based on the training data in
   docs/value. The resulting model is returned in the structure
   model. */
void svm_learn_regression(DOC **docs, double *value, long int totdoc,
                          long int totwords, LEARN_PARM *learn_parm,
                          KERNEL_PARM *kernel_parm,
                          KERNEL_CACHE **kernel_cache, MODEL *model)
     /* docs:        Training vectors (x-part) */
     /* value:       Training values (y-part) */
     /* totdoc:      Number of examples in docs/value */
     /* totwords:    Number of features (i.e. highest feature index) */
     /* learn_parm:  Learning parameters */
     /* kernel_parm: Kernel parameters */
     /* kernel_cache:Initialized Cache, if using a kernel. NULL if
                     linear. Note that it will be free'd and reassigned */
     /* model:       Returns learning result (assumed empty before called) */
{
  long *inconsistent,i,j;
  long inconsistentnum;
  long upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations,kernel_cache_size;
  long *unlabeled;
  double r_delta_sq=0,r_delta,r_delta_avg;
  double *xi_fullset; /* buffer for storing xi on full sample in loo */
  double *a_fullset;  /* buffer for storing alpha on full sample in loo */
  TIMING timing_profile;
  SHRINK_STATE shrink_state;
  DOC **docs_org;
  long *label;

  /* set up regression problem in standard form */
  docs_org=docs;
  docs = (DOC **)my_malloc(sizeof(DOC *)*2*totdoc);
  label = (long *)my_malloc(sizeof(long)*2*totdoc);
  c = (double *)my_malloc(sizeof(double)*2*totdoc);
  for(i=0;i<totdoc;i++) {
    j=2*totdoc-1-i;
    docs[i]=create_example(i,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
    label[i]=+1;
    c[i]=value[i];
    docs[j]=create_example(j,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
    label[j]=-1;
    c[j]=value[i];
  }
  totdoc*=2;
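  /* Each regression example (x_i,y_i) now appears twice: once with
     label +1 and target c[i]=y_i, and once mirrored with label -1 and
     the same target. Combined with the -eps term in the margin
     constraints below, the two copies encode the two sides of the
     epsilon-insensitive tube |y_i - f(x_i)| <= eps, so regression can
     be handed to the same classification optimizer. */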
  /* need to get a bigger kernel cache */
  if(*kernel_cache) {
    kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
    kernel_cache_cleanup(*kernel_cache);
    (*kernel_cache)=kernel_cache_init(totdoc,kernel_cache_size);
  }

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2)
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }

  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
  model->at_upper_bound=0;
  model->b=0;
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;
  inconsistentnum=0;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;
  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1)
      printf("Setting default regularization parameter C=%.4f\n",
             learn_parm->svm_c);
  }
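  /* Heuristic default: r_delta_avg estimates a typical example length
     in feature space, so C=1/r_delta_avg^2 corresponds to SVMlight's
     documented default of roughly C = 1/avg(||x||^2), normalizing the
     regularization to the scale of the data. */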
  for(i=0;i<totdoc;i++) {  /* various inits */
    inconsistent[i]=0;
    a[i]=0;
    lin[i]=0;
    unlabeled[i]=0;
    if(label[i] > 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
        docs[i]->costfactor;
    }
    else if(label[i] < 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
    }
  }
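  /* Per-example upper bounds on alpha: positives get
     C*costratio*costfactor (the cost ratio is SVMlight's -j option),
     negatives plain C*costfactor, so asymmetric class costs and
     per-document weights enter through the same array. */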
  /* caching makes no sense for linear kernel */
  if((kernel_parm->kernel_type == LINEAR) && (*kernel_cache)) {
    printf("WARNING: Using a kernel cache for linear case will slow optimization down!\n");
  }

  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }

  /* train the svm */
  iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                                     kernel_parm,*kernel_cache,&shrink_state,
                                     model,inconsistent,unlabeled,a,lin,c,
                                     &timing_profile,&maxdiff,(long)-1,
                                     (long)1);

  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);
    printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);
    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
             ((float)runtime_end-(float)runtime_start)/100.0,
             (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
             (runtime_end-runtime_start)/100.0);
    }
    if(learn_parm->remove_inconsistent) {
      inconsistentnum=0;
      for(i=0;i<totdoc;i++)
        if(inconsistent[i])
          inconsistentnum++;
      printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
             model->sv_num-1,inconsistentnum);
    }
    else {
      upsupvecnum=0;
      for(i=1;i<model->sv_num;i++) {
        if(fabs(model->alpha[i]) >=
           (learn_parm->svm_cost[(model->supvec[i])->docnum]-
            learn_parm->epsilon_a))
          upsupvecnum++;
      }
      printf("Number of SV: %ld (including %ld at upper bound)\n",
             model->sv_num-1,upsupvecnum);
    }
    if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
      loss=0;
      model_length=0;
      for(i=0;i<totdoc;i++) {
        if((lin[i]-model->b)*(double)label[i] <
           (-learn_parm->eps+(double)label[i]*c[i])-learn_parm->epsilon_crit)
          loss+=-learn_parm->eps+(double)label[i]*c[i]-
            (lin[i]-model->b)*(double)label[i];
        model_length+=a[i]*label[i]*lin[i];
      }
      model_length=sqrt(model_length);
      fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
      fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
      example_length=estimate_sphere(model,kernel_parm);
      fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
              length_of_longest_document_vector(docs,totdoc,kernel_parm));
    }
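    /* In the doubled problem (lin[i]-b)*label[i] is the signed response
       y_i*f(x_i); whenever it falls short of label[i]*c[i]-eps, the
       point lies outside the epsilon-tube and the shortfall is exactly
       the L1 epsilon-insensitive loss accumulated above. The norm
       follows from |w|^2 = sum_i a_i*y_i*lin[i], since
       lin[i] = sum_j a_j*y_j*K(x_i,x_j). */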
    if(verbosity>=1) {
      printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
    }
  }

  if(learn_parm->alphafile[0])
    write_alphas(learn_parm->alphafile,a,label,totdoc);

  /* this makes sure the model we return does not contain pointers to the
     temporary documents */
  for(i=1;i<model->sv_num;i++) {
    j=model->supvec[i]->docnum;
    if(j >= (totdoc/2)) {
      j=totdoc-j-1;
    }
    model->supvec[i]=docs_org[j];
  }
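  /* Example: with 100 original documents (totdoc=200 after doubling),
     a support vector with docnum 150 is the mirrored copy of original
     document 200-150-1=49 and is pointed back at docs_org[49]. */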
  shrink_state_cleanup(&shrink_state);
  for(i=0;i<totdoc;i++)
    free_example(docs[i],0);
  free(docs);
  free(label);
  free(inconsistent);
  free(unlabeled);
  free(c);
  free(a);
  free(a_fullset);
  free(xi_fullset);
  free(lin);
  free(learn_parm->svm_cost);
}
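/* Note that after the remapping above, the model may contain two support
   vectors that point at the same original document, one from each copy.
   Their signed alphas combine to the usual SVR coefficient
   beta_i = alpha_i(+copy) - alpha_i(-copy), so the prediction is the
   familiar f(x) = sum_i beta_i*K(x_i,x) - b. */

/* Learns an SVM ranking model based on the training data in
   docs/rankvalue. The resulting model is returned in the structure
   model. */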
void svm_learn_ranking(DOC **docs, double *rankvalue, long int totdoc,
                       long int totwords, LEARN_PARM *learn_parm,
                       KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache,
                       MODEL *model)
     /* docs:        Training vectors (x-part) */
     /* rankvalue:   Training target values that determine the ranking */
     /* totdoc:      Number of examples in docs/rankvalue */
     /* totwords:    Number of features (i.e. highest feature index) */
     /* learn_parm:  Learning parameters */
     /* kernel_parm: Kernel parameters */
     /* kernel_cache:Initialized pointer to Cache of size 1*totdoc, if
                     using a kernel. NULL if linear. NOTE: Cache is
                     getting reinitialized in this function */
     /* model:       Returns learning result (assumed empty before called) */
{
  DOC **docdiff;
  long i,j,k,totpair,kernel_cache_size;
  double *target,*alpha,cost;
  long *greater,*lesser;
  MODEL *pairmodel;
  SVECTOR *flow,*fhigh;

  totpair=0;
  for(i=0;i<totdoc;i++) {
    for(j=i+1;j<totdoc;j++) {
      if((docs[i]->queryid==docs[j]->queryid)
         && (rankvalue[i] != rankvalue[j])) {
        totpair++;
      }
    }
  }
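  /* Ranking is reduced to classification on pairs: every pair of
     documents from the same query with different rank values becomes
     one training example whose sign says which document should be
     ranked higher. For a query whose rank values are {3,1,2}, all
     three pairs differ, so it contributes 3 of the totpair examples. */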