📄 svm_learn.c
printf("Constructing %ld rank constraints...",totpair); fflush(stdout);
docdiff=(DOC **)my_malloc(sizeof(DOC)*totpair);
target=(double *)my_malloc(sizeof(double)*totpair);
greater=(long *)my_malloc(sizeof(long)*totpair);
lesser=(long *)my_malloc(sizeof(long)*totpair);
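  /* Build one classification example per pair of documents (i,j) that
     belongs to the same query but has different rank values. For the
     linear kernel the difference vector x_i-x_j is formed explicitly;
     for nonlinear kernels it is kept implicit as a two-element SVECTOR
     list with factors +1 and -1, which the kernel routines expand to
     K(x_i,.)-K(x_j,.). */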
  k=0;
  for(i=0;i<totdoc;i++) {
    for(j=i+1;j<totdoc;j++) {
      if(docs[i]->queryid == docs[j]->queryid) {
        cost=(docs[i]->costfactor+docs[j]->costfactor)/2.0;
        if(rankvalue[i] > rankvalue[j]) {
          if(kernel_parm->kernel_type == LINEAR)
            docdiff[k]=create_example(k,0,0,cost,
                                      sub_ss(docs[i]->fvec,docs[j]->fvec));
          else {
            flow=copy_svector(docs[j]->fvec);
            flow->factor=-1.0;
            flow->next=NULL;
            fhigh=copy_svector(docs[i]->fvec);
            fhigh->factor=1.0;
            fhigh->next=flow;
            docdiff[k]=create_example(k,0,0,cost,fhigh);
          }
          target[k]=1;
          greater[k]=i;
          lesser[k]=j;
          k++;
        }
        else if(rankvalue[i] < rankvalue[j]) {
          if(kernel_parm->kernel_type == LINEAR)
            docdiff[k]=create_example(k,0,0,cost,
                                      sub_ss(docs[i]->fvec,docs[j]->fvec));
          else {
            flow=copy_svector(docs[j]->fvec);
            flow->factor=-1.0;
            flow->next=NULL;
            fhigh=copy_svector(docs[i]->fvec);
            fhigh->factor=1.0;
            fhigh->next=flow;
            docdiff[k]=create_example(k,0,0,cost,fhigh);
          }
          target[k]=-1;
          greater[k]=i;
          lesser[k]=j;
          k++;
        }
      }
    }
  }
printf("done.\n"); fflush(stdout);
/* need to get a bigger kernel cache */
if(*kernel_cache) {
kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
kernel_cache_cleanup(*kernel_cache);
(*kernel_cache)=kernel_cache_init(totpair,kernel_cache_size);
}
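  /* Note: the rebuilt cache must index totpair difference vectors rather
     than the totdoc original documents; the memory budget stays the same
     (buffsize counts CFLOAT entries and is converted back to MB above). */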
  /* must use unbiased hyperplane on difference vectors */
  learn_parm->biased_hyperplane=0;

  pairmodel=(MODEL *)my_malloc(sizeof(MODEL));
  svm_learn_classification(docdiff,target,totpair,totwords,learn_parm,
                           kernel_parm,(*kernel_cache),pairmodel,NULL);

  /* Transfer the result into a more compact model. If you would like
     to output the original model on pairs of documents, see below. */
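  /* Each support vector of pairmodel is a difference vector
     x_greater-x_lesser, so the weight vector is
         w = sum_k alpha_k (x_greater[k] - x_lesser[k]),
     where pairmodel->alpha[k] already carries the label sign (SVM-light
     stores alpha_i*y_i). Collecting the coefficients per original
     document below yields an equivalent, more compact SV expansion over
     the documents themselves. */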
  alpha=(double *)my_malloc(sizeof(double)*totdoc);
  for(i=0;i<totdoc;i++) {
    alpha[i]=0;
  }
  for(i=1;i<pairmodel->sv_num;i++) {
    alpha[lesser[(pairmodel->supvec[i])->docnum]]-=pairmodel->alpha[i];
    alpha[greater[(pairmodel->supvec[i])->docnum]]+=pairmodel->alpha[i];
  }
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->sv_num=1;
  for(i=0;i<totdoc;i++) {
    if(alpha[i]) {
      model->supvec[model->sv_num]=docs[i];
      model->alpha[model->sv_num]=alpha[i];
      model->index[i]=model->sv_num;
      model->sv_num++;
    }
    else {
      model->index[i]=-1;
    }
  }
  model->at_upper_bound=0;
  model->b=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;

  free(alpha);
  free(greater);
  free(lesser);
  free(target);

  /* If you would like to output the original model on pairs of
     documents, replace the following lines with '(*model)=(*pairmodel);' */
  for(i=0;i<totpair;i++)
    free_example(docdiff[i],1);
  free(docdiff);
  free_model(pairmodel,0);
}
/* The following solves a freely defined and given set of
   inequalities. The optimization problem is of the following form:

       min  0.5 w*w + C sum_i C_i \xi_i
       s.t. x_i * w > rhs_i - \xi_i

   This corresponds to the '-z o' option. */
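/* A minimal calling sketch (illustration only, not part of the original
   file). It assumes docs[] and rhs[] were already built, e.g. with
   create_example(), and that the remaining LEARN_PARM/KERNEL_PARM fields
   are filled the way svm_learn_main.c fills them. With a linear kernel
   the cache argument may be NULL, and svm_c=0.0 selects the default C
   computed below:

     LEARN_PARM learn_parm;
     KERNEL_PARM kernel_parm;
     MODEL *model=(MODEL *)my_malloc(sizeof(MODEL));
     ...
     kernel_parm.kernel_type=LINEAR;
     learn_parm.svm_c=0.0;
     svm_learn_optimization(docs,rhs,totdoc,totwords,&learn_parm,
                            &kernel_parm,NULL,model,NULL);
*/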
void svm_learn_optimization(DOC **docs, double *rhs,
                            long int totdoc, long int totwords,
                            LEARN_PARM *learn_parm,
                            KERNEL_PARM *kernel_parm,
                            KERNEL_CACHE *kernel_cache, MODEL *model,
                            double *alpha)
     /* docs:        Left-hand side of inequalities (x-part) */
     /* rhs:         Right-hand side of inequalities */
     /* totdoc:      Number of inequalities in docs/rhs */
     /* totwords:    Number of features (i.e. highest feature index) */
     /* learn_parm:  Learning parameters */
     /* kernel_parm: Kernel parameters */
     /* kernel_cache:Initialized cache of size 1*totdoc, if using a kernel.
                     NULL if linear. */
     /* model:       Returns solution as SV expansion (assumed empty before called) */
     /* alpha:       Start values for the alpha variables or NULL
                     pointer. The new alpha values are returned after
                     optimization if not NULL. Array must be of size totdoc. */
{
  long i,*label;
  long misclassified,upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations,maxslackid,svsetnum;
  long *unlabeled,*inconsistent;
  double r_delta_sq=0,r_delta,r_delta_avg;
  long *index,*index2dnum;
  double *weights,*slack,*alphaslack;
  CFLOAT *aicache;  /* buffer to keep one row of hessian */
  TIMING timing_profile;
  SHRINK_STATE shrink_state;

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2)
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }
  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  label = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  c = (double *)my_malloc(sizeof(double)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

  model->at_upper_bound=0;
  model->b=0;
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1)
      printf("Setting default regularization parameter C=%.4f\n",
             learn_parm->svm_c);
  }
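  /* The default C = 1/r_delta_avg^2 matches SVM-light's documented '-c'
     default of [avg. x*x]^-1: r_delta_avg estimates the average length
     of the examples in feature space, so the default scales with the
     data rather than being a fixed constant. */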
  learn_parm->biased_hyperplane=0;  /* learn an unbiased hyperplane */
  learn_parm->eps=0.0;              /* no margin, unless explicitly handcoded
                                       in the right-hand side of the
                                       training set */

  for(i=0;i<totdoc;i++) {  /* various inits */
    docs[i]->docnum=i;
    a[i]=0;
    lin[i]=0;
    c[i]=rhs[i];           /* set right-hand side */
    unlabeled[i]=0;
    inconsistent[i]=0;
    learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
                            docs[i]->costfactor;
    label[i]=1;
  }
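  /* A slackid of 0 means 'no shared slack assigned'; when shared slacks
     are requested, every example therefore has to carry a positive
     slackid, or the constraint groups would be ill-defined. */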
  if(learn_parm->sharedslack)  /* if shared slacks are used, they must */
    for(i=0;i<totdoc;i++)      /* be used on every constraint */
      if(!docs[i]->slackid) {
        fprintf(stderr,"Error: Missing shared slack definitions in some of the examples.\n");
        exit(1);
      }
  /* compute starting state for initial alpha values */
  if(alpha) {
    if(verbosity>=1) {
      printf("Computing starting state..."); fflush(stdout);
    }
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    weights = (double *)my_malloc(sizeof(double)*(totwords+1));
    aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
    for(i=0;i<totdoc;i++) {  /* create full index and clip alphas */
      index[i]=1;
      alpha[i]=fabs(alpha[i]);  /* fabs() already guarantees alpha[i]>=0 */
      if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
    }
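    /* Warm-start the cache with the rows of unbounded SVs first, since
       the optimizer accesses these most often; bounded SVs (alpha at the
       cost bound) only fill whatever cache space remains. */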
    if(kernel_parm->kernel_type != LINEAR) {
      for(i=0;i<totdoc;i++)  /* fill kernel cache with unbounded SV */
        if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
      for(i=0;i<totdoc;i++)  /* fill rest of kernel cache with bounded SV */
        if((alpha[i]==learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
    }
    (void)compute_index(index,totdoc,index2dnum);
    update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
                            totwords,kernel_parm,kernel_cache,lin,aicache,
                            weights);
    (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
                              learn_parm,index2dnum,index2dnum,model);
    for(i=0;i<totdoc;i++) {  /* copy initial alphas */
      a[i]=alpha[i];
    }
    free(index);
    free(index2dnum);
    free(weights);
    free(aicache);
    if(verbosity>=1) {
      printf("done.\n"); fflush(stdout);
    }
  }
  /* removing inconsistent examples does not work for the general
     optimization problem */
  if(learn_parm->remove_inconsistent) {
    learn_parm->remove_inconsistent = 0;
    printf("'remove inconsistent' not available in this mode. Switching option off!\n");
    fflush(stdout);
  }

  /* caching makes no sense for the linear kernel */
  if(kernel_parm->kernel_type == LINEAR) {
    kernel_cache = NULL;
  }

  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }
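  /* With shared slacks, all constraints that carry the same slackid share
     a single slack variable, so only the largest violation within each
     group is penalized; otherwise every inequality gets its own \xi_i. */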
  /* train the svm */
  if(learn_parm->sharedslack)
    iterations=optimize_to_convergence_sharedslack(docs,label,totdoc,
                                totwords,learn_parm,kernel_parm,
                                kernel_cache,&shrink_state,model,
                                a,lin,c,&timing_profile,
                                &maxdiff);
  else
    iterations=optimize_to_convergence(docs,label,totdoc,
                                totwords,learn_parm,kernel_parm,
                                kernel_cache,&shrink_state,model,
                                inconsistent,unlabeled,
                                a,lin,c,&timing_profile,
                                &maxdiff,(long)-1,(long)1);
  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

    misclassified=0;
    for(i=0;(i<totdoc);i++) {  /* get final statistic */
      if((lin[i]-model->b)*(double)label[i] <= 0.0)
        misclassified++;
    }

    printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);

    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for shrinking/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
        ((float)runtime_end-(float)runtime_start)/100.0,
        (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
             (runtime_end-runtime_start)/100.0);
    }
  }
  if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
    loss=0;
    model_length=0;
    for(i=0;i<totdoc;i++) {
      if((lin[i]-model->b)*(double)label[i] < c[i]-learn_parm->epsilon_crit)
        loss+=c[i]-(lin[i]-model->b)*(double)label[i];
      model_length+=a[i]*label[i]*lin[i];
    }
    model_length=sqrt(model_length);
    fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
  }
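  /* The loop above accumulates sum_i a_i y_i lin[i]
     = sum_i a_i y_i (w*x_i) = w*w, since w = sum_j a_j y_j x_j and
     lin[i] = w*x_i; its square root is |w|. */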
  if(learn_parm->sharedslack) {
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    maxslackid=0;
    for(i=0;i<totdoc;i++) {  /* create full index */
      index[i]=1;
      if(maxslackid<docs[i]->slackid)
        maxslackid=docs[i]->slackid;
    }