📄 mysvmclassifier.cpp

📁 本程序是一本很经典的支持向量机书籍的源代码，学起来比较方便，请大家使用。
💻 CPP
📖 第 1 页 / 共 5 页
字号:
    while((!space_or_null((int)line[pos])) && line[pos]) pos++;
    if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
      /* it is the query id */
      (*queryid)=(long)wnum;
    }
    else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
      /* it is the slack id */
      if(wnum > 0) 
	(*slackid)=(long)wnum;
      else {
	perror ("Slack-id must be greater or equal to 1!!!\n"); 
	printf("LINE: %s\n",line);
	exit (1); 
      }
    }
    else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
      /* it is the example-dependent cost factor */
      (*costfactor)=(double)weight;
    }
    else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
      /* it is a regular feature */
      if(wnum<=0) { 
	perror ("Feature numbers must be larger or equal to 1!!!\n"); 
	printf("LINE: %s\n",line);
	exit (1); 
      }
      if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) { 
	perror ("Features must be in increasing order!!!\n"); 
	printf("LINE: %s\n",line);
	exit (1); 
      }
      (words[wpos]).wnum=wnum;
      (words[wpos]).weight=(FVAL)weight; 
      wpos++;
    }
    else {
      perror ("Cannot parse feature/value pair!!!\n"); 
      printf("'%s' in LINE: %s\n",featurepair,line);
      exit (1); 
    }
  }
  (words[wpos]).wnum=0;
  (*numwords)=wpos+1;
  return(1);
}

double* CMySVMClassifier::read_alphas(char *alphafile,long totdoc)
     /* Reads the alpha vector from a file as written by the
        write_alphas function.

        alphafile: path of the text file to read; values are parsed with
                   the "%lf" conversion, so they may be separated by any
                   whitespace.
        totdoc:    number of alpha values that must be present.

        Returns a buffer of totdoc doubles allocated through my_malloc
        (i.e. malloc); the caller owns it. The process is terminated via
        exit(1) if the file cannot be opened or fewer than totdoc values
        can be parsed. Values beyond the first totdoc are ignored. */
{
  FILE *fl;
  double *alpha;
  long dnum;

  if ((fl = fopen (alphafile, "r")) == NULL)
  { perror (alphafile); exit (1); }

  alpha = (double *)my_malloc(sizeof(double)*totdoc);
  if(verbosity>=1) {
    printf("Reading alphas..."); fflush(stdout);
  }
  dnum=0;
  /* The bound is tested BEFORE scanning so that a file holding surplus
     values can never store into alpha[totdoc]. fscanf must report exactly
     one conversion: its EOF result (-1) is non-zero and must not be
     counted as a successfully read value. */
  while((dnum<totdoc) && (fscanf(fl,"%lf\n",&alpha[dnum]) == 1)) {
    dnum++;
  }
  if(dnum != totdoc)
  { perror ("\nNot enough values in alpha file!"); exit (1); }
  fclose(fl);

  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  return(alpha);
}

void CMySVMClassifier::nol_ll(char *file, long int *nol, long int *wol, long int *ll) 
     /* Scans file once and reports size information about it.

        file: path of the file to scan (exit(1) if it cannot be opened)
        nol:  receives 1 + the number of '\n' characters in the file
        wol:  receives the largest per-line count of characters for which
              space_or_null() is true (the terminating '\n' is included)
        ll:   receives the length of the longest line in characters,
              counting its terminating '\n' when present

        A final line that is not terminated by '\n' is taken into account
        for wol and ll as well, so both values are valid upper bounds for
        every line of the file. */
{
  FILE *fl;
  int ic;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    current_length++;
    /* Pass the value from getc unchanged: it is already in the
       unsigned-char range, whereas narrowing it to a (possibly signed)
       char first would hand negative values to the character test. */
    if(space_or_null(ic)) {
      current_wol++;
    }
    if(ic == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
	(*ll)=current_length;
      }
      if(current_wol>(*wol)) {
	(*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* Fold in a trailing line that ended at EOF without a newline; when the
     file ends with '\n' both counters are zero and nothing changes. */
  if(current_length>(*ll)) {
    (*ll)=current_length;
  }
  if(current_wol>(*wol)) {
    (*wol)=current_wol;
  }
  fclose(fl);
}

long CMySVMClassifier::minl(long int a, long int b)
     /* Returns the smaller of a and b (b when they are equal). */
{
  return((a<b) ? a : b);
}

long CMySVMClassifier::maxl(long int a, long int b)
     /* Returns the larger of a and b (b when they are equal). */
{
  return((a>b) ? a : b);
}

long CMySVMClassifier::get_runtime(void)
     /* Returns the value of clock() converted from clock ticks to
        hundredths of a second, truncated to long. */
{
  const double ticks = (double)clock();
  return((long)(ticks*100.0/(double)CLOCKS_PER_SEC));
}


//# ifdef MICROSOFT

int CMySVMClassifier::isnan(double a)
     /* Returns non-zero if a is a NaN and zero otherwise. */
{
#if defined(_MSC_VER)
  return(_isnan(a));
#else
  /* _isnan is a Microsoft runtime extension and is unavailable on other
     toolchains; there, rely on NaN being the only value that compares
     unequal to itself under IEEE 754 arithmetic. */
  return(a != a);
#endif
}

//# endif

int CMySVMClassifier::space_or_null(int c) {
  /* Returns non-zero if c is the NUL character or a whitespace character
     according to isspace(), and zero otherwise. */
  if (c==0)
    return 1;
  /* isspace() is only defined for EOF and values representable as
     unsigned char. Callers pass plain char values widened to int, which
     are negative for bytes >= 0x80 where char is signed, so map the
     argument into the valid range first. */
  return isspace((unsigned char)c);
}

void* CMySVMClassifier::my_malloc(size_t size)
     /* malloc wrapper that terminates the process with exit(1) instead of
        returning NULL.

        size: number of bytes requested; a request for 0 bytes is served
              as a 1-byte allocation.

        Returns a pointer obtained from malloc, to be released with
        free(). */
{
  void *ptr;
  /* malloc(0) is permitted to return NULL, which would be misreported as
     an out-of-memory condition below; always request at least one byte. */
  if(size == 0) {
    size=1;
  }
  ptr=(void *)malloc(size);
  if(!ptr) { 
    perror ("Out of memory!\n"); 
    exit (1); 
  }
  return(ptr);
}

void CMySVMClassifier::copyright_notice(void)
     /* Prints the author's copyright and usage notice to stdout. */
{
  /* Adjacent string literals are concatenated by the compiler, so the
     whole notice is written with one call. */
  printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n"
         "This software is available for non-commercial use only. It must not\n"
         "be modified and distributed without prior permission of the author.\n"
         "The author is not responsible for implications from the use of this\n"
         "software.\n\n");
}

//SVM LEARNING FUNCTIONS
void CMySVMClassifier::svm_learn_classification(DOC **docs, double *classes, long int
			      totdoc, long int totwords, 
			      LEARN_PARM *learn_parm, 
			      KERNEL_PARM *kernel_parm, 
			      KERNEL_CACHE *kernel_cache, 
			      MODEL *model,
			      double *alpha)
     /* docs:        Training vectors (x-part) */
     /* class:       Training labels (y-part, zero if test example for
                     transduction) */
     /* totdoc:      Number of examples in docs/label */
     /* totwords:    Number of features (i.e. highest feature index) */
     /* learn_parm:  Learning paramenters */
     /* kernel_parm: Kernel paramenters */
     /* kernel_cache:Initialized Cache of size totdoc, if using a kernel. 
                     NULL if linear.*/
     /* model:       Returns learning result (assumed empty before called) */
     /* alpha:       Start values for the alpha variables or NULL
	             pointer. The new alpha values are returned after 
		     optimization if not NULL. Array must be of size totdoc. */
{
  long *inconsistent,i,*label;
  long inconsistentnum;
  long misclassified,upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations;
  long *unlabeled,transduction;
  long heldout;
  long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0;
  long loocomputed=0,runtime_start_loo=0,runtime_start_xa=0;
  double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg;
  long *index,*index2dnum;
  double *weights;
  CFLOAT *aicache;  /* buffer to keep one row of hessian */

  double *xi_fullset; /* buffer for storing xi on full sample in loo */
  double *a_fullset;  /* buffer for storing alpha on full sample in loo */
  TIMING timing_profile;
  SHRINK_STATE shrink_state;

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2) 
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }

  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  label = (long *)my_malloc(sizeof(long)*totdoc);
  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  c = (double *)my_malloc(sizeof(double)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

  model->at_upper_bound=0;
  model->b=0;	       
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;
  inconsistentnum=0;
  transduction=0;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1) 
      printf("Setting default regularization parameter C=%.4f\n",
	     learn_parm->svm_c);
  }

  learn_parm->eps=-1.0;      /* equivalent regression epsilon for
				classification */

  for(i=0;i<totdoc;i++) {    /* various inits */
    docs[i]->docnum=i;
    inconsistent[i]=0;
    a[i]=0;
    lin[i]=0;
    c[i]=0.0;
    unlabeled[i]=0;
    if(classes[i] == 0) {
      unlabeled[i]=1;
      label[i]=0;
      transduction=1;
    }
    if(classes[i] > 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
	docs[i]->costfactor;
      label[i]=1;
      trainpos++;
    }
    else if(classes[i] < 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
      label[i]=-1;
      trainneg++;
    }
    else {
      learn_parm->svm_cost[i]=0;
    }
  }
  if(verbosity>=2) {
    printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout);
  }

  /* caching makes no sense for linear kernel */
  if(kernel_parm->kernel_type == LINEAR) {
    kernel_cache = NULL;   
  } 

  /* compute starting state for initial alpha values */
  if(alpha) {
    if(verbosity>=1) {
      printf("Computing starting state..."); fflush(stdout);
    }
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    weights=(double *)my_malloc(sizeof(double)*(totwords+1));
    aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
    for(i=0;i<totdoc;i++) {    /* create full index and clip alphas */
      index[i]=1;
      alpha[i]=fabs(alpha[i]);
      if(alpha[i]<0) alpha[i]=0;
      if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
    }
    if(kernel_parm->kernel_type != LINEAR) {
      for(i=0;i<totdoc;i++)     /* fill kernel cache with unbounded SV */
	if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i]) 
	   && (kernel_cache_space_available(kernel_cache))) 
	  cache_kernel_row(kernel_cache,docs,i,kernel_parm);
      for(i=0;i<totdoc;i++)     /* fill rest of kernel cache with bounded SV */
	if((alpha[i]==learn_parm->svm_cost[i]) 
	   && (kernel_cache_space_available(kernel_cache))) 
	  cache_kernel_row(kernel_cache,docs,i,kernel_parm);
    }
    (void)compute_index(index,totdoc,index2dnum);
    update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
			    totwords,kernel_parm,kernel_cache,lin,aicache,
			    weights);
    (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
			      learn_parm,index2dnum,index2dnum,model);
    for(i=0;i<totdoc;i++) {    /* copy initial alphas */
      a[i]=alpha[i];
    }
    free(index);
    free(index2dnum);
    free(weights);
    free(aicache);
    if(verbosity>=1) {
      printf("done.\n");  fflush(stdout);
    }   
  } 

  if(transduction) {
    learn_parm->svm_iter_to_shrink=99999999;
    if(verbosity >= 1)
      printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n");
  }

  if(transduction && learn_parm->compute_loo) {
    learn_parm->compute_loo=0;
    if(verbosity >= 1)
      printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n");
  }    

  if(learn_parm->remove_inconsistent && learn_parm->compute_loo) {
    learn_parm->compute_loo=0;
    printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n");
  }    

  if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) {
    learn_parm->compute_loo=0;
    printf("\nCannot compute leave-one-out with only one example in one class.\n\n");
  }    


  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }

  /* train the svm */
  iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
				     kernel_parm,kernel_cache,&shrink_state,model,
				     inconsistent,unlabeled,a,lin,
				     c,&timing_profile,
				     &maxdiff,(long)-1,
				     (long)1);
  
  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

    misclassified=0;
    for(i=0;(i<totdoc);i++) { /* get final statistic */
      if((lin[i]-model->b)*(double)label[i] <= 0.0) 
	misclassified++;
    }

    printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n",
	   misclassified,maxdiff); 

    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
        ((float)runtime_end-(float)runtime_start)/100.0,
        (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
	(100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
	(100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
        (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
	     (runtime_end-runtime_start)/100.0);
    }
💿 文件大小 41 K
👤 上传用户 liangshuo800
📂 所属分类 VC书籍
🏷️ 相关标签

#程序 #源代码 #比较 #家
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -