📄 svm_base.c
字号:
/* Tail of the doc-array builder above (its head lies before this chunk):
   finishes copying wv_tmp1's entries into wv_tmp, applies the configured TF
   transform, normalizes the weights by summing, stores the vector in docs[],
   and returns the number of docs collected. */
wv_tmp->entry[j].wi = wv_tmp1->entry[j].wi;
      wv_tmp->entry[j].count = wv_tmp1->entry[j].count;
    }
    tf_transform(wv_tmp);
    bow_wv_normalize_weights_by_summing(wv_tmp);
    docs[ndocs] = wv_tmp;
  }
  return ndocs;
}

/* C sucks - this is just a fn to pass to bow_heap_next_wv */
static int silly_currying_global_v1, silly_currying_global_v2;

/* Selector passed to make_doc_array: returns 1 iff cdoc is a training
   document whose class is one of the two classes of the current pairwise
   sub-model (the class pair is "curried" through the two file-scope
   globals above), else 0. */
int use_train_and_submodel(bow_cdoc *cdoc) {
  return ((cdoc->type == bow_doc_train
           && (silly_currying_global_v1 == cdoc->class
               || silly_currying_global_v2 == cdoc->class)) ? 1 : 0);
}

/* Selector: returns 1 iff the bit for cdoc's doc type is set in the
   transduce_class bitmask, i.e. the doc should be used as "unlabeled"
   data for transduction; else 0. */
int use_transduction_docs(bow_cdoc *cdoc) {
  return (((1 << cdoc->type) & transduce_class) ? 1 : 0);
}

/* helper fn for adding the data for a training example to the barrel */
/* Appends the support vectors of sub-model model_no to new_barrel as
   "meta documents": Lagrange multipliers in weights[], labels in yvect[],
   source-barrel doc indices in tdocs[], bias b, nsv support vectors in
   total.  Positive and negative SVs are packed into separate word
   vectors, flushed into the barrel whenever one fills up.  Returns the
   number of meta-documents added. */
int add_sv_barrel(bow_barrel *new_barrel,double *weights, int *yvect,
                  int *tdocs, double b, int model_no, int nsv) {
  bow_cdoc cdoc_pos, cdoc_neg;
  bow_wv *dummy_wv_neg;
  bow_wv *dummy_wv_pos;
  int n_meta_docs=0;
  int ni, pi, i, j, num_words;

  num_words = bow_num_words();
  dummy_wv_pos = bow_wv_new(num_words);
  dummy_wv_neg = bow_wv_new(num_words);
  dummy_wv_pos->num_entries = dummy_wv_neg->num_entries = 0;
  cdoc_pos.type = bow_doc_ignore;
  /* NOTE(review): this mixes the pos/neg structs - presumably it was
     meant to read cdoc_pos.normalizer = cdoc_pos.prior = 0.0; as written,
     cdoc_pos.normalizer stays uninitialized until the "= b" below.
     Confirm against upstream libbow before changing. */
  cdoc_neg.normalizer = cdoc_pos.prior = 0.0;
  cdoc_pos.filename = NULL;
  cdoc_pos.class_probs = NULL;
  cdoc_pos.class = 0;
  cdoc_neg.type = bow_doc_ignore;
  cdoc_neg.normalizer = cdoc_neg.prior = 0.0;
  cdoc_neg.filename = NULL;
  cdoc_neg.class_probs = NULL;
  cdoc_neg.class = 0;
  if (model_no == 0) {
    /* insert two empty docs into the barrel so that the
       ancillary data has a place to live */
    cdoc_neg.word_count = 0;
    bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
    bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
    n_meta_docs = 2;
  }
  /* the model's bias term rides along in the positive doc's normalizer */
  cdoc_pos.normalizer = b;
  cdoc_pos.class = map_y_to_class(model_no,(int) 1);
  cdoc_neg.class = map_y_to_class(model_no,(int) -1);
  ni = pi = 0;
  /* i walks all candidate docs; j counts SVs consumed (loop ends at nsv);
     entries with multiplier <= svm_epsilon_a are not support vectors */
  for (i=j=0; j<nsv; i++) {
    if (weights[i] > svm_epsilon_a) {
      if (yvect[i] > 0) {
        /* positive word vector is full - flush it as a meta-doc.
           NOTE(review): looks off-by-one - entry[] holds num_words slots,
           so the guard should probably be >= rather than >; confirm. */
        if (pi > num_words) {
          dummy_wv_pos->num_entries = pi;
          cdoc_pos.word_count = pi;
          bow_barrel_add_document(new_barrel, &cdoc_pos, dummy_wv_pos);
          pi = 0;
          n_meta_docs++;
        }
        dummy_wv_pos->entry[pi].weight = (float) weights[i];
        /* count stores the 1-based index of the SV's source document */
        dummy_wv_pos->entry[pi].count = tdocs[i] + 1;
        dummy_wv_pos->entry[pi].wi = pi;
        pi++;
      } else {
        /* negative word vector is full - flush it.
           NOTE(review): two likely copy/paste bugs in this flush path -
           num_entries is set from pi (should presumably be ni) and the
           flushed vector is dummy_wv_pos (should presumably be
           dummy_wv_neg); mirror the positive branch above.  Left as-is
           here; confirm against upstream libbow. */
        if (ni > num_words) {
          dummy_wv_neg->num_entries = pi;
          cdoc_neg.word_count = ni;
          bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
          ni = 0;
          n_meta_docs++;
        }
        dummy_wv_neg->entry[ni].weight = (float) weights[i];
        dummy_wv_neg->entry[ni].count = tdocs[i] + 1;
        dummy_wv_neg->entry[ni].wi = ni;
        ni++;
      }
      j++;
    }
  }
  /* flush whatever remains in the positive and negative vectors */
  cdoc_pos.word_count = pi;
  dummy_wv_pos->num_entries = pi;
  bow_barrel_add_document(new_barrel, &cdoc_pos, dummy_wv_pos);
  cdoc_neg.word_count = ni;
  dummy_wv_neg->num_entries = ni;
  bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_neg);
  bow_wv_free(dummy_wv_pos);
  bow_wv_free(dummy_wv_neg);
  /* +2 accounts for the two final flushes just above */
  return (n_meta_docs+2);
}

/* Builds the vector-per-class barrel from src_barrel by training one SVM
   sub-model per class (one-vs-rest) or per class pair (pairwise voting),
   then storing each model's support vectors into the new barrel via
   add_sv_barrel.  Returns the new class barrel, or NULL on error. */
bow_barrel *svm_vpc_merge(bow_barrel *src_barrel) {
  double b;
  int cto;                 /* for pairwise - works with npass */
  bow_wv **docs;           /* a doc major matrix */
  int max_nsv;             /* highest # of nsv's in a submodel */
  int mdocs;               /* the number of docs in the current submodel */
  bow_wv **model_weights;
  int n_meta_docs;         /* # of documents that will go into the class barrel
                              before the weight vectors will */
  int nclasses;
  int ndocs;               /* total # of documents to be trained & transduced */
  int ntrain;              /* # of documents to be trained upon */
  int ntrans;              /* # of "unlabeled" docs to use in transduction */
  bow_barrel *class_barrel;
  int nloops;              /* # of the current submodel being built */
  int npass;               /* tmp for making submodels from the src_barrel */
  int nsv;                 /* # of support vectors for the current model */
  int num_words;
  bow_wv **sub_docs;
  int *tdocs;              /* trans table of indices in docs to indices
                              in the original barrel */
  int total_docs;          /* total # of docs (some not for training) */
  int *utdocs;             /* trans table of the docs in our training set
                              to those actually used in the models */
  float *weight_vect;
  double *weights;         /* lagrange
multipliers */
  bow_wv **W;              /* hyperplane for lin. folding */
  int *yvect;
  int i,j;

#ifndef HAVE_LOQO
  /* SMO is the only solver compiled in; warn if another one was requested */
  if (svm_use_smo != 1) {
    fprintf(stderr,"Can only use SMO, no other solvers were built,\n"
            "rebuild libbow with pr_loqo to use another algorithm.\n");
  }
#endif
#ifdef HAVE_FPSETMASK
  /* mask all FP exceptions so kernel arithmetic cannot trap */
  fpsetmask(~(FP_X_INV | FP_X_DNML | FP_X_DZ | FP_X_OFL | FP_X_UFL | FP_X_IMP));
#endif

  total_docs = src_barrel->cdocs->length;
  nclasses = bow_barrel_num_classes(src_barrel);
  weight_vect = NULL;
  model_weights = NULL;
  W = NULL;
  yvect = NULL;

  /* note - this OVER allocates - uses ALL, instead of just those for training */
  docs = (bow_wv **) alloca(sizeof(bow_wv *)*total_docs);
  tdocs = (int *) alloca(sizeof(int)*(total_docs+1));
  mdocs = 0; /* to shut gcc up */
  nsv = 0;

  if (nclasses == 1) {
    fprintf(stderr, "Cannot build SVM with only 1 class.\n");
    fflush(stderr);
    return NULL;
  } else if (nclasses == 2) {
    /* with exactly two classes, pairwise and one-vs-rest coincide */
    if (svm_kernel_type != FISHER) {
      vote_type = PAIRWISE;
    }
  }

  if (weight_type && svm_kernel_type == FISHER) {
    /* the Fisher kernel supplies its own weighting - fall back to raw counts */
    weight_type = 0;
    tf_transform_type = RAW;
  }

  /* decide whether feature weights are computed once per sub-model, once
     for the whole barrel, or not at all */
  if ((weight_type && vote_type == PAIRWISE) || weight_type == INFOGAIN) {
    svm_weight_style = WEIGHTS_PER_MODEL;
  } else if (weight_type) {
    svm_weight_style = WEIGHTS_PER_BARREL;
  } else {
    svm_weight_style = NO_WEIGHTS;
  }

  if (svm_weight_style != WEIGHTS_PER_MODEL) {
    ntrain = make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    if (ntrain < 2) {
      if (ntrain)
        bow_wv_free(docs[0]);
      fprintf(stderr, "Cannot build svm with less than 2 documents\n");
      fflush(stderr);
      return NULL;
    }
    /* append these trans docs to the arrays that were filled in above */
    ntrans = make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                            use_transduction_docs);
    ndocs = ntrain + ntrans;
    utdocs = (int *) alloca(sizeof(int)*ndocs);
    for (i=0; i<ndocs; i++) {
      utdocs[i] = i;
    }
    sub_docs = docs;
    mdocs = ndocs;
    svm_set_barrel_weights(docs, NULL, ndocs, &weight_vect);
    kcache_init(ndocs);
  } else {
    /* the ndocs value is the number of training documents that will
       actually be used - this is done now JUST to fill up the tdocs array. */
    ntrain = make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    if (ntrain < 2) {
      if (ntrain)
        bow_wv_free(docs[0]);
      fprintf(stderr, "Cannot build svm with less than 2 documents\n");
      fflush(stderr);
      return NULL;
    }
    /* figure out the # of ntrans */
    ntrans = make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                            use_transduction_docs);
    ndocs = ntrain + ntrans;
    /* since we don't need the docs for a while, free them */
    for (i=0; i<ndocs; i++) {
      bow_wv_free(docs[i]);
    }
    model_weights = (bow_wv **) malloc(sizeof(bow_wv *)*nclasses);
    utdocs = (int *) alloca(sizeof(int)*ndocs);
    /* the sub_docs vector will be rewritten with wv's to be used each iteration */
    sub_docs = alloca(sizeof(bow_wv *)*ndocs);
  }

  /* build the naive bayes model for the kernel... */
  if (svm_kernel_type == FISHER) {
    /* this isn't too bad since the cache REALLY should be large enough
       to hold everything anyway (the cache doesn't get flushed) */
    if (vote_type == PAIRWISE) {
      fprintf(stderr, "Fisher kernel not implemented for pairwise models yet.\n");
      return NULL;
    }
    svm_setup_fisher(src_barrel,docs,nclasses,ndocs);
    weight_type = 0;
  }

  weights = (double *) alloca(sizeof(double)*ndocs);
  yvect = (int *) alloca(sizeof(int)*ndocs);

  /* put together the resultant barrel */
  class_barrel = bow_barrel_new(src_barrel->wi2dvf->size, 2, sizeof(bow_cdoc),
                                src_barrel->cdocs->free_func);
  class_barrel->method = src_barrel->method;
  class_barrel->is_vpc = 1;

  /* make a temp word array big enough to fill a whole strip of the wi2dvf table */
  num_words = bow_num_words();
  n_meta_docs = 0;

  /* this is the beginning of the for loop */
  max_nsv = -1;
  nloops = 0;
  npass = 0;
  if (svm_kernel_type == 0) {
    /* linear kernel: reserve one folded hyperplane per sub-model */
    if (vote_type == PAIRWISE) {
      W = (bow_wv **) malloc(sizeof(bow_wv *)*(nclasses-1)*nclasses/2);
    } else {
      W = (bow_wv **) malloc(sizeof(bow_wv *)*nclasses);
    }
  }

  for (npass=0, cto=1; 1; ) {
    /* initialize & pull together the classes for the npass'th model... */
    if (vote_type == PAIRWISE) {
      /* advance (npass,cto) through all unordered class pairs; terminate
         once every pair has been visited */
      if (cto == nclasses) {
        npass ++;
        if (npass == nclasses-1) {
          break;
        }
        cto = npass+1;
      }
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        silly_currying_global_v1 = npass;
        silly_currying_global_v2 = cto;
        /* this gets called here since the doctype labels are in the barrel */
        /* utdocs is filled with actual indices, not indices of the train set */
        mdocs = make_doc_array(src_barrel, sub_docs, utdocs,
                               use_train_and_submodel);
        /* put the labels in for the labeled docs. */
        for (i=0; i<mdocs; i++) {
          bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,utdocs[i]));
          yvect[i] = map_class_to_y(npass, cdoc->class);
        }
        /* even though this set of docs is always the same (since all of the
           unlabeled data is used for each pairwise document [this is not
           suggested to with a barrel w/ more than 2 classes] is used) we
           still grab it, since the starting position for the unlabeled data
           isn't known beforehand (its a slight hack) */
        ntrans = make_doc_array(src_barrel, &(sub_docs[mdocs]),
                                &(utdocs[mdocs]), use_transduction_docs);
        /* this says that it is unlabelled */
        for (i=0; i<ntrans; i++) {
          yvect[i+mdocs] = 0;
        }
        mdocs = mdocs + ntrans;
        /* utdocs holds the barrel indices we're interested in the sub-model
           indices - so we need to remap utdocs */
        for (i=j=0; j<mdocs; i++) {
          if (tdocs[i] < utdocs[j]) {
            continue;
          } else {
            utdocs[j] = i;
            j++;
          }
        }
      } else {
        /* reuse the docs[] loaded up front: select this pair's training
           docs, then append every transduction doc with label 0 */
        for (i=j=0; i<ntrain; i++) {
          bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,tdocs[i]));
          if ((cdoc->class == npass) || (cdoc->class == cto)) {
            sub_docs[j] = docs[i];
            yvect[j] = map_class_to_y(npass, cdoc->class);
            utdocs[j] = i;
            j++;
          }
        }
        for (i=0; i<ntrans; j++,i++) {
          sub_docs[j] = docs[i+ntrain];
          utdocs[j] = i+ntrain;
          yvect[j] = 0;
        }
        mdocs = j;
      }
    } else {
      /* one-vs-rest: one pass per class */
      if (npass == nclasses) {
        break;
      }
      /* all docs should be included - the yvect will do the proper mapping */
      for (i=0; i<ntrain; i++) {
        bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,tdocs[i]));
        /* this map will be extended to make the barrel handle more than 2 classes */
        yvect[i] = map_class_to_y(npass, cdoc->class);
      }
      for (i=0; i<ntrans; i++) {
        yvect[i+ntrain] = 0;
      }
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        for (i=0; i<mdocs; i++) {
          /* the weight values are not correct - they include the last values */
          /* make_doc_array does this for pairwise voting */
          tf_transform(docs[i]);
        }
      }
    }

    if (svm_weight_style == WEIGHTS_PER_MODEL) {
      /* compute per-model feature weights and keep only the nonzero ones
         as a sparse word vector for later classification */
      svm_set_barrel_weights(sub_docs, yvect, mdocs, &weight_vect);
      model_weights[nloops] = bow_wv_new(num_words);
      for (i=j=0; i<num_words; i++) {
        if (weight_vect[i] != 0.0) {
          model_weights[nloops]->entry[j].wi = i;
          model_weights[nloops]->entry[j].count = 1;
          model_weights[nloops]->entry[j].weight = weight_vect[i];
          j++;
        }
      }
      free(weight_vect);
      model_weights[nloops]->num_entries = j;
    }

    if (mdocs < 2) {
      bow_error("Cannot create SVM with only 1 document!\n");
    }

    fprintf(stderr,"Learning %dth model\n",nloops);

    if (svml_basename) {
      /* dump the training set in SVM-light file format instead of
         training a model ourselves */
      char *tmp;
      FILE *f = stdout;
      tmp = malloc(sizeof(char)*(20+strlen(svml_basename)));
      sprintf(tmp,"train_%d_%s",nloops,svml_basename);
      f = fopen (tmp, "w");
      for (i=0; i<mdocs; i++) {
        fprintf(f,"%d ", yvect[i]);
        for (j=0; j<sub_docs[i]->num_entries; j++) {
          fprintf (f,"%d:%f ",1+sub_docs[i]->entry[j].wi,
                   sub_docs[i]->entry[j].weight);
        }
        fprintf(f,"\n");
      }
      fclose(f);
      /* set up the test output file */
      sprintf(tmp,"test_%s",svml_basename);
      svml_test_file = fopen (tmp, "w");
      free(tmp);
      nsv = 0;
      W[nloops] = bow_wv_new(0);
    } else {
      /* only useful with test-in-train - ONLY build models after a certain point
         (like when the previously acquired data runs out) */
      if ((!test_in_train) || ((test_in_train) && (nloops >= model_starting_no))) {
        nsv = tlf_svm(sub_docs,yvect,weights,&b,&(W[nloops]),ntrans,mdocs);
      }
    }

    if (vote_type == PAIRWISE && weight_type) {
      /* per-model docs were loaded fresh this pass - release them */
      for (i=0; i<mdocs; i++) {
        bow_wv_free(sub_docs[i]);
      }
    }

    if (max_nsv < nsv) {
      max_nsv = nsv;
    }

    /* now we need to drop the significant classes into the barrel */
    if (!test_in_train) {
      n_meta_docs += add_sv_barrel(class_barrel, weights, yvect, utdocs, b,
                                   nloops, nsv);
    }

    /* advance to the next sub-model */
    if (vote_type == PAIRWISE) {
      cto++;
    } else {
      npass ++;
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -