svm_base.c
/* (the listing begins mid-function: this is the tail of the model-training
 * routine that builds and returns the class barrel) */

    nloops++;
  }

  if (test_in_train) {
    exit(0);
  }

  if (svm_kernel_type == 0) {
    bow_cdoc cdoc;
    cdoc.filename = NULL;
    cdoc.class_probs = NULL;
    cdoc.type = bow_doc_ignore;
    cdoc.class = 1;
    for (i=0; i<nloops; i++) {
      cdoc.word_count = W[i]->num_entries;
      bow_barrel_add_document(class_barrel, &cdoc, W[i]);
      bow_wv_free(W[i]);
    }
    free(W);
  }

  /* if it were per model, the cache would need to be alloc-ed &
   * de-alloc-ed locally */
  if (svm_weight_style != WEIGHTS_PER_MODEL) {
    kcache_clear();
  }

  /* place the model weights into the barrel */
  if (svm_weight_style == WEIGHTS_PER_MODEL) {
    bow_cdoc cdoc;
    cdoc.filename = NULL;
    cdoc.class_probs = NULL;
    cdoc.type = bow_doc_ignore;
    cdoc.class = 1;
    /* this is fine since all of the docs are class 0 & we
     * know how many meta docs there are */
    for (i=0; i<nloops; i++) {
      cdoc.word_count = model_weights[i]->num_entries;
      bow_barrel_add_document(class_barrel, &cdoc, model_weights[i]);
      bow_wv_free(model_weights[i]);
    }
    free(model_weights);
  }

  /* the docs were freed earlier just to save memory - now we need them
   * again, & since the optimizer is done, a lot of memory has been
   * released */
  if (svm_weight_style == WEIGHTS_PER_MODEL && vote_type == PAIRWISE) {
    make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    /* append these transduction docs to the arrays that were filled
     * in above */
    make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                   use_transduction_docs);
  }

  /* now add all of the documents from the doc barrel to the class barrel */
  for (i=0; i<ndocs; i++) {
    /* to add the i'th document to the class_barrel,
     * we first need to make a new cdoc */
    bow_cdoc cdoc;
    memcpy(&cdoc, GET_CDOC_ARRAY_EL(src_barrel, tdocs[i]), sizeof(bow_cdoc));
    cdoc.filename = strdup(cdoc.filename);
    cdoc.class = 0;
    bow_barrel_add_document(class_barrel, &cdoc, docs[i]);
  }

  /* this has to be done after all possible dv's have been created */
  if (!((vote_type == PAIRWISE && weight_type) || weight_type == INFOGAIN)
      && weight_type) {
    /* if no weights are used at all, this isn't necessary */
    bow_dv *dv;
    j = bow_num_words();
    for (i=0; i<j; i++) {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, i);
      if (dv) {
        /* the idf slot is borrowed to persist the per-word weight;
         * setup_docs() reads it back out below */
        dv->idf = weight_vect[i];
      }
    }
    free(weight_vect);
  }

  /* the sign of max_nsv encodes the voting scheme: positive for
   * pairwise, negative for one-against-all (decoded in svm_score below) */
  if (vote_type == PAIRWISE) {
    BARREL_GET_MAX_NSV(class_barrel) = max_nsv;
  } else {
    BARREL_GET_MAX_NSV(class_barrel) = -1*max_nsv;
  }

  BARREL_GET_NCLASSES(class_barrel) = nclasses;
  BARREL_GET_NMETA_DOCS(class_barrel) = n_meta_docs;

  class_barrel->classnames = bow_int4str_new(0);
  for (i=0; i<nclasses; i++) {
    /* drop a class label in */
    bow_str2int(class_barrel->classnames,
                bow_int2str(src_barrel->classnames, i));
  }

  for (i=0; i<ndocs; i++) {
    bow_wv_free(docs[i]);
  }

  return class_barrel;
}
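/* Editor's note: a minimal sketch (not part of the original file) of the
 * idf-slot round trip used above: per-word weights are stashed in each
 * bow_dv's idf field at training time, and setup_docs() below reads them
 * back when svm_weight_style == WEIGHTS_PER_BARREL. The helper name and
 * the SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_idf_round_trip (bow_barrel *barrel, float *weights_out)
{
  int wi;
  int nwords = bow_num_words();
  for (wi=0; wi<nwords; wi++) {
    bow_dv *dv = bow_wi2dvf_dv(barrel->wi2dvf, wi);
    /* a missing dv means the word never occurs; its weight defaults to 0 */
    weights_out[wi] = dv ? dv->idf : 0.0;
  }
}
#endif /* SVM_BASE_EXAMPLES */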
inline double evaluate_model (bow_wv **docs, double *weights, int *yvect,
                              double b, bow_wv *query_wv, int nsv)
{
  double sum, tmp;
  int i, j;
  /* walk the weight vector, but only count the nsv non-zero entries
   * (the support vectors) */
  for (i=j=0, sum=0.0; j<nsv; i++) {
    if (weights[i] != 0.0) {
      tmp = kernel(docs[i], query_wv);
      sum += yvect[i]*weights[i]*tmp;
      j++;
    }
  }
  return (sum - b);
}

/* similar to above, but only for when the kernel cache should be used */
inline double evaluate_model_cache (bow_wv **docs, double *weights,
                                    int *yvect, double b, bow_wv *query_wv,
                                    int nsv)
{
  double sum, tmp;
  int i, j;
  for (i=j=0, sum=0.0; j<nsv; i++) {
    if (weights[i] != 0.0) {
      tmp = svm_kernel_cache(docs[i], query_wv);
      sum += yvect[i]*weights[i]*tmp;
      j++;
    }
  }
  return (sum - b);
}

inline double evaluate_model_hyperplane (double *W, double b,
                                         bow_wv *query_wv)
{
  return (dprod_sd(query_wv, W) - b);
}
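/* Editor's note: an illustrative, stand-alone sketch (not in the original
 * file) of what evaluate_model() computes: the SVM decision function
 *   f(q) = sum_i y_i * alpha_i * K(x_i, q) - b.
 * Here the kernel is a plain dot product over dense vectors; the real code
 * dispatches through kernel()/svm_kernel_cache() on sparse bow_wv's.
 * All names and the SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static double example_decision_function (double **docs, double *alpha,
                                         int *y, double b, double *query,
                                         int ndocs, int nwords)
{
  double sum = 0.0;
  int i, j;
  for (i=0; i<ndocs; i++) {
    double k = 0.0;               /* linear kernel: K(x_i, q) = x_i . q */
    for (j=0; j<nwords; j++)
      k += docs[i][j] * query[j];
    sum += y[i] * alpha[i] * k;   /* only support vectors (alpha_i != 0)
                                   * actually contribute */
  }
  return sum - b;
}
#endif /* SVM_BASE_EXAMPLES */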
/* this & setup_docs are for "caching" the barrel into its wv form */
static void clear_model_cache ()
{
  int i;
  if (model_cache.barrel) {
    for (i=0; i<model_cache.ndocs; i++) {
      bow_wv_free(model_cache.docs[i]);
    }
    for (i=0; i<model_cache.nmodels; i++) {
      free(model_cache.indices[i]);
      free(model_cache.weights[i]);
      free(model_cache.yvect[i]);
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        free(model_cache.word_weights.sub_model[i]);
      }
      if (svm_kernel_type == 0) {
        free(model_cache.W[i]);
      }
    }
    free(model_cache.docs);
    free(model_cache.indices);
    free(model_cache.weights);
    free(model_cache.yvect);
    free(model_cache.bvect);
    free(model_cache.sizes);
    if (svm_weight_style == WEIGHTS_PER_MODEL) {
      free(model_cache.word_weights.sub_model);
    } else if (svm_weight_style == WEIGHTS_PER_BARREL) {
      free(model_cache.word_weights.barrel);
    }
    if (svm_kernel_type == 0) {
      free(model_cache.W);
    }
  }
  model_cache.barrel = NULL;
}

/* this fn fills *sub_docs with the m-th submodel (it pulls the docs
 * from the cache that setup_docs fills & then sets whatever weights
 * are necessary) */
/* the query vector should already be normalized */
void make_sub_model (int m, int weight_style, bow_wv ***sub_docs)
{
  bow_wv **docs;
  int *indices;
  float *weights;
  bow_we *v2;
  int i, j;

  docs = *sub_docs;
  for (j=0; j<model_cache.sizes[m]; j++) {
    docs[j] = model_cache.docs[model_cache.indices[m][j]];
  }

  if (weight_style) {
    indices = model_cache.indices[m];
    weights = model_cache.word_weights.sub_model[m];
    for (i=0; i<model_cache.sizes[m]; i++) {
      int n = docs[i]->num_entries;
      int di = indices[i];
      v2 = docs[i]->entry;
      for (j=0; j<n; j++) {
        /* recompute each entry from the per-model word weight and the
         * saved original (post-tf-transform) weight */
        v2[j].weight = weights[v2[j].wi] * model_cache.oweights[di][j];
      }
    }
  }
}

static void setup_docs (bow_barrel *barrel, int nclasses, int nmodels)
{
  bow_cdoc *cdoc;
  int classnum, c_old;
  bow_wv *dtmp;
  bow_dv_heap *heap;
  int ndocs;
  int nmeta_docs;
  int nwords;
  int total_words;
  int h, i, j, k, l;

  nmeta_docs = BARREL_GET_NMETA_DOCS(barrel);
  ndocs = barrel->cdocs->length - nmeta_docs;
  total_words = bow_num_words();

  clear_model_cache();

  model_cache.docs = (bow_wv **) malloc(sizeof(bow_wv *)*ndocs);
  model_cache.indices = (int **) malloc(sizeof(int *)*nmodels);
  model_cache.weights = (double **) malloc(sizeof(double *)*nmodels);
  model_cache.yvect = (int **) malloc(sizeof(int *)*nmodels);
  model_cache.bvect = (double *) malloc(sizeof(double)*nmodels);
  model_cache.sizes = (int *) malloc(sizeof(int)*nmodels);

  if (weight_type) {
    if (vote_type == PAIRWISE || weight_type == INFOGAIN) {
      svm_weight_style = WEIGHTS_PER_MODEL;
      model_cache.word_weights.sub_model =
        (float **) malloc(sizeof(float *)*nmodels);
      if (tf_transform_type)
        model_cache.oweights = (float **) malloc(sizeof(float *)*ndocs);
    } else {
      svm_weight_style = WEIGHTS_PER_BARREL;
      /* note: the original allocated sizeof(float *) per word here,
       * but the element type is float */
      model_cache.word_weights.barrel =
        (float *) malloc(sizeof(float)*total_words);
    }
  } else {
    svm_weight_style = NO_WEIGHTS;
  }

  if (svm_kernel_type == 0) {
    model_cache.W = (double **) malloc(sizeof(double *)*nmodels);
  } else {
    model_cache.W = NULL;
  }

  /* Create the heap of vectors of all documents */
  heap = bow_make_dv_heap_from_wi2dvf(barrel->wi2dvf);

  /* throw away the first 2 - they hold only ancillary info
   * (see the macros at the top of the file) */
  bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
  bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);

  /* grab the meta documents first & set up the arrays */
  for (h=0, l=2; h<nmodels; h++) {
    classnum = c_old = -1;
    for (nwords=j=0, k=-1; l<nmeta_docs; l++) {
      /* only go through for 2 different classes */
      cdoc = bow_cdocs_di2doc (barrel->cdocs, l);
      /* if this isn't what the last one was, this model is done */
      if ((cdoc->class != classnum) && (c_old != cdoc->class) && (k==1)) {
        break;
      }
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      if ((cdoc->class != classnum) && (c_old != cdoc->class)) {
        if (k==-1) {
          /* do the stuff that needs doing once for each model */
          model_cache.bvect[h] = cdoc->normalizer;
          nwords = dtmp->num_entries;
          model_cache.indices[h] = (int *) malloc(sizeof(int)*nwords);
          model_cache.weights[h] = (double *) malloc(sizeof(double)*nwords);
          model_cache.yvect[h] = (int *) malloc(sizeof(int)*nwords);
        } else {
          /* in an already initialized model, but we need to grow arrays */
          nwords += dtmp->num_entries;
          model_cache.indices[h] =
            (int *) realloc(model_cache.indices[h], sizeof(int)*nwords);
          model_cache.weights[h] =
            (double *) realloc(model_cache.weights[h], sizeof(double)*nwords);
          model_cache.yvect[h] =
            (int *) realloc(model_cache.yvect[h], sizeof(int)*nwords);
        }
        k++;
        c_old = classnum;
        classnum = cdoc->class;
      } else {
        /* already seen this class - need to grow some arrays */
        nwords += dtmp->num_entries;
        model_cache.indices[h] =
          (int *) realloc(model_cache.indices[h], sizeof(int)*nwords);
        model_cache.weights[h] =
          (double *) realloc(model_cache.weights[h], sizeof(double)*nwords);
        model_cache.yvect[h] =
          (int *) realloc(model_cache.yvect[h], sizeof(int)*nwords);
      }
      for (i=0; j<nwords; j++, i++) {
        /* document indices are stored 1-based in the count field */
        model_cache.indices[h][j] = dtmp->entry[i].count - 1;
        model_cache.weights[h][j] = dtmp->entry[i].weight;
        model_cache.yvect[h][j] = ((k == 0) ? 1 : -1);
      }
    }
    model_cache.sizes[h] = nwords;
  }
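/* Editor's note: an illustrative sketch (not in the original file) of the
 * reweighting step in make_sub_model() above: each sparse entry's weight
 * is recomputed from a dense per-word weight table and the original
 * (post-tf-transform) weight saved in oweights. Names and the
 * SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_reweight_doc (bow_wv *doc, const float *word_weights,
                                  const float *original_weights)
{
  int j;
  for (j=0; j<doc->num_entries; j++) {
    /* the dense table is indexed by word id; original_weights runs
     * parallel to the doc's sparse entries */
    doc->entry[j].weight = word_weights[doc->entry[j].wi]
                         * original_weights[j];
  }
}
#endif /* SVM_BASE_EXAMPLES */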
  /* if there are cached hyperplanes, let's grab them... */
  if (svm_kernel_type == 0) {
    for (i=0; i<nmodels; i++) {
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      /* expand the sparse wv into a dense hyperplane vector */
      model_cache.W[i] = (double *) malloc(total_words*sizeof(double));
      for (h=j=0; j<dtmp->num_entries; h++) {
        if (h == dtmp->entry[j].wi) {
          model_cache.W[i][h] = dtmp->entry[j].weight;
          j++;
        } else {
          model_cache.W[i][h] = 0.0;
        }
      }
      for (; h<total_words; h++) {
        model_cache.W[i][h] = 0.0;
      }
    }
#ifdef DEBUG
    /* with 2 classes the two hyperplanes should be negations of each
     * other, so their sum must vanish (to within svm_epsilon_crit);
     * note: tmp was undeclared in the original */
    {
      double tmp;
      for (j=0; j<total_words; j++) {
        tmp = model_cache.W[0][j] + model_cache.W[1][j];
        assert(tmp >= -1*svm_epsilon_crit && tmp <= svm_epsilon_crit);
      }
    }
#endif
  }

  /* any kind of pairwise weights needs its own set of weights, since the
   * domain for each model is different... Info-gain also needs it, since
   * items relevant & useful in one model may be of no use in another
   * (there are always only 2 classes...) */
  if (svm_weight_style == WEIGHTS_PER_MODEL) {
    for (h=0; h<nmodels; h++) {
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      model_cache.word_weights.sub_model[h] =
        (float *) malloc(sizeof(float)*total_words);
      for (i=j=0; i<total_words; i++) {
        if ((j < dtmp->num_entries) && (dtmp->entry[j].wi == i)) {
          model_cache.word_weights.sub_model[h][i] = dtmp->entry[j].weight;
          j++;
        } else {
          model_cache.word_weights.sub_model[h][i] = 0.0;
        }
      }
    }
  } else if (svm_weight_style == WEIGHTS_PER_BARREL) {
    bow_dv *dv;
    /* the per-word weights were stashed in the idf slots at training
     * time (see above) */
    for (h=0; h<total_words; h++) {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, h);
      if (dv) {
        model_cache.word_weights.barrel[h] = dv->idf;
      } else {
        model_cache.word_weights.barrel[h] = 0.0;
      }
    }
  }

  /* the rest of the documents are just the training documents - keep
   * grabbing them until they're gone */
  for (h=0; heap->length; h++) {
    bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
    model_cache.docs[h] = bow_wv_new(dtmp->num_entries);
    for (j=0; j<dtmp->num_entries; j++) {
      model_cache.docs[h]->entry[j].wi = dtmp->entry[j].wi;
      model_cache.docs[h]->entry[j].count = dtmp->entry[j].count;
    }
    /* if (svm_kernel_type == FISHER) {
         for (j=0; j<model_cache.docs[h]->num_entries; j++) {
           model_cache.docs[h]->entry[j].weight =
             (float) model_cache.docs[h]->entry[j].count;
         }
         model_cache.docs[h]->normalizer = 1.0;
         continue;
       } */
    tf_transform(model_cache.docs[h]);

    /* per-model weights change with every model, so we need to keep
     * track of what the weights were initially (after the tf_transform) */
    if (svm_weight_style == WEIGHTS_PER_MODEL && tf_transform_type) {
      model_cache.oweights[h] =
        (float *) malloc(sizeof(float)*dtmp->num_entries);
      for (j=0; j<model_cache.docs[h]->num_entries; j++) {
        model_cache.oweights[h][j] = model_cache.docs[h]->entry[j].weight;
      }
    } else {
      /* otherwise, the weights can be set now... */
      if (svm_weight_style == NO_WEIGHTS) {
        bow_wv_normalize_weights_by_summing(model_cache.docs[h]);
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.docs[h]->normalizer;
        }
      } else {
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.word_weights.barrel[model_cache.docs[h]->entry[j].wi];
        }
        bow_wv_normalize_weights_by_summing(model_cache.docs[h]);
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.docs[h]->normalizer;
        }
      }
    }
    /* the weights in the svm_wv now hold the proper, tf-transformed &
     * normalized values (the oweights are the "original weights") */
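/* Editor's note: a minimal sketch (not in the original file) of the weight
 * normalization applied to each cached document above, assuming
 * bow_wv_normalize_weights_by_summing() sets normalizer = 1/sum(weights),
 * which matches how the loops above apply it: the entries end up summing
 * to 1. The SVM_BASE_EXAMPLES guard is hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_l1_normalize (bow_wv *wv)
{
  double sum = 0.0;
  int j;
  for (j=0; j<wv->num_entries; j++)
    sum += wv->entry[j].weight;
  if (sum > 0.0) {
    wv->normalizer = 1.0 / sum;        /* what normalize-by-summing stores */
    for (j=0; j<wv->num_entries; j++)
      wv->entry[j].weight *= wv->normalizer;   /* the follow-up scaling */
  }
}
#endif /* SVM_BASE_EXAMPLES */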
  }

  model_cache.barrel = barrel;
  model_cache.ndocs = h;
  model_cache.nmodels = nmodels;
}

int svm_score (bow_barrel *barrel, bow_wv *query_wv, bow_score *bscores,
               int bscores_len, int loo_class)
{
  int ci;
  int max_nsv;
  double *model_vals;
  bow_score *myscores;
  float *base_qwv_weights;
  int nclasses;
  int nmodels;
  int ntied;
  int num_scores;
  int set_weights;
  bow_wv **sub_docs;
  int voting_scheme;
  int i, ii, j, k;

  /* This should be initialized in case BSCORES_LEN is larger than the
   * number of classes in the barrel */
  for (ci=0; ci<bscores_len; ci++) {
    bscores[ci].weight = 0.0;
    bscores[ci].di = 0;
    bscores[ci].name = "default";
  }

  base_qwv_weights = NULL;
  max_nsv = BARREL_GET_MAX_NSV(barrel);
  nclasses = BARREL_GET_NCLASSES(barrel);

  /* decode the voting scheme from the sign of max_nsv (set at training
   * time above) */
  if (max_nsv < 0) {
    max_nsv *= -1;
    nmodels = nclasses;
    voting_scheme = AGAINST_ALL;
  } else {
    nmodels = nclasses*(nclasses-1)/2;
    voting_scheme = PAIRWISE;
  }

  if
  /* (the listing is truncated here in the source) */
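/* Editor's note: an illustrative sketch (not in the original file) of the
 * model-count arithmetic svm_score() uses above: pairwise voting trains one
 * classifier per unordered class pair, one-against-all trains one per
 * class. E.g. nclasses = 4 gives 4*3/2 = 6 pairwise models vs. 4.
 * The SVM_BASE_EXAMPLES guard is hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static int example_nmodels (int nclasses, int voting_scheme)
{
  return (voting_scheme == PAIRWISE)
    ? nclasses * (nclasses - 1) / 2    /* one model per {i,j} pair, i < j */
    : nclasses;                        /* one model per class */
}
#endif /* SVM_BASE_EXAMPLES */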