⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 knn.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 2 页
字号:
  /* (continued from above) Pass one of bow_knn_query_set_weights:
     apply the 'b' (binary), 'l' (log) and 'n' (natural/raw count)
     term-frequency options directly, and track the maximum count,
     which the 'm' and 'a' options in pass two will need. */
  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      if (TF_B(query_weights))
	{
	  /* Binary: every query term counts exactly once. */
	  query_wv->entry[wvi].weight = 1;
	}
      else if (TF_L(query_weights))
	{
	  /* Logarithmic term frequency. */
	  query_wv->entry[wvi].weight = 1 + log(query_wv->entry[wvi].count);
	}
      else if (TF_N(query_weights))
	{
	  /* Raw term count. */
	  query_wv->entry[wvi].weight = query_wv->entry[wvi].count;
	}
      else if (max_tf < query_wv->entry[wvi].count)
	{
	  /* 'm' or 'a' selected: just record the maximum count here. */
	  max_tf = query_wv->entry[wvi].count;
	}
    }

  /* Pass two - Get the correct weights for 'a' or 'm'. Do IDF if
     required. */
  for(wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      if (TF_M(query_weights))
	{
	  /* Max-normalised term frequency: count / max_tf. */
	  query_wv->entry[wvi].weight = (double)query_wv->entry[wvi].count / (double) max_tf;
	}
      else if (TF_A(query_weights))
	{
	  /* Augmented term frequency: 0.5 + 0.5 * count / max_tf. */
	  query_wv->entry[wvi].weight = 0.5 + 0.5 * ((double)query_wv->entry[wvi].count / (double) max_tf);
	}

      /* Cheat here - we can leverage off the fact that I've so
	 far only implemented one IDF method. Otherwise I'd have
	 to store raw statistics for the word occurrences for each
	 word. */
      if (IDF_T(query_weights))
	{
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, query_wv->entry[wvi].wi);
	  query_wv->entry[wvi].weight *= dv->idf;
	}
    }
  /* Done - the idf term was calculated earlier on */
}

/* Normalise the query's weights by its vector length when cosine
   normalisation ('c') is selected in the global query_weights. */
void
bow_knn_normalise_query_weights (bow_wv *query)
{
  if (NORM_C(query_weights))
    {
      bow_wv_normalize_weights_by_vector_length(query);
    }
}

/* Little :) function lifted from tfidf.c and edited to do exactly
   what we need here. 
Was originally called bow_tfidf_score */intbow_knn_get_k_best (bow_barrel *barrel, bow_wv *query_wv,  		    bow_score *scores, int best){  bow_dv_heap *heap;   bow_cdoc *doc;   int num_scores = 0;           /* How many elements are in this array */   int current_di, wi, current_index, i;   double current_score = 0.0, doc_tfidf;   float tmp;   /* Create the Heap of vectors of documents */   heap = bow_make_dv_heap_from_wv (barrel->wi2dvf, query_wv);    /* Keep looking at document/word entries until the heap is emptied */   while (heap->length > 0)     {       /* Get the index of the document we're currently working on */       current_di = heap->entry[0].current_di;        /* Get the document structure */       doc = bow_cdocs_di2doc (barrel->cdocs, current_di);       /* If it's not a model document, then move on to next one */       if (doc->type != bow_doc_train)         {           do              {               bow_dv_heap_update (heap);             }           while ((current_di == heap->entry[0].current_di)                  && (heap->length > 0));                    /* Try again */           continue;         }        /* Reset the index into the query word vector */       current_index = 0;        /* Reset the score */       current_score = 0.0;        /* Loop over all the words this document has in common with our	 query document, summing up the score. We know the words come	 out of the heap in index order and we know the words in the	 query word vector are in index order as well. */      do         {           wi = heap->entry[0].wi;           doc_tfidf = heap->entry[0].dv->entry[heap->entry[0].index].weight;	  /* Find the corresponding word in the query word vector */ 	  /* Note - we know this word is in the query because we built	     the heap using only the query words. 
*/          while (wi > (query_wv->entry[current_index].wi))             current_index++;           assert (wi == query_wv->entry[current_index].wi); 	  /* Now we can add something to the score. Normalisation	     happens outside this loop, we just need to check for the	     idf factor stuff. The tf weights are just fine. */	  /* Multiply the tfidf weights */	  /* printf("%f * %f\n", query_wv->entry[current_index].weight,		 doc_tfidf); */	  tmp = query_wv->entry[current_index].weight * doc_tfidf;	  /* Plop this into the current score */	  current_score += tmp;	  /* A test to make sure we haven't got NaN. */           assert (current_score == current_score);            /* Now we need to update the heap - moving this element on	     to its              new position */           bow_dv_heap_update (heap);         }       while ((current_di == heap->entry[0].current_di)              && (heap->length > 0));       /* Now check for normalisation */      if(NORM_C(query_weights))	{	  current_score *= query_wv->normalizer;	}      if(NORM_C(doc_weights))	{	  current_score *= doc->normalizer;	}      assert (current_score == current_score); /* checking for NaN */       /* We now hopefully have the correct score for doc */      /* Store the result in the SCORES array */       /* If we haven't filled the list, or we beat the last item in	 the list */       if ((num_scores < best)           || (scores[num_scores - 1].weight < current_score))         {           /* We're going to search up the list comparing element i-1 with              our current score and moving it down the list if it's worse */           if (num_scores < best)             {               i = num_scores;               num_scores++;             }           else             i = num_scores - 1; 	  /* Shift down all the bits of the array that need shifting */           for (; (i > 0) && (scores[i - 1].weight < current_score); i--) 	    {	      scores[i].di = scores[i-1].di; 	      scores[i].weight = 
scores[i-1].weight; 	      scores[i].name = scores[i-1].name; 	    }           /* Insert our new score */           scores[i].weight = current_score;           scores[i].di = current_di;	  scores[i].name = doc->filename;        }     }   bow_free (heap);    /* All done - return the number of elements we have */   return num_scores; } /* Get class scores using closest K neighbors.  */intbow_knn_score (bow_barrel *barrel, bow_wv *query_wv, 	       bow_score *bscores, int bscores_len,	       int loo_class){  int count;  int ni,ci;  double scores_sum = 0.0;  int num_scores;    bow_score *neighbors;         /* Place to hold scores of nearest neighbors */  double *scores;  /* This should be initialized in case BSCORES_LEN is larger than the number   * of classes in the barrel */  for (ci=0; ci < bscores_len; ci++)    {      bscores[ci].weight = 0.0;      bscores[ci].di = 0;      bscores[ci].name = "default";    }  /* Get scores of neighbors */  neighbors = alloca (sizeof (bow_score) * knn_k);  count = bow_knn_get_k_best (barrel, query_wv, neighbors, knn_k);  /* Allocate space for class scores */  scores = alloca (bow_barrel_num_classes (barrel) * sizeof (double));  for (ci=0; ci < bow_barrel_num_classes (barrel); ci++)    scores[ci] = 0.0;  /* Put contributing document scores into class scores */  for (ni=0; ni < count; ni++)    {      /* Get the class of this document */      bow_cdoc *doc = bow_cdocs_di2doc (barrel->cdocs, neighbors[ni].di);      scores[doc->class] += neighbors[ni].weight;      scores_sum += neighbors[ni].weight;    }  num_scores = 0;  /* Put SCORES into BSCORES in sorted order */  /* Each round, find the best remainaing score and put it into bscores */  for (ci=0; ci < bow_barrel_num_classes (barrel); ci++)    {      if (num_scores < bscores_len	  || bscores[num_scores-1].weight < scores[ci])	{	  int dsi;	  /* We are going to put this score and class index into SCORES	   * because either 1) there is an empty space in SCORES, or 2)	   * SCORES[CI] is 
larger than the smallest score currently there */	  if (num_scores < bscores_len)	    num_scores++;	  dsi = num_scores - 1;	  /* Shift down all the entries that are smaller than SCORES[CI] */	  for (; dsi > 0 && bscores[dsi-1].weight < scores[ci]; dsi--)	    {	      bscores[dsi].weight = bscores[dsi-1].weight;	      bscores[dsi].name = bscores[dsi-1].name;	      bscores[dsi].di = bscores[dsi-1].di;	    }	  bscores[dsi].weight = scores[ci];	  bscores[dsi].di = ci;	  bscores[dsi].name = "default";	}    }  return num_scores;}rainbow_method bow_method_knn = {  "knn",  bow_knn_set_weights,  0,				/* no weight scaling function */  bow_knn_normalise_weights,  bow_knn_classification_barrel,  NULL,                         /* We don't do priors */  bow_knn_score,  bow_knn_query_set_weights,  bow_knn_normalise_query_weights,  bow_barrel_free,  0};void _register_method_knn () __attribute__ ((constructor));void _register_method_knn (){  bow_method_register_with_name ((bow_method*)&bow_method_knn, "knn",				 sizeof (rainbow_method),				 &knn_argp_child);  bow_argp_add_child (&knn_argp_child);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -