📄 info_gain.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
      /* Calculate and store the information gain. */      ret[wi] = (total_entropy 		 - ((((double)with_word_total / (double)grand_total) 		     * with_word_entropy)		    + (((double)without_word_total / (double)grand_total) 		       * without_word_entropy)));      /* Not comparing with 0 here because of round-off error. */      assert (ret[wi] >= -1e-7);      if (ret[wi] < 0)	ret[wi] = 0;      if (wi % 100 == 0)	bow_verbosify (bow_progress,		       "\b\b\b\b\b\b\b\b\b%9d", max_wi - wi);    }  bow_verbosify (bow_progress, "\n");  return ret;}float *bow_infogain_per_wi_new (bow_barrel *barrel, int num_classes, int *size){  if (bow_infogain_event_model == bow_event_word)    return bow_infogain_per_wi_new_word_event (barrel, num_classes, size);  else if (bow_infogain_event_model == bow_event_document)    return bow_infogain_per_wi_new_document_event (barrel, num_classes, size);  else if (bow_infogain_event_model == bow_event_document_then_word)    bow_error ("document_then_word for infogain not implemented");  else    bow_error ("bad bow_infogain_event_model");  return NULL;}/* Return a malloc()'ed array containing an infomation-gain score for   each word index, but the infogain scores are computing from   co-occurance of word pairs. */float *bow_infogain_per_wi_new_using_pairs (bow_barrel *barrel, int num_classes,				     int *size){  /* `count' == Counts of documents.     `pair'== Pair of words. */  float count[num_classes];  float count_with_pair[num_classes];  float count_without_pair[num_classes];  bow_cdoc *doc1, *doc2;  double entropy_unconditional;  double entropy_with_pair;  double entropy_without_pair;  int max_wi = MIN (barrel->wi2dvf->size, bow_num_words());  float count_total = 0;   float count_with_pair_total = 0;  float count_without_pair_total = 0;  int i, j, wi1, wi2, dvi1, dvi2;  bow_dv *dv1, *dv2;#if 0  struct _igpair {    float ig;    int wi1;    int wi2;  } igpair[max_wi*max_wi];#else  float ig;#endif  bow_verbosify (bow_progress, 		 "Calculating info gain... words ::          ");  *size = max_wi;  /* First set all the arrays to zero */  for(i = 0; i < num_classes; i++)     {      count[i] = 0;      count_with_pair[i] = 0;      count_without_pair[i] = 0;    }  /* Now set up the unconditional counts totals. */  for (i = 0; i < barrel->cdocs->length ; i++)    {      doc1 = bow_cdocs_di2doc (barrel->cdocs, i);      if (doc1->type == bow_doc_train) 	{	  count[doc1->class] += doc1->prior;	  count_total += doc1->prior;	}    }  /* Calculate the unconditional entropy */  entropy_unconditional = bow_entropy (count, num_classes);  /* Now loop over all pairs of words. */  for (wi1 = 0; wi1 < max_wi; wi1++)     {      for (wi2 = wi1+1; wi2 < max_wi; wi2++)	{	  /* Get the document vectors */	  dv1 = bow_wi2dvf_dv (barrel->wi2dvf, wi1);	  dv2 = bow_wi2dvf_dv (barrel->wi2dvf, wi2);	  if (dv1 == NULL || dv2 == NULL)	    {	      /* igpair[wi1][wi2] = 0; */	      continue;	    }	  count_with_pair_total = 0;	/* Create totals for this pair of dv's.	   ...i.e. find documents in which both WI1 and WI2 occur. */	  for (dvi1 = 0, dvi2 = 0; dvi1 < dv1->length; dvi1++)	    {	      /* Find the entry in DV2 for the same document, if it exists. */	      while (dv1->entry[dvi1].di > dv2->entry[dvi2].di		     && dvi2 < dv2->length)		dvi2++;	      if (dv1->entry[dvi1].di != dv2->entry[dvi2].di)		continue;	      doc1 = bow_cdocs_di2doc (barrel->cdocs, dv1->entry[dvi1].di);	      doc2 = bow_cdocs_di2doc (barrel->cdocs, dv2->entry[dvi2].di);	      /* We found a document with both WI1 and WI2 */	      if (doc1->type == bow_doc_train && doc2->type == bow_doc_train)		{		  count_with_pair[doc1->class] += doc1->prior;		  count_with_pair_total += doc1->prior;		}	    }	  /* Set the without-pair totals. */	  for (j = 0; j < num_classes; j++)	    {	      count_without_pair[j] = count[j] - count_with_pair[j];	    }	  count_without_pair_total = count_total - count_with_pair_total;	/* Calculate entropies */	  entropy_with_pair = bow_entropy (count_with_pair, num_classes);	  entropy_without_pair = bow_entropy (count_without_pair, num_classes);	/* Calculate and store the information gain. */	  ig =	    (entropy_unconditional 	     - ((((double)count_with_pair_total / count_total)		 * entropy_with_pair)		+ (((double)count_without_pair_total / count_total) 		   * entropy_without_pair)));	  /* Not comparing with 0 here because of round-off error. */	  assert (ig >= -1e-7);	  if (ig < 0)	    ig = 0;	  if (ig > 0.01)	    printf ("%12.9f  %20s %20s\n", 		    ig, bow_int2word (wi1), bow_int2word (wi2));	  /* Reset arrays to zero */	  for(i = 0; i < num_classes; i++) 	    {	      count_with_pair[i] = 0;	      count_without_pair[i] = 0;	    }	}      if (wi1 % 100 == 0)	bow_verbosify (bow_progress,		       "\b\b\b\b\b\b\b\b\b%9d", max_wi - wi1);    }  bow_verbosify (bow_progress, "\n");#if 0  /* Now loop over all pairs of words, printing the result. */  for (wi1 = 0; wi1 < max_wi; wi1++)     for (wi2 = 0; wi2 < max_wi; wi2++)      {	printf ("%8.5f %20s %20s\n",		igpair[wi1][wi2],		bow_int2word (wi1),		bow_int2word (wi2));      }#endif  return NULL;}/* Return a word array containing information gain scores, unsorted.   Only includes words with non-zero infogain. */bow_wa *bow_infogain_wa (bow_barrel *barrel, int num_classes){  float *wi2ig;			/* the array of information gains */  int wi2ig_size;  int wi;  bow_wa *wa = bow_wa_new (barrel->wi2dvf->num_words);  wi2ig = bow_infogain_per_wi_new (barrel, num_classes, &wi2ig_size);  /* Create and fill and array of `word-index and information-gain     structures' that can be sorted. */  for (wi = 0; wi < wi2ig_size; wi++)    if (wi2ig[wi] > 0)      bow_wa_append (wa, wi, wi2ig[wi]);  return wa;}/* Return a word array containing the count for each word, with +/-   0.1 noise added. */bow_wa *bow_word_count_wa (bow_barrel *doc_barrel){  bow_wa *wa;  int wi, dvi;  bow_dv *dv;  bow_cdoc *cdoc;  wa = bow_wa_new (0);  for (wi = 0; wi < doc_barrel->wi2dvf->size; wi++)    {      dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);      if (!dv)	continue;      for (dvi = 0; dvi < dv->length; dvi++)	{	  cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 					   dv->entry[dvi].di);	  if (cdoc->type == bow_doc_train)	    bow_wa_add_to_end (wa, wi, 			       dv->entry[dvi].count + bow_random_01 () * 0.01);	}    }  return wa;}/* Print to stdout the sorted results of bow_infogain_per_wi_new().   It will print the NUM_TO_PRINT words with the highest infogain. */voidbow_infogain_per_wi_print (FILE *fp, bow_barrel *barrel, int num_classes, 			   int num_to_print){  float *wi2ig;			/* the array of information gains */  int wi2ig_size;  int wi, i;  struct wiig { int wi; float ig; } *wiigs;  int wiig_compare (const void *wiig1, const void *wiig2)    {      if (((struct wiig*)wiig1)->ig > ((struct wiig*)wiig2)->ig)	return -1;      else if (((struct wiig*)wiig1)->ig == ((struct wiig*)wiig2)->ig)	return 0;      else	return 1;    }  wi2ig = bow_infogain_per_wi_new (barrel, num_classes, &wi2ig_size);  if (num_to_print == 0)    num_to_print = wi2ig_size;  /* Create and fill and array of `word-index and information-gain     structures' that can be sorted. */  wiigs = bow_malloc (wi2ig_size * sizeof (struct wiig));  for (wi = 0; wi < wi2ig_size; wi++)    {      wiigs[wi].wi = wi;      wiigs[wi].ig = wi2ig[wi];    }  /* Sort it. */  qsort (wiigs, wi2ig_size, sizeof (struct wiig), wiig_compare);  /* Print it. */  for (i = 0; i < num_to_print; i++)    {      fprintf (fp, "%8.5f %s\n", wiigs[i].ig, bow_int2word (wiigs[i].wi));    }  bow_free (wi2ig);}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -