
📄 active.c

📁 English text classification under Linux
💻 C
📖 Page 1 of 5
	  assert (num_classes < 100);
	  /* set the class_probs by picking numbers from the pr
	     charts */
	  for (hi = 0; hi < num_classes; hi++)
	    {
	      double score = scores[scorei].scores[member][hi].weight;
	      int class = scores[scorei].scores[member][hi].di;
	      int pr_index_low;
	      int pr_index_high;
	      int pr_index = 0;
	      int correct_count = 0;
	      int num_docs_in_window = 0;
	      int pri;

	      while ((pr_index < total_unknown) && 
		     (pr_by_class[class][pr_index].score > score))
		pr_index++;

	      pr_index_low = pr_index;

	      while ((pr_index < total_unknown) &&
		     pr_by_class[class][pr_index].score == score)
		pr_index++;

	      pr_index_high = pr_index;

#if 0
	      if (10 > pr_index)
		correct_count += 10 - pr_index;
#endif

	      /* note that we're including the test document here 
		 in the stats... */
	      for (pri = MAX (0, MIN (pr_index_low, 
				      ((pr_index_low + pr_index_high - 
					active_pr_window_size) / 2))); 
		   pri < MIN (MAX (pr_index_high,
				   ((pr_index_high + pr_index_low + 
				     active_pr_window_size) / 2)),
			      total_unknown);
		   pri++)
		{
		  correct_count += pr_by_class[class][pri].correct;
		  num_docs_in_window++;
		}

	      prob_by_ci[class] = (double) correct_count / 
		((double) num_docs_in_window);
	    }

	  /* normalize the probs to sum to one */
	  for (ci = 0; ci < num_classes; ci++)
	    total += prob_by_ci[ci];
	  for (hi = 0; hi < num_classes; hi++)
	    scores[scorei].scores[member][hi].weight =
	      prob_by_ci[scores[scorei].scores[member][hi].di] / total;
	}
    }
}

/* Return the entropy of the words in the document WV. */
float
active_document_entropy (bow_wv *wv)
{
  float ret = 0;
  float wv_word_count = 0;
  int wvi;
  float pr_w;

  for (wvi = 0; wvi < wv->num_entries; wvi++)
    wv_word_count += wv->entry[wvi].count;

  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      pr_w = wv->entry[wvi].count / wv_word_count;
      ret -= pr_w * log (pr_w);
    }
  return ret;
}

/* select method routines */

/* comparison function for sorting on selection criteria */
int
active_scores_compare (const void *x, const void *y)
{
  if (((active_scores *) x)->weight > ((active_scores *) y)->weight)
    return -1;
  else if (((active_scores *) x)->weight == ((active_scores *) y)->weight)
    return 0;
  else
    return 1;
}
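/* Illustrative sketch (not part of active.c; assumes nothing beyond
   libc): the same Shannon entropy H(d) = -sum_w p(w) log p(w) that
   active_document_entropy above computes from a bow_wv, shown on a
   plain array of word counts.  A document that repeats one word has
   entropy 0; uniform counts give the maximum, log(num_words). */
#include <math.h>
#include <stdio.h>

static float
entropy_of_counts (const int *counts, int n)
{
  float total = 0, h = 0;
  int i;

  for (i = 0; i < n; i++)
    total += counts[i];
  for (i = 0; i < n; i++)
    if (counts[i] > 0)		/* skip p = 0 terms; lim p log p = 0 */
      {
	float p = counts[i] / total;
	h -= p * log (p);
      }
  return h;
}

int
main (void)
{
  int uniform[4] = { 1, 1, 1, 1 };	/* H = log 4, about 1.386 */
  int skewed[4] = { 9, 1, 0, 0 };	/* H about 0.325, far lower */
  printf ("%f %f\n",
	  entropy_of_counts (uniform, 4), entropy_of_counts (skewed, 4));
  return 0;
}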
/* select docs with the highest kl-divergence to the mean */
void
active_select_qbc (bow_barrel *doc_barrel, active_scores *scores,
		   int num_to_add, int total_unknown,
		   int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double *mean_class_dist;
  double mean_class_sum;
  int committee;
  int class;
  int k;

  mean_class_dist = alloca (sizeof (double) * num_classes);

  /* Calculate the entropy of the class labels, H(Class|d,Committee),
     where Class and Committee are random variables, and put this in
     SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      scores[k].weight = 0;

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[scores[k].scores[committee][class].di]
	    += scores[k].scores[committee][class].weight;
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to KL-divergence-to-the-mean averaged over all
	 committee members. */
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (class = 0; class < bow_barrel_num_classes (doc_barrel); class++)
	    {
	      if (1e-100 < scores[k].scores[committee][class].weight)
		{
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * scores[k].scores[committee][class].weight
		     * log (mean_class_dist[scores[k].scores[committee][class].di]
			    / scores[k].scores[committee][class].weight));
		  if (scores[k].weight < -0.1)
		    bow_error ("scores[k].weight < -0.1: %.20f, %.20f",
			       scores[k].weight,
			       log (mean_class_dist[scores[k].scores[committee][class].di]
				    / scores[k].scores[committee][class].weight));
		}
	    }
	}

      /* KL divergence must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error ("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* reverse all weights if we want the lowest ones */
  if (active_qbc_low_kl)
    {
      for (k = 0; k < total_unknown; k++)
	{
	  scores[k].weight = -1 * scores[k].weight;
	}
    }

  /* Sort based on weight */
  qsort (scores, total_unknown, sizeof (active_scores),
	 active_scores_compare);

  /* Change doc types of those with the highest scores */
  for (k = 0; k < num_to_add; k++)
    {
      bow_cdoc *doc;

      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);
      assert (doc);
      assert (doc->type == bow_doc_unlabeled);
      bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename,
		     scores[k].weight);
      for (committee = 0; committee < committee_size; committee++)
	bow_verbosify (bow_progress, " [(%d, %f) (%d, %f)]",
		       scores[k].scores[committee][0].di,
		       scores[k].scores[committee][0].weight,
		       scores[k].scores[committee][1].di,
		       scores[k].scores[committee][1].weight);
      bow_verbosify (bow_progress, "\n");
      doc->type = bow_doc_train;
    }

  return;
}
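/* Illustrative sketch (not part of active.c): the disagreement score
   active_select_qbc above assigns to one document, computed for a toy
   committee of two members over three classes.  The score is the
   average KL divergence from each member's class posterior to the
   committee mean, (1/M) sum_m KL(p_m || p_mean); it is 0 when all
   members agree and grows with disagreement. */
#include <math.h>
#include <stdio.h>

#define M 2			/* committee size (toy value) */
#define C 3			/* number of classes (toy value) */

int
main (void)
{
  double p[M][C] = { { 0.8, 0.1, 0.1 },
		     { 0.2, 0.7, 0.1 } };
  double mean[C];
  double kl = 0;
  int m, c;

  for (c = 0; c < C; c++)
    {
      mean[c] = 0;
      for (m = 0; m < M; m++)
	mean[c] += p[m][c] / M;
    }
  /* Same update as the qbc loop above: weight -= (1/M) p log(mean/p) */
  for (m = 0; m < M; m++)
    for (c = 0; c < C; c++)
      if (p[m][c] > 1e-100)
	kl -= (1.0 / M) * p[m][c] * log (mean[c] / p[m][c]);
  printf ("disagreement = %f\n", kl);	/* ~0.22 here; 0 if rows equal */
  return 0;
}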
/* select docs with the highest weighted kl-divergence to the mean */
void
active_select_weighted_kl (bow_barrel *doc_barrel, active_scores *scores,
			   int num_to_add, int total_unknown,
			   int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double mean_class_dist[num_classes];
  double mean_class_sum;
  double **nb_scores;
  int committee;
  int class;
  int k;
  bow_cdoc *cdoc;
  double nb_scores_sum;
  double nb_scores_max;
  int si;			/* an index into the sorted list of scores */

  assert (num_to_add < total_unknown);
  assert (em_cross_entropy == 1);

  /* Allocate space to store Naive Bayes scores. */
  nb_scores = alloca (sizeof (double *) * committee_size);
  for (committee = 0; committee < committee_size; committee++)
    nb_scores[committee] = alloca (sizeof (double) * num_classes);

  /* Calculate the weighted KL divergence of the class labels
     and put this in SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      /* Fill in the Naive Bayes scores array for this K'th document. */
      cdoc = bow_array_entry_at_index (doc_barrel->cdocs, scores[k].di);
      for (committee = 0; committee < committee_size; committee++)
	{
	  /* Undo the document length normalization */
	  for (si = 0; si < num_classes; si++)
	    nb_scores[committee][scores[k].scores[committee][si].di] =
	      (scores[k].scores[committee][si].weight
	       * (cdoc->word_count + 1));
	  /* Rescale the scores */
	  nb_scores_max = -DBL_MAX;
	  for (class = 0; class < num_classes; class++)
	    if (nb_scores_max < nb_scores[committee][class])
	      nb_scores_max = nb_scores[committee][class];
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] -= nb_scores_max;
	  /* Take the exponent of the scores to make them probabilities. */
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] = exp (nb_scores[committee][class]);
	  /* Normalize them so they sum to one. */
	  nb_scores_sum = 0;
	  for (class = 0; class < num_classes; class++)
	    nb_scores_sum += nb_scores[committee][class];
	  assert (nb_scores_sum > 0);
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] /= nb_scores_sum;
	}

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[class] += nb_scores[committee][class];
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to KL-divergence-to-the-mean averaged over all
	 committee members. */
      scores[k].weight = 0;
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (si = 0; si < bow_barrel_num_classes (doc_barrel); si++)
	    {
	      class = scores[k].scores[committee][si].di;
	      if (1e-100 < nb_scores[committee][class])
		{
/* xxx Change this back to regular old WKL! */
#define UNSUPERVISED_DENSITY 1
#if UNSUPERVISED_DENSITY
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     /* scale by kl-div of this document to this class */
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#elif 1
		  /* Used for ICML submission */
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     /* scale by kl-div of this document to this class */
		     * exp (scores[k].scores[committee][si].weight
			    + cdoc->normalizer)
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#else
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * (cdoc->word_count + 1)
		     /* scale by perplexity of this document in this class */
		     * exp (scores[k].scores[committee][si].weight)
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#endif
		}
	    }
	}
#if UNSUPERVISED_DENSITY
      /* Scale the score by the document density. */
      scores[k].weight *= cdoc->prior;
#endif

      /* KL divergence must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error ("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* Sort based on weight */
  qsort (scores, total_unknown, sizeof (active_scores),
	 active_scores_compare);

  /* Change doc types of those with the highest scores */
  for (k = 0; k < num_to_add; k++)
    {
      bow_cdoc *doc;

      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);
      assert (doc);
      assert (doc->type == bow_doc_unlabeled);
      bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename,
		     scores[k].weight);
      for (committee = 0; committee < committee_size; committee++)
	bow_verbosify (bow_progress, " [(%d, %f) (%d, %f)]",
		       scores[k].scores[committee][0].di,
		       scores[k].scores[committee][0].weight,
		       scores[k].scores[committee][1].di,
		       scores[k].scores[committee][1].weight);
      bow_verbosify (bow_progress, "\n");
      doc->type = bow_doc_train;
    }

  return;
}
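/* Illustrative sketch (not part of active.c): the rescaling step that
   active_select_weighted_kl above (and active_select_dkl below)
   applies to each committee member's per-class log scores.
   exp(-900) underflows to 0 in double precision, so the code
   subtracts the maximum log score before exponentiating; the ratios
   between classes, and hence the normalized posterior, are
   unchanged. */
#include <float.h>
#include <math.h>
#include <stdio.h>

#define C 3			/* number of classes (toy value) */

int
main (void)
{
  double log_score[C] = { -902.0, -900.0, -905.0 };	/* toy values */
  double p[C];
  double max = -DBL_MAX, sum = 0;
  int c;

  for (c = 0; c < C; c++)
    if (log_score[c] > max)
      max = log_score[c];
  for (c = 0; c < C; c++)
    {
      p[c] = exp (log_score[c] - max);	/* exponent <= 0, no overflow */
      sum += p[c];
    }
  for (c = 0; c < C; c++)
    p[c] /= sum;		/* now a proper distribution */
  printf ("%f %f %f\n", p[0], p[1], p[2]);	/* ~0.118 0.876 0.006 */
  return 0;
}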
/* select docs with the highest weighted kl-divergence to the mean.
   Needs cross-entropy scores! */
void
active_select_dkl (bow_barrel *doc_barrel, active_scores *scores,
		   int num_to_add, int total_unknown,
		   int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double mean_class_dist[num_classes];
  double mean_class_sum;
  double **nb_scores;
  int committee;
  int class;
  int k;
  bow_cdoc *cdoc;
  double nb_scores_sum;
  double nb_scores_max;
  int si;			/* an index into the sorted list of scores */

  assert (num_to_add < total_unknown);
  assert (em_cross_entropy == 1);

  /* Allocate space to store Naive Bayes scores. */
  nb_scores = alloca (sizeof (double *) * committee_size);
  for (committee = 0; committee < committee_size; committee++)
    nb_scores[committee] = alloca (sizeof (double) * num_classes);

  /* Calculate the weighted KL divergence of the class labels
     and put this in SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      /* Fill in the Naive Bayes scores array for this K'th document. */
      cdoc = bow_array_entry_at_index (doc_barrel->cdocs, scores[k].di);
      for (committee = 0; committee < committee_size; committee++)
	{
	  /* Undo the document length normalization */
	  for (si = 0; si < num_classes; si++)
	    nb_scores[committee][scores[k].scores[committee][si].di] =
	      (scores[k].scores[committee][si].weight
	       * (cdoc->word_count + 1));
	  /* Rescale the scores */
	  nb_scores_max = -DBL_MAX;
	  for (class = 0; class < num_classes; class++)
	    if (nb_scores_max < nb_scores[committee][class])
	      nb_scores_max = nb_scores[committee][class];
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] -= nb_scores_max;
	  /* Take the exponent of the scores to make them probabilities. */
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] = exp (nb_scores[committee][class]);
