📄 active.c
字号:
{
	  /* Guard against log(0): only classes with non-negligible mass
	     contribute to the entropy sum. */
	  if (1e-100 < mean_class_dist[class])
	    {
	      scores[k].weight -= (mean_class_dist[class]
				   * log (mean_class_dist[class]));
	    }
	}
      /* adjust for the correct log factor */
      scores[k].weight *= 1 / log(2);
      /* convert to a probability */
      scores[k].weight /= log (bow_barrel_num_classes (doc_barrel)) / log(2);
      /* multiply in the epsilon factor */
      scores[k].weight *= active_stream_epsilon;
      /* Entropy must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* select some documents randomly according to the weights:
     rejection sampling — pick a random scores[] entry and accept it
     with probability equal to its (epsilon-scaled) weight. */
  for (j=0, k=0; k < num_to_add; j++)
    {
      int scoresi = rand() % total_unknown;
      int doci;
      double coin_flip;

      doci = scores[scoresi].di;
      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);
      assert (doc);
      if (doc->type == bow_doc_unlabeled)
	{
	  coin_flip = bow_random_double (0,1);
	  if (scores[scoresi].weight > coin_flip)
	    {
	      /* Accepted: promote this document into the training set. */
	      doc->type = bow_doc_train;
	      k++;
	      bow_verbosify (bow_progress, "Labeling %s, weight %f, flip %f",
			     doc->filename, scores[scoresi].weight,
			     coin_flip);
	      for (committee=0; committee < committee_size; committee++)
		bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]",
			      scores[scoresi].scores[committee][0].di,
			      scores[scoresi].scores[committee][0].weight,
			      scores[scoresi].scores[committee][1].di,
			      scores[scoresi].scores[committee][1].weight);
	      bow_verbosify(bow_progress, "\n");
	    }
	}
      /* Bail out if sampling has gone on implausibly long without
	 finding enough unlabeled documents to accept. */
      if (j > doc_barrel->cdocs->length * 1000)
	bow_error ("Random number generator could not find enough "
		   "unlabeled documents to convert.");
    }
  return;
}

/* Stream-based selection by KL-divergence-to-the-mean: score each of
   the TOTAL_UNKNOWN documents in SCORES by the average KL divergence
   of each committee member's class distribution from the committee
   mean, scale the score into a selection probability, then randomly
   promote NUM_TO_ADD unlabeled documents to training documents. */
void
active_select_stream_kl (bow_barrel *doc_barrel, active_scores *scores,
			 int num_to_add, int total_unknown,
			 int committee_size)
{
  int j;
  bow_cdoc *doc;
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double *mean_class_dist;	/* committee-mean class distribution */
  double mean_class_sum;
  int committee;
  int class;
  int k;

  /* Stack-allocated scratch distribution, one slot per class. */
  mean_class_dist = alloca (sizeof (double) * num_classes);

  assert(num_to_add <= total_unknown);
  /* ensures our max-kl for probability mapping is correct */
  assert(bow_barrel_num_classes(doc_barrel) >= committee_size);

  /* Calculate the entropy of the class labels, H(Class|d,Committee),
     where Class and Committee are random varibles, and put this in
     SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      scores[k].weight = 0;

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      /* Accumulate each member's per-class weight, indexed by the
	 class index stored in .di (entries may be in score order,
	 not class order). */
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[scores[k].scores[committee][class].di] +=
	    scores[k].scores[committee][class].weight;
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      /* Each member's distribution sums to 1, so the total should be
	 very close to COMMITTEE_SIZE. */
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to KL-divergence-to-the-mean averaged over all
	 committee members. */
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (class = 0; class < bow_barrel_num_classes (doc_barrel);
	       class++)
	    {
	      /* Skip near-zero member weights to avoid log(0). */
	      if (1e-100 < scores[k].scores[committee][class].weight)
		{
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * scores[k].scores[committee][class].weight
		     * log (mean_class_dist
			    [scores[k].scores[committee][class].di]
			    / scores[k].scores[committee][class].weight));
		  if (scores[k].weight < -0.1)
		    bow_error("scores[k].weight < -0.1: %.20f, %.20f",
			      scores[k].weight,
			      log (mean_class_dist
				   [scores[k].scores[committee][class].di]
				   / scores[k].scores[committee][class].weight));
		}
	    }
	}
      /* KL divergence must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight);
      /* adjust for the correct log factor */
      scores[k].weight *= 1.0 / log(2);
      /* convert to a probability by scaling with the max kl-to-the-mean */
      scores[k].weight /= -1.0 * log (1.0 / (double) committee_size) / log(2);
      /* multiply in the epsilon factor */
      scores[k].weight *= active_stream_epsilon;
    }

  /* select some documents randomly according to the weights
     (rejection sampling, identical scheme to the entropy variant
     above). */
  for (j=0, k=0; k < num_to_add; j++)
    {
      int scoresi = rand() % total_unknown;
      int doci;
      double coin_flip;

      doci = scores[scoresi].di;
      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);
      assert (doc);
      if (doc->type == bow_doc_unlabeled)
	{
	  coin_flip = bow_random_double (0,1);
	  if (scores[scoresi].weight > coin_flip)
	    {
	      doc->type = bow_doc_train;
	      k++;
	      bow_verbosify (bow_progress, "Labeling %s, weight %f, flip %f",
			     doc->filename, scores[scoresi].weight,
			     coin_flip);
	      for (committee=0; committee < committee_size; committee++)
		bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]",
			      scores[scoresi].scores[committee][0].di,
			      scores[scoresi].scores[committee][0].weight,
			      scores[scoresi].scores[committee][1].di,
			      scores[scoresi].scores[committee][1].weight);
	      bow_verbosify(bow_progress, "\n");
	    }
	}
      if (j > doc_barrel->cdocs->length * 1000)
	bow_error ("Random number generator could not find enough "
		   "unlabeled documents to convert.");
    }
  return;
}

/* Functions for calculating document density. */

/* Predicate: should this document participate in the density
   calculation?  True for train, unlabeled, pool and waiting docs. */
int
active_cdoc_is_used_for_density (bow_cdoc *cdoc)
{
  return ((cdoc->type == bow_doc_train)
	  || (cdoc->type == bow_doc_unlabeled)
	  || (cdoc->type == bow_doc_pool)
	  || (cdoc->type == bow_doc_waiting));
}

/* Given a document barrel, set the CDOC->NORMALIZER to the document
   word entropy.  Return the sum of the background cross entropies of
   all the documents.  Assumes that IDF has already been set to
   Pr(w) */
double
active_doc_barrel_set_entropy (bow_barrel *barrel)
{
  bow_wv *wv;
  bow_dv_heap *heap;
  int wvi;
  double pr_w_d;	/* Pr(w|d): word's relative frequency in doc */
  double entropy;
  double entropy_sum = 0;
  double total_background_kl = 0;
  int di;
  bow_cdoc *cdoc;
  bow_dv *dv;
  double word_kl;

  heap = bow_test_new_heap (barrel);

  /* xxx Make sure to update CDOC->WORD_COUNT for a new vocabulary! */
  while ((di = bow_heap_next_wv (heap, barrel, &wv,
				 active_cdoc_is_used_for_density))
	 != -1)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, di);
      entropy = 0;
      for (wvi = 0; wvi < wv->num_entries; wvi++)
	{
	  pr_w_d = ((double)wv->entry[wvi].count) / cdoc->word_count;
	  entropy -= pr_w_d * log (pr_w_d);
	  /* Cross-entropy term against the smoothed background
	     distribution (DV->IDF holds Pr(w), set by
	     active_doc_barrel_set_pr_w). */
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
	  word_kl = (- pr_w_d * log ((1 - active_alpha) * dv->idf));
	  assert (word_kl >= 0);
	  total_background_kl += word_kl;
	}
      /* KL(d || background) = cross-entropy - entropy. */
      total_background_kl -= entropy;
      cdoc->normalizer = entropy;
      entropy_sum += entropy;
    }
  return total_background_kl;
}

/* Given a document barrel, set the WI2DVF->IDF to Pr(w) */
void
active_doc_barrel_set_pr_w (bow_barrel *barrel)
{
  int wi, max_wi, dvi;
  int total_num_words;
  bow_dv *dv;

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  total_num_words = 0;
  /* First pass: per-word raw counts into DV->IDF, plus the corpus
     total. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (!dv)
	continue;
      dv->idf = 0;
      for (dvi = 0; dvi < dv->length; dvi++)
	{
	  dv->idf += dv->entry[dvi].count;
	  total_num_words += dv->entry[dvi].count;
	}
    }
  /* Second pass: normalize counts into probabilities. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (!dv)
	continue;
      dv->idf /= total_num_words;
    }
}

/* Return the density of document WV, calculated using a KL divergence
   distance to all other documents. */
float
active_wv_density (bow_wv *wv, bow_barrel *barrel, float background_kl)
{
  int wvi;
  /* bow_bitvec *document_touched =
     bow_bitvec_new (1, barrel->cdocs->length); */
  double pr_w_d;
  double pr_w_wv;
  double pr_w_wv_missing;
  double total_kl;	/* sum of KL divergence to all other docs */
  bow_dv *dv;
  int dvi;
  bow_cdoc *cdoc;

  /* Set to background KL, that a document with no words would
     have. */
  pr_w_wv_missing = 1.0 / (wv->num_entries + barrel->wi2dvf->num_words);
  /*total_kl = - barrel->cdocs->length * log (pr_w_wv_missing);*/
  total_kl = background_kl;
  /* NaN check: x == x is false iff x is NaN. */
  assert (total_kl == total_kl);
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
      if (!dv)
	continue;
      /* Pr(w|WV), smoothed with the background Pr(w) via
	 ACTIVE_ALPHA.  NOTE(review): the maximum-likelihood term
	 divides by WV->NUM_ENTRIES (distinct words), not the
	 document's total word count — confirm this is intended. */
      pr_w_wv = ((active_alpha * ((double)wv->entry[wvi].count)
		  / wv->num_entries)
		 + ((1 - active_alpha) * dv->idf));
      for (dvi = 0; dvi < dv->length; dvi++)
	{
	  cdoc = bow_array_entry_at_index (barrel->cdocs,
					   dv->entry[dvi].di);
	  pr_w_d = ((double)dv->entry[dvi].count) / cdoc->word_count;
	  /* Remove from the total what we said its contribution
	     would be above in the background calculation. */
	  /*total_kl += pr_w_d * log (pr_w_wv_missing);*/
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
	  total_kl += pr_w_d * log ((1 - active_alpha) * dv->idf);
	  assert (total_kl == total_kl);
	  /* Add in the true contribution */
	  total_kl -= pr_w_d * log (pr_w_wv);
	  assert (total_kl == total_kl);
	}
    }
  return total_kl;
}

/* Given a document barrel, set the CDOC->PRIOR to the document
   density, using a KL divergence distance to all other documents.
   Uses train and unlabeled documents.  Also sets the
   CDOC->NORMALIZER to the document entropy */
void
active_doc_barrel_set_density (bow_barrel *barrel)
{
  bow_dv_heap *heap;
  int di;
  bow_wv *wv;
  bow_cdoc *cdoc;
  double background_kl;

  active_doc_barrel_set_pr_w (barrel);
  background_kl = active_doc_barrel_set_entropy (barrel);
  heap = bow_test_new_heap (barrel);
  while ((di = bow_heap_next_wv (heap, barrel, &wv,
				 active_cdoc_is_used_for_density))
	 != -1)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, di);
      cdoc->prior = active_wv_density (wv, barrel, background_kl);
      /* Map the KL distance into (0,1] with an exponential decay
	 controlled by ACTIVE_BETA. */
      cdoc->prior = exp (- active_beta * cdoc->prior
			 / barrel->cdocs->length);
      /* printf ("%10g %s\n", cdoc->prior, cdoc->filename); */
    }
}

/* Create a class barrel using active learning */
bow_barrel *
active_learn (bow_barrel *doc_barrel)
{
  bow_barrel *vpc_barrel = NULL;	/* the vector-per-class barrel */
  int max_ci;
  int ci;
  int di;
  int mi;
  int round_num;
  int actual_num_hits;
  int num_unlabeled_docs = 0;
  int orig_num_unlabeled_docs;
  bow_dv_heap *test_heap;	/* we'll extract test WV's from here */
  bow_wv *query_wv;
  active_scores *scores;
  bow_cdoc *doc_cdoc;
  bow_cdoc *class_cdoc;
  rainbow_method *secondary_method;

  /* Set the CDOC->PRIOR to the "density" value. */
  if (active_selection_method == dkl)
    active_doc_barrel_set_density (doc_barrel);

  /* initialize variables */
  max_ci = bow_barrel_num_classes(doc_barrel);
  secondary_method =
    (rainbow_method*) bow_method_at_name (active_secondary_method);

  /* change all but vpc_with_weights */
  doc_barrel->method->set_weights = secondary_method->set_weights;
  doc_barrel->method->scale_weights = secondary_method->scale_weights;
  doc_barrel->method->normalize_weights =
    secondary_method->normalize_weights;
  doc_barrel->method->vpc_set_priors = secondary_method->vpc_set_priors;
  doc_barrel->method->score = secondary_method->score;
  doc_barrel->method->wv_set_weights = secondary_method->wv_set_weights;
  doc_barrel->method->wv_normalize_weights =
    secondary_method->wv_normalize_weights;
  doc_barrel->method->free_barrel = secondary_method->free_barrel;
  doc_barrel->method->params = secondary_method->params;

  /* find the binary positive class, if needed */
  if (active_binary_pos_classname != NULL)
    {
      assert(bow_barrel_num_classes(doc_barrel) == 2);
      for (ci = 0; ci < bow_barrel_num_classes(doc_barrel); ci++)
	{
	  if (!strcmp(active_binary_pos_classname,
		      filename_to_classname
		      (bow_barrel_classname_at_index (doc_barrel, ci))))
	    {
	      active_binary_pos_ci = ci;
	      break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -