📄 active.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 5 页
字号:
	  /* Normalize them so they sum to one. */	  nb_scores_sum = 0;	  for (class = 0; class < num_classes; class++)	    nb_scores_sum += nb_scores[committee][class];	  assert (nb_scores_sum > 0);	  for (class = 0; class < num_classes; class++)	    nb_scores[committee][class] /= nb_scores_sum;	}      /* Initialize the mean class distribution for this document. */      for (class = 0; class < num_classes; class++)	mean_class_dist[class] = 0;      for (committee = 0; committee < committee_size; committee++)	for (class = 0; class < num_classes; class++)	  mean_class_dist[class] += nb_scores[committee][class];      mean_class_sum = 0;      for (class = 0; class < num_classes; class++)	mean_class_sum += mean_class_dist[class];      assert (mean_class_sum > committee_size * 0.999);      assert (mean_class_sum < committee_size * 1.001);      for (class = 0; class < num_classes; class++)	mean_class_dist[class] /= mean_class_sum;      /* Set WEIGHT to KL-divergence-to-the-mean averaged over all 	 committee members. */      scores[k].weight = 0;      for (committee = 0; committee < committee_size; committee++)	{	  for (si = 0; si < bow_barrel_num_classes (doc_barrel); si++)	    {	      class = scores[k].scores[committee][si].di;	      if (1e-100 < nb_scores[committee][class])		{		  scores[k].weight -= 		    ((1.0 / committee_size)		     /* scale by kl-div of this document to this class */		     * nb_scores[committee][class]		     * log (mean_class_dist[class]			    / nb_scores[committee][class]));		}	    }	}      /* Scale the score by the document density. */      scores[k].weight *= cdoc->prior;      /* KL divergence must be greater than or equal to 0 */      if (scores[k].weight < -0.1)	bow_error("scores[k].weight <  -0.1: %.20f", scores[k].weight);    }  /* Sort based on weight */  qsort (scores, total_unknown, sizeof (active_scores),	 active_scores_compare);    /* Change doc types of those with highest entropy*/  for (k = 0; k < num_to_add ; k++)    {      bow_cdoc *doc;      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);      assert (doc);      assert (doc->type == bow_doc_unlabeled);      bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename,		     scores[k].weight);      for (committee=0; committee < committee_size; committee++)	bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]", 		      scores[k].scores[committee][0].di,		      scores[k].scores[committee][0].weight,		      scores[k].scores[committee][1].di,		      scores[k].scores[committee][1].weight);      bow_verbosify(bow_progress, "\n");      doc->type = bow_doc_train;    }    return;}/* select docs with the highest vote entropy (Dagan and Engelson) */voidactive_select_vote_entropy (bow_barrel *doc_barrel, active_scores *scores,  			    int num_to_add, int total_unknown, int committee_size){  int num_classes = bow_barrel_num_classes (doc_barrel);  double *mean_class_dist;  double mean_class_sum;  int committee;  int class;  int k;  int si;  mean_class_dist = alloca (sizeof (double) * num_classes);  /* Calculate the entropy of the class labels, H(Class|d,Committee),     where Class and Committee are random varibles, and put this in     SCORES->WEIGHT. */  for (k = 0; k < total_unknown; k++)    {      scores[k].weight = 0;      /* Initialize the scores to be 'votes' */      for (committee = 0; committee < committee_size; committee++)	{	  scores[k].scores[committee][0].weight = 1.0;	  for (si = 1; si < num_classes; si++)	    scores[k].scores[committee][si].weight = 0.0;	}      /* Initialize the mean class distribution for this document. */      for (class = 0; class < num_classes; class++)	mean_class_dist[class] = 0;      for (committee = 0; committee < committee_size; committee++)	for (class = 0; class < num_classes; class++)	  mean_class_dist[scores[k].scores[committee][class].di]	    += scores[k].scores[committee][class].weight;      mean_class_sum = 0;      for (class = 0; class < num_classes; class++)	mean_class_sum += mean_class_dist[class];      assert (mean_class_sum > committee_size * 0.999);      assert (mean_class_sum < committee_size * 1.001);      for (class = 0; class < num_classes; class++)	mean_class_dist[class] /= mean_class_sum;      /* Calculate the entropy of the mean class distribution */      for (class = 0; class < bow_barrel_num_classes (doc_barrel); class++)	{	  if (1e-100 <  mean_class_dist[class])	    {	      scores[k].weight -= 		(mean_class_dist[class]		 * log (mean_class_dist[class]));	    }	}      /* Entropy must be greater than or equal to 0 */      if (scores[k].weight < -0.1)	bow_error("scores[k].weight <  -0.1: %.20f", scores[k].weight);          }  /* Sort based on weight */  qsort (scores, total_unknown, sizeof (active_scores),	 active_scores_compare);    /* Change doc types of those with highest entropy*/  for (k = 0; k < num_to_add; )    {      int z;      double top_score;      int n;      int j;      /* find how many top ranked docs have same score */      top_score = scores[k].weight;      for (z=k; z < total_unknown && scores[z].weight == top_score ; z++);      /* add all with top score if won't max it out */      if (z < num_to_add)	{	  for (n=k; n<z; n++, k++)	    {	      bow_cdoc *doc;	      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);	      assert (doc);	      assert (doc->type == bow_doc_unlabeled);	      bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename,			     scores[n].weight);	      for (committee=0; committee < committee_size; committee++)		bow_verbosify(bow_progress, " %d", 			      scores[n].scores[committee][0].di);	      bow_verbosify(bow_progress, "\n");	      doc->type = bow_doc_train;	    }	}      else	{	  /* need to randomly select some of the docs for labeling */	  for (j=0, n=k; n < num_to_add; j++)	    {	      int si = (rand() % (z-k)) + k;	      int doci;	      bow_cdoc *doc;	      	      doci = scores[si].di;	      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);	      assert (doc);	      if (doc->type == bow_doc_unlabeled)		{		  doc->type = bow_doc_train;		  bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename,				 scores[si].weight);		  for (committee=0; committee < committee_size; committee++)		    bow_verbosify(bow_progress, " %d", 				  scores[si].scores[committee][0].di);		  bow_verbosify(bow_progress, "\n");		  n++;		}	      if (j > doc_barrel->cdocs->length * 1000)		bow_error ("Random number generator could not find enough "			   "unlabeled documents to convert.");	    }	  return;	}    }  return;}voidactive_select_uncertain (bow_barrel *doc_barrel, active_scores *scores,  			 int num_to_add, int total_unknown,			 int committee_size){  int k;    assert(num_to_add <= total_unknown);  assert(committee_size == 1);  /* Make smallest top classification better */  for (k=0; k < total_unknown; k++)    {      scores[k].weight = -1 * scores[k].scores[0][0].weight;    }  /* sort based on weight */  qsort(scores, total_unknown, sizeof (active_scores),	active_scores_compare);  /* change doc types */  for (k=0; k < num_to_add; k++)    {      bow_cdoc *doc;      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);      assert(doc);      assert(doc->type == bow_doc_unlabeled);      bow_verbosify(bow_progress, "Labeling %s\n", doc->filename);      doc->type = bow_doc_train;    }  return; }voidactive_select_relevant (bow_barrel *doc_barrel, active_scores *scores,  			int num_to_add, int total_unknown,			int committee_size){  int k;  assert(num_to_add <= total_unknown);  assert(committee_size == 1);  for (k=0; k < total_unknown; k++)    {      scores[k].weight = -1 *scores[k].scores[0][0].weight;    }	    /* sort based on weight */  qsort(scores, total_unknown, sizeof (active_scores),	active_scores_compare);  /* change doc types */  for (k = total_unknown - num_to_add; k < total_unknown; k++)    {      bow_cdoc *doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);      assert(doc);      assert(doc->type == bow_doc_unlabeled);      bow_verbosify(bow_progress, "Labeling %s\n", doc->filename);      doc->type = bow_doc_train;    }  return;}void active_select_length(bow_barrel *doc_barrel, active_scores *scores,  			   int num_to_add, int total_unknown, int committee_size){  int k;    assert(num_to_add <= total_unknown);  /* set weight to the document length */  for (k=0; k < total_unknown; k++)    {      bow_cdoc *cdoc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);      scores[k].weight = cdoc->word_count;    }   /* sort based on weight */  qsort(scores, total_unknown, sizeof (active_scores),	active_scores_compare);  /* change doc types */  for (k = 0 ;  k < num_to_add; k++)    {      bow_cdoc *doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);            assert(doc);      assert(doc->type == bow_doc_unlabeled);      bow_verbosify(bow_progress, "Labeling %s, weight %f\n", doc->filename,		    scores[k].weight);      doc->type = bow_doc_train;    }  return;}voidactive_select_random (bow_barrel *doc_barrel, active_scores *scores,  		      int num_to_add, int total_unknown,		      int committee_size){  int j;  int k;  bow_cdoc *doc;  assert(num_to_add <= total_unknown);  for (j=0, k=0; k < num_to_add; j++)    {      int scoresi = rand() % total_unknown;      int doci;      doci = scores[scoresi].di;      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);      assert (doc);      if (doc->type == bow_doc_unlabeled)	{	  doc->type = bow_doc_train;	  bow_verbosify(bow_progress, "Labeling %s\n", doc->filename);	  k++;	}      if (j > doc_barrel->cdocs->length * 1000)	bow_error ("Random number generator could not find enough "		   "unlabeled documents to convert.");    }  return;}voidactive_select_stream_ve (bow_barrel *doc_barrel, active_scores *scores,  			 int num_to_add, int total_unknown,			 int committee_size){  int j;  bow_cdoc *doc;    int num_classes = bow_barrel_num_classes (doc_barrel);  double *mean_class_dist;  double mean_class_sum;  int committee;  int class;  int k;  int si;  assert(num_to_add <= total_unknown);  mean_class_dist = alloca (sizeof (double) * num_classes);  /* Calculate the entropy of the class labels, H(Class|d,Committee),     where Class and Committee are random varibles, and put this in     SCORES->WEIGHT. */  for (k = 0; k < total_unknown; k++)    {      scores[k].weight = 0;      /* Initialize the scores to be 'votes' */      for (committee = 0; committee < committee_size; committee++)	{	  scores[k].scores[committee][0].weight = 1.0;	  for (si = 1; si < num_classes; si++)	    scores[k].scores[committee][si].weight = 0.0;	}      /* Initialize the mean class distribution for this document. */      for (class = 0; class < num_classes; class++)	mean_class_dist[class] = 0;      for (committee = 0; committee < committee_size; committee++)	for (class = 0; class < num_classes; class++)	  mean_class_dist[scores[k].scores[committee][class].di]	    += scores[k].scores[committee][class].weight;      mean_class_sum = 0;      for (class = 0; class < num_classes; class++)	mean_class_sum += mean_class_dist[class];      assert (mean_class_sum > committee_size * 0.999);      assert (mean_class_sum < committee_size * 1.001);      for (class = 0; class < num_classes; class++)	mean_class_dist[class] /= mean_class_sum;      /* Calculate the entropy of the mean class distribution */      for (class = 0; class < bow_barrel_num_classes (doc_barrel); class++)
💿 文件大小 12 K
👤 上传用户 Numb_pqc
📂 所属分类 Linux/Unix编程
📄 代码行数 2,028 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -