📄 active.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 4 5
	    }	}           if (active_binary_pos_ci == -1)	{	  bow_error("No such class %s.", active_binary_pos_classname);	}    }  /* print out the model docs */  for (di=0; di < doc_barrel->cdocs->length; di++)    {      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);            if (cdoc->type == bow_doc_train)	bow_verbosify (bow_progress, "Initial %s\n", cdoc->filename);    }  /* count the number of unlabeled docs */  for (di=0; di < doc_barrel->cdocs->length; di++)    {      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);            if (cdoc->type == bow_doc_unlabeled)	num_unlabeled_docs++;    }  orig_num_unlabeled_docs = num_unlabeled_docs;  /* allocate the correct amount of space for unlabeled scoring */  scores = bow_malloc (sizeof(active_scores) * num_unlabeled_docs);  for (di = 0; di < num_unlabeled_docs; di++)    {      scores[di].scores = bow_malloc (sizeof(bow_score *) * active_committee_size);      for (mi = 0; mi < active_committee_size; mi++)	{	  scores[di].scores[mi] = bow_malloc (sizeof (bow_score) * 					      bow_barrel_num_classes(doc_barrel));	}    }  /* make the class barrel */  vpc_barrel =   bow_barrel_new (doc_barrel->wi2dvf->size,				 doc_barrel->cdocs->length-1,				 doc_barrel->cdocs->entry_size,				 doc_barrel->cdocs->free_func);   vpc_barrel->method = doc_barrel->method;  vpc_barrel->classnames = bow_int4str_new (0);  /* And, we're off */  for (round_num = 0; round_num < active_num_rounds; round_num++)    {      int hiti = 0;      /* Re-create the vector-per-class barrel in accordance with the	 new train/test settings. */      /* if we're pruning the vocab, do that now - fix for unlabeled percent */      if (bow_prune_vocab_by_infogain_n)	{	  /* Change barrel by removing words with small info gain, if requested. 
*/	  	  bow_barrel_keep_top_words_by_infogain	    (bow_prune_vocab_by_infogain_n, doc_barrel, 	     bow_barrel_num_classes (doc_barrel));	}            /* Set word_count set correctly and set the entropy of each document	 in the normalizer of the cdoc; do this after all vocab changing */      {	query_wv = NULL;		/* Create the heap from which we'll get WV's. */	test_heap = bow_test_new_heap (doc_barrel);		/* Loop once for each document. */	while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,				       bow_cdoc_yes)) != -1)	  {	    int word_count = 0;	    int wvi;	    	    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 						 di);	    bow_wv_set_weights (query_wv, vpc_barrel);	    bow_wv_normalize_weights (query_wv, vpc_barrel);	    	    for (wvi = 0; wvi < query_wv->num_entries; wvi++)	      {		word_count += query_wv->entry[wvi].count;	      }	    	    doc_cdoc->word_count = word_count;	    doc_cdoc->normalizer = active_document_entropy(query_wv);	  }      }      /* generate test stats for the step in active learning */      if (active_test_stats)	{	  bow_em_perturb_method reset_perturb_start = -1;	  int reset_num_em_runs = -1;	  int reset_em_cross_entropy = -1;	  	  /* turn variance off for test stats */	  if (bow_em_perturb_starting_point != bow_em_perturb_none)	    {	      reset_perturb_start = bow_em_perturb_starting_point;	      bow_em_perturb_starting_point = bow_em_perturb_none;	    }	  /* make a real number of EM rounds just for printing tests */	  if (active_final_em)	    {	      reset_num_em_runs = bow_em_num_em_runs;	      bow_em_num_em_runs = 7;	    }	  /* Do no EM for stats-reporting */	  if (active_no_final_em)	    {	      reset_num_em_runs = bow_em_num_em_runs;	      bow_em_num_em_runs = 1;	    }	  /* turn cross entropy off if just testing docs */	  if (active_selection_method == wkl	      || active_selection_method == dkl)	    {	      reset_em_cross_entropy = em_cross_entropy;	      em_cross_entropy = 0;	    }	  if (vpc_barrel != 
NULL)	    bow_free_barrel (vpc_barrel);	  vpc_barrel = 	    (*(secondary_method->vpc_with_weights))(doc_barrel);	  active_test(stdout, doc_barrel, vpc_barrel);	  	  /* turn variance back on for committee members */	  if (reset_perturb_start != -1)	    bow_em_perturb_starting_point = reset_perturb_start;	  /* turn EM off for committee members */	  if (reset_num_em_runs != -1)	    bow_em_num_em_runs = reset_num_em_runs;	  /* turn cross entropy back on if necessary */	  if (reset_em_cross_entropy != -1)	    em_cross_entropy = reset_em_cross_entropy;	}            if (active_perturb_after_em)	{	  if (vpc_barrel)	    bow_barrel_free(vpc_barrel);	  vpc_barrel = 	    (*(secondary_method->vpc_with_weights))(doc_barrel);	}      for (mi = 0; mi < active_committee_size; mi++)	{	  bow_barrel *comm_barrel = NULL;	  	  hiti = 0;	  	  if (active_perturb_after_em)	    {	      comm_barrel = bow_barrel_copy(vpc_barrel);	      bow_em_perturb_starting_point = bow_em_perturb_with_dirichlet;	      bow_em_perturb_weights(comm_barrel, doc_barrel);	      bow_em_perturb_starting_point = bow_em_perturb_none;	    }	  else	    {	      comm_barrel = 		(*(secondary_method->vpc_with_weights))(doc_barrel);	    }	  		  if (active_print_committee_matrices)	    {	      active_test(stdout, doc_barrel, comm_barrel);	    }	  /* score all the unlabeled docs */	  	  /* Create the heap from which we'll get WV's. */	  test_heap = bow_test_new_heap (doc_barrel);	  	  /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */	  query_wv = NULL;	  	  /* Loop once for each unlabeled document. 
*/	  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,					 bow_cdoc_is_unlabeled))		 != -1)	    {	      doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 						   di);	      class_cdoc = bow_array_entry_at_index (comm_barrel->cdocs, 						     doc_cdoc->class);	      bow_wv_set_weights (query_wv, comm_barrel);	      bow_wv_normalize_weights (query_wv, comm_barrel);	      actual_num_hits = 		bow_barrel_score (comm_barrel, 				  query_wv, scores[hiti].scores[mi],				  bow_barrel_num_classes(comm_barrel), -1);	      assert (actual_num_hits == bow_barrel_num_classes(comm_barrel));	      if (mi == 0)		scores[hiti].di = di;	      else		assert (di == scores[hiti].di);	      hiti++;	    }	  bow_barrel_free (comm_barrel);	}      num_unlabeled_docs = hiti;      /* remap the scores if desired */      if (active_remap_scores_pr)	active_remap_scores(doc_barrel, scores,			    num_unlabeled_docs, active_committee_size);      /* choose docs to convert to model */      active_select_docs(doc_barrel, scores, 			 active_add_per_round, num_unlabeled_docs, 			 active_committee_size);   }  /* turn off perturbing for building final barrel */  if (bow_em_perturb_starting_point != bow_em_perturb_none)    {      bow_em_perturb_starting_point = bow_em_perturb_none;    }  /* make a real number of EM rounds if final em run */  if (active_final_em)    {      bow_em_num_em_runs = 7;    }  /* Do no EM for stats-reporting */  if (active_no_final_em)    {      bow_em_num_em_runs = 1;    }    /* turn cross entropy off if just testing docs */  if (active_selection_method == wkl      || active_selection_method == dkl)    {      em_cross_entropy = 0;    }  if (vpc_barrel != NULL)    bow_free_barrel(vpc_barrel);  vpc_barrel =     (*(secondary_method->vpc_with_weights))(doc_barrel);  /* free scores */  for (di = 0; di < orig_num_unlabeled_docs ; di++)    {      for (mi = 0; mi < active_committee_size; mi ++)	{	  bow_free (scores[di].scores[mi]);	}      bow_free(scores[di].scores);   
 }  bow_free(scores);  return vpc_barrel;}voidactive_undef_prior (bow_barrel *vpc_barrel,		    bow_barrel *doc_barrel){  bow_error("Active priors depends on secondary method.");  return;}intactive_undef_score (bow_barrel *barrel, bow_wv *query_wv, 		    bow_score *bscores, int bscores_len,		    int loo_class){  bow_error("Active scoring depends on secondary method.");  return -1;}/* Run test trials, outputing results to TEST_FP.  The results are   indended to be read and processed by the Perl script   ./rainbow-stats. */voidactive_test (FILE *test_fp, bow_barrel *rainbow_doc_barrel,		    bow_barrel *rainbow_class_barrel){  bow_dv_heap *test_heap;	/* we'll extract test WV's from here */  bow_wv *query_wv;  int di;			/* a document index */  bow_score *hits = NULL;  int num_hits_to_retrieve=0;  int actual_num_hits;  int hi;			/* hit index */  bow_cdoc *doc_cdoc;  bow_cdoc *class_cdoc;  fprintf (test_fp, "#0\n");  num_hits_to_retrieve = bow_barrel_num_classes (rainbow_class_barrel);  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);  /* Create the heap from which we'll get WV's. */  test_heap = bow_test_new_heap (rainbow_doc_barrel);  /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */  query_wv = NULL;  /* Loop once for each test document.  NOTE: This will skip documents     that don't have any words that are in the vocabulary. 
*/  while ((di = bow_heap_next_wv (test_heap, rainbow_doc_barrel, &query_wv,				 bow_cdoc_is_test)) != -1)    {      doc_cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, 					   di);            class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, 					     doc_cdoc->class);      bow_wv_set_weights (query_wv, rainbow_class_barrel);      bow_wv_normalize_weights (query_wv, rainbow_class_barrel);      actual_num_hits = 	bow_barrel_score (rainbow_class_barrel, 			  query_wv, hits,			  num_hits_to_retrieve, -1);      assert (actual_num_hits == num_hits_to_retrieve);#if 0      printf ("%8.6f %d %8.6f %8.6f %d ",	      class_cdoc->normalizer, 	      class_cdoc->word_count, 	      class_cdoc->normalizer / class_cdoc->word_count, 	      class_cdoc->prior,	      doc_cdoc->class);      if (hits[0].di == doc_cdoc->class)	printf ("1\n");      else	printf ("0\n");#endif      fprintf (test_fp, "%s %s ", 	       doc_cdoc->filename, 	       filename_to_classname(class_cdoc->filename));      for (hi = 0; hi < actual_num_hits; hi++)	{	  class_cdoc = 	    bow_array_entry_at_index (rainbow_class_barrel->cdocs,				      hits[hi].di);	  fprintf (test_fp, "%s:%.*g ", 		   bow_barrel_classname_at_index		   (rainbow_class_barrel, hits[hi].di),		   bow_score_print_precision,		   hits[hi].weight);	}      fprintf (test_fp, "\n");    }}rainbow_method bow_method_active = {  "active",  NULL, /* bow_leave_weights_alone_since_theyre_really_counts */  0,				/* no weight scaling function */  NULL, /* bow_barrel_normalize_weights_by_summing, */  active_learn,  active_undef_prior,  active_undef_score,  bow_wv_set_weights_to_count,  NULL,				/* no need for extra weight normalization */  NULL  };void _register_method_active () __attribute__ ((constructor));void _register_method_active (){  static int done = 0;  if (done)     return;  bow_method_register_with_name ((bow_method*)&bow_method_active,				 "active", 				 sizeof (rainbow_method),				 &active_argp_child);  
bow_argp_add_child (&active_argp_child);  done = 1;}
上一页 1 2 3 4 5
💿 文件大小 12 K
👤 上传用户 Numb_pqc
📂 所属分类 Linux/Unix编程
📄 代码行数 2,028 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -