📄 active.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* Weight-setting and scoring implementation for active learning */

/* Copyright (C) 1997, 1998, 1999 Andrew McCallum

   Written by:  Kamal Nigam <knigam@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <math.h>
#include <argp/argp.h>
#include <stdlib.h>
#include <string.h>
#include <bow/em.h>

/* The document-selection strategies.  Each value is paired with one of
   the active_select_* functions by active_parse_opt() when the
   --active-selection-method option is processed. */
typedef enum {
  dkl,
  length,
  qbc,
  randomly,
  relevance,
  skl,
  sve,
  uncertainty,
  ve,
  wkl
} active_selection_type;

/* Per-document record handed to the selection functions. */
typedef struct _active_scores {
  int di;   /* the doc barrel index of the doc */
  double weight;  /* weight used for selecting */
  bow_score **scores;  /* the scores of the doc; active_remap_scores()
                          indexes this as scores[member][hi] */
} active_scores;

/* Forward declarations of the selection strategies.  They all share the
   signature expected by the `active_select_docs' function pointer. */
void active_select_length (bow_barrel *doc_barrel, active_scores *scores,
                           int num_to_add, int total_unknown,
                           int committee_size);
void active_select_uncertain (bow_barrel *doc_barrel, active_scores *scores,
                              int num_to_add, int total_unknown,
                              int committee_size);
void active_select_relevant (bow_barrel *doc_barrel, active_scores *scores,
                             int num_to_add, int total_unknown,
                             int committee_size);
void active_select_random (bow_barrel *doc_barrel, active_scores *scores,
                           int num_to_add, int total_unknown,
                           int committee_size);
void active_select_qbc (bow_barrel *doc_barrel, active_scores *scores,
                        int num_to_add, int total_unknown,
                        int committee_size);
void active_select_weighted_kl (bow_barrel *doc_barrel, active_scores *scores,
                                int num_to_add, int total_unknown,
                                int committee_size);
void active_select_dkl (bow_barrel *doc_barrel, active_scores *scores,
                        int num_to_add, int total_unknown,
                        int committee_size);
void active_select_vote_entropy (bow_barrel *doc_barrel, active_scores *scores,
                                 int num_to_add, int total_unknown,
                                 int committee_size);
void active_select_stream_ve (bow_barrel *doc_barrel, active_scores *scores,
                              int num_to_add, int total_unknown,
                              int committee_size);
void active_select_stream_kl (bow_barrel *doc_barrel, active_scores *scores,
                              int num_to_add, int total_unknown,
                              int committee_size);

void active_test (FILE *test_fp, bow_barrel *rainbow_doc_barrel,
                  bow_barrel *rainbow_class_barrel);

/* The variables that can be changed on the command line, with defaults: */
static int active_add_per_round = 4;
static int active_test_stats = 0;
static int active_committee_size = 1;
static active_selection_type active_selection_method = uncertainty;
static int active_num_rounds = 10;
/* The selection function matching `active_selection_method'. */
static void (* active_select_docs)(bow_barrel *, active_scores *,
                                   int, int, int) =
                active_select_uncertain;
static int active_binary_pos_ci = -1;
static char* active_binary_pos_classname = NULL;
static char* active_secondary_method = "naivebayes";
static int active_final_em = 0;
static int active_print_committee_matrices = 0;
static int active_qbc_low_kl = 0;
static int active_pr_print_stat_summary = 0;
/* Used as a modulo divisor in active_remap_scores(), so it must stay
   strictly positive. */
static int active_pr_window_size = 20;
static int active_remap_scores_pr = 0;
static int active_no_final_em = 0;
static double active_alpha = 0.5;
static double active_beta = 5;
static double active_stream_epsilon = 0.3;
static int active_perturb_after_em = 0;

/* The integer or single char used to represent this command-line option.
   Make sure it is unique across all libbow and rainbow. */
enum {
  ACTIVE_ADD_PER_ROUND = 4000,
  ACTIVE_TEST_STATS,
  ACTIVE_SELECTION_METHOD,
  ACTIVE_NUM_ROUNDS,
  ACTIVE_BINARY_POS,
  ACTIVE_SECONDARY_METHOD,
  ACTIVE_COMMITTEE_SIZE,
  ACTIVE_FINAL_EM,
  ACTIVE_PRINT_COMMITTEE_MATRICES,
  ACTIVE_QBC_LOW_KL,
  ACTIVE_PR_PRINT_STAT_SUMMARY,
  ACTIVE_PR_WINDOW_SIZE,
  ACTIVE_REMAP_SCORES_PR,
  ACTIVE_NO_FINAL_EM,
  ACTIVE_BETA,
  ACTIVE_STREAM_EPSILON,
  ACTIVE_PERTURB_AFTER_EM,
};

/* The argp option table.  Adjacent string literals are concatenated by
   the compiler, so every literal that is continued on the next line
   ends with explicit whitespace. */
static struct argp_option active_options[] =
{
  {0,0,0,0,
   "Active Learning options:", 70},
  {"active-add-per-round", ACTIVE_ADD_PER_ROUND, "NUM", 0,
   "Specify the number of documents to label each round.  The default is 4."},
  {"active-test-stats", ACTIVE_TEST_STATS, 0, 0,
   "Generate output for test docs every n rounds."},
  {"active-selection-method", ACTIVE_SELECTION_METHOD, "METHOD", 0,
   "Specify the selection method for picking unlabeled docs. "
   "One of uncertainty, length, relevance, random, qbc, ve, wkl, dkl, "
   "sve, skl. "
   "The default is 'uncertainty'."},
  {"active-num-rounds", ACTIVE_NUM_ROUNDS, "NUM", 0,
   "The number of active learning rounds to perform.  The default is 10."},
  {"active-binary-pos", ACTIVE_BINARY_POS, "CLASS", 0,
   "The name of the positive class for binary classification.  Required for "
   "relevance sampling."},
  {"active-secondary-method", ACTIVE_SECONDARY_METHOD, "METHOD", 0,
   "The underlying method for active learning to use.  The default is 'naivebayes'."},
  {"active-committee-size", ACTIVE_COMMITTEE_SIZE, "NUM", 0,
   "The number of committee members to use with QBC.  Default is 1."},
  {"active-final-em", ACTIVE_FINAL_EM, 0, 0,
   "Finish with a full round of EM."},
  {"active-print-committee-matrices", ACTIVE_PRINT_COMMITTEE_MATRICES, 0, 0,
   "Print the confusion matrix for each committee member at each round."},
  {"active-qbc-low-kl", ACTIVE_QBC_LOW_KL, 0, 0,
   "Select documents with the lowest kl-divergence instead of the highest."},
  {"active-pr-print-stat-summary", ACTIVE_PR_PRINT_STAT_SUMMARY, 0, 0,
   "Print the precision recall curves used for score to probability remapping."},
  {"active-pr-window-size", ACTIVE_PR_WINDOW_SIZE, "NUM", 0,
   "Set the window size for precision-recall score to probability remapping.  "
   "The default is 20."},
  {"active-remap-scores-pr", ACTIVE_REMAP_SCORES_PR, 0, 0,
   "Remap scores with sneaky precision-recall tricks."},
  {"active-no-final-em", ACTIVE_NO_FINAL_EM, 0, 0,
   "Finish without a full round of EM."},
  {"active-beta", ACTIVE_BETA, "NUM", 0,
   "Increase spread of document densities."},
  {"active-stream-epsilon", ACTIVE_STREAM_EPSILON, "NUM", 0,
   "The rate factor for selecting documents in stream sampling."},
  {"active-perturb-after-em", ACTIVE_PERTURB_AFTER_EM, 0, 0,
   "Perturb after running EM to create committee members."},
  {0, 0}
};

/* argp parser callback for the active-learning options.

   KEY is one of the ACTIVE_* option keys above and ARG is the option's
   argument text (only read for options declared with an argument).
   STATE is not used.  The matching file-scope variable is updated; for
   --active-selection-method both `active_selection_method' and the
   `active_select_docs' function pointer are set together.

   Returns 0 when KEY was handled and ARGP_ERR_UNKNOWN otherwise.  An
   unrecognized selection method or a non-positive window size is
   reported through bow_error(). */
error_t
active_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case ACTIVE_ADD_PER_ROUND:
      active_add_per_round = atoi(arg);
      break;
    case ACTIVE_TEST_STATS:
      active_test_stats = 1;
      break;
    case ACTIVE_SELECTION_METHOD:
      if (!strcmp(arg, "uncertainty"))
        {
          active_selection_method = uncertainty;
          active_select_docs = active_select_uncertain;
        }
      else if (!strcmp(arg, "length"))
        {
          active_selection_method = length;
          active_select_docs = active_select_length;
        }
      else if (!strcmp(arg, "relevance"))
        {
          active_selection_method = relevance;
          active_select_docs = active_select_relevant;
        }
      else if (!strcmp(arg, "random"))
        {
          active_selection_method = randomly;
          active_select_docs = active_select_random;
        }
      else if (!strcmp(arg, "qbc"))
        {
          active_selection_method = qbc;
          active_select_docs = active_select_qbc;
        }
      else if (!strcmp(arg, "ve"))
        {
          active_selection_method = ve;
          active_select_docs = active_select_vote_entropy;
        }
      else if (!strcmp(arg, "wkl"))
        {
          active_selection_method = wkl;
          active_select_docs = active_select_weighted_kl;
        }
      else if (!strcmp(arg, "dkl"))
        {
          active_selection_method = dkl;
          active_select_docs = active_select_dkl;
        }
      else if (!strcmp(arg, "sve"))
        {
          active_selection_method = sve;
          active_select_docs = active_select_stream_ve;
        }
      else if (!strcmp(arg, "skl"))
        {
          active_selection_method = skl;
          active_select_docs = active_select_stream_kl;
        }
      else
        bow_error("Invalid argument for --active-selection-method");
      break;
    case ACTIVE_NUM_ROUNDS:
      active_num_rounds = atoi(arg);
      break;
    case ACTIVE_BINARY_POS:
      active_binary_pos_classname = arg;
      break;
    case ACTIVE_SECONDARY_METHOD:
      active_secondary_method = arg;
      break;
    case ACTIVE_COMMITTEE_SIZE:
      active_committee_size = atoi (arg);
      break;
    case ACTIVE_FINAL_EM:
      active_final_em = 1;
      break;
    case ACTIVE_PRINT_COMMITTEE_MATRICES:
      active_print_committee_matrices = 1;
      break;
    case ACTIVE_QBC_LOW_KL:
      active_qbc_low_kl = 1;
      break;
    case ACTIVE_REMAP_SCORES_PR:
      active_remap_scores_pr = 1;
      break;
    case ACTIVE_PR_WINDOW_SIZE:
      {
        /* The window size is later used as a modulo divisor, so a zero
           (which is also what atoi yields for non-numeric text) or a
           negative value must never be stored. */
        int window_size = atoi (arg);

        if (window_size <= 0)
          {
            bow_error("Invalid argument for --active-pr-window-size");
          }
        else
          {
            active_pr_window_size = window_size;
          }
      }
      break;
    case ACTIVE_PR_PRINT_STAT_SUMMARY:
      active_pr_print_stat_summary = 1;
      break;
    case ACTIVE_NO_FINAL_EM:
      active_no_final_em = 1;
      break;
    case ACTIVE_BETA:
      active_beta = atof (arg);
      break;
    case ACTIVE_STREAM_EPSILON:
      active_stream_epsilon = atof (arg);
      break;
    case ACTIVE_PERTURB_AFTER_EM:
      active_perturb_after_em = 1;
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

/* The argp structure tying the option table to its parser. */
static const struct argp active_argp =
{
  active_options,
  active_parse_opt
};

static struct argp_child active_argp_child =
{
  &active_argp,         /* This child's argp structure */
  0,                    /* flags for child */
  0,                    /* optional header in help message */
  0                     /* arbitrary group number for ordering */
};

/* End of command-line options specific to active learning */

/* Given a fully-specified file path name (all the way from `/'),
   return just the last filename part of it.  The result points into
   FILENAME itself; when FILENAME contains no `/' it is returned
   unchanged. */
static inline const char *
filename_to_classname (const char *filename)
{
  const char *ret;
  ret = strrchr (filename, '/');
  if (ret)
    return ret + 1;
  return filename;
}

/* cheat and look at the unlabeled data and convert the scores into
true probabilities based on a window size.  BUG: we're not resorting
the weights as we should be. */
void
active_remap_scores (bow_barrel *doc_barrel, active_scores *scores,
                     int total_unknown, int committee_size)
{
  int num_classes = bow_barrel_num_classes(doc_barrel);
  bow_em_pr_struct *pr_by_class[num_classes];
  int member;
  int ci;
  int scorei;
  int hi;

  /* malloc some space for pr stats */
  for (ci = 0; ci < num_classes; ci++)
    pr_by_class[ci] = bow_malloc(sizeof(bow_em_pr_struct) * total_unknown);

  for (member = 0; member < committee_size; member++)
    {
      /* arrange this members scores by class, and note correctness */
      for (scorei = 0; scorei < total_unknown; scorei++)
        {
          bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs,
                                                     scores[scorei].di);

          for (hi = 0; hi < num_classes; hi++)
            {
              pr_by_class[scores[scorei].scores[member][hi].di][scorei].score =
                scores[scorei].scores[member][hi].weight;
              pr_by_class[scores[scorei].scores[member][hi].di][scorei].correct =
                (cdoc->class == scores[scorei].scores[member][hi].di
                 ? 1 : 0);
            }
        }

      /* sort the scores for each class by descending score */
      for (ci = 0; ci < num_classes; ci ++)
        qsort(pr_by_class[ci], total_unknown, sizeof (bow_em_pr_struct),
              bow_em_pr_struct_compare);

      /* print out a summary of the stats */
      if (active_pr_print_stat_summary)
        {
          for (ci = 0; ci < num_classes; ci++)
            {
              int pr_index;
              int correct=0;
              int count=0;

              bow_verbosify(bow_progress, "%25s",
                            filename_to_classname
                            (bow_barrel_classname_at_index (doc_barrel, ci)));

              for (pr_index = 0; pr_index < total_unknown; pr_index++)
                {

                  if (pr_index % active_pr_window_size == 0)
                    {
                      if (pr_index != 0)
                        {
                          /* extend the window over entries that tie with
                             the last score of the window */
                          while (pr_index < total_unknown &&
                                 pr_by_class[ci][pr_index-1].score ==
                                 pr_by_class[ci][pr_index].score)
                            {
                              correct += pr_by_class[ci][pr_index].correct;
                              count++;
                              pr_index++;
                            }
                          /* NOTE(review): when the loop above stops because
                             pr_index reached total_unknown, the read of
                             pr_by_class[ci][pr_index] below looks like it is
                             one element past the allocation -- confirm. */
                          bow_verbosify(bow_progress, " %3.0f (%1.3f)",
                                        (float) correct * 100.0 / count,
                                        pr_by_class[ci][pr_index].score);
                          if (!(pr_index < total_unknown))
                            break;
                        }
                      correct = 0;
                      count = 0;
                    }
                  correct += pr_by_class[ci][pr_index].correct;
                  count++;

                  if (pr_by_class[ci][pr_index].correct != 0 &&
                      pr_by_class[ci][pr_index].correct != 1)
                    bow_error("Big Problem");
                }

              bow_verbosify(bow_progress, "\n");
            }
        }

      /* remap the scores to better probabilities */
      for (scorei = 0; scorei < total_unknown; scorei++)
        {
          /* NOTE(review): fixed capacity of 100; presumably indexed by
             class -- verify num_classes cannot exceed it. */
          double prob_by_ci[100];
          double total = 0.0;
12 3 4 5 下一页
💿 文件大小 12 K
👤 上传用户 Numb_pqc
📂 所属分类 Linux/Unix编程
📄 代码行数 2,028 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -