📄 dbacl.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 4 页
字号:
    }  }  /* finish last word */  *q = 0; /* append NUL to tok */  /* now write weight in hash */  id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);  k = find_in_learner(id); /* guaranteed to be found */  fill_ref_vars(k, tok, r);	  /* now calculate the unchanging part of the normalizing constant */  if(k->order < r) {    partial_z += (exp(r * (k->lam)) - 1.0) *       exp(r * UNPACK_LWEIGHTS(k->ltrms) + 	  UNPACK_LWEIGHTS(k->dref));  }  /* done */  return partial_z;}/* just for debugging */void print_score(token_order_t r) {  hash_count_t i;  double logprob = 0.0;  double lpapprox = 0.0;  double norm;  for(i = 0; i < learner.max_tokens; i++) {    if( FILLEDP(&learner.hash[i]) &&	(learner.hash[i].order <= r) ) {      logprob += learner.hash[i].count * (learner.hash[i].lam);      if( learner.hash[i].order == 1 ) {	logprob += learner.hash[i].count * 	  UNPACK_LWEIGHTS(learner.hash[i].dref)/((weight_t)r);      }    }    if( FILLEDP(&learner.hash[i]) &&	(learner.hash[i].order == r) ) {      lpapprox += learner.hash[i].count * 	((learner.hash[i].lam) + 	 UNPACK_LWEIGHTS(learner.hash[i].ltrms) + 	 UNPACK_LWEIGHTS(learner.hash[i].dref)/((weight_t)r));    }  }  norm = learner.fixed_order_token_count[r] * learner.logZ;  printf("*** logprob = %" FMT_printf_score_t " * %d (r = %d, logZ = %" FMT_printf_score_t ")\n", 	 ((score_t)(logprob / learner.fixed_order_token_count[r])), 	 learner.fixed_order_token_count[r], 	 r, learner.logZ);  printf("*** lpapprox = %" FMT_printf_score_t " * %d (r = %d, logZ = %" FMT_printf_score_t ")\n", 	 ((score_t)(lpapprox / learner.fixed_order_token_count[r])), 	 learner.fixed_order_token_count[r], 	 r, learner.logZ);}/* minimizes the divergence by solving for lambda one    component at a time.  */void minimize_learner_divergence() {  hash_count_t i;  token_order_t r;  token_count_t c = 0;  score_t d, dd, z;  score_t thresh, old_lam;  score_t upz, div_extra_bits;  if( options & (1<<OPTION_VERBOSE) ) {    fprintf(stdout, "now maximizing model entropy\n");  }  div_extra_bits = 0.0;  learner.logZ = 0.0;  learner.divergence = 0.0;  for(r = 1;       r <= ((options & (1<<OPTION_MULTINOMIAL)) ? 1 : learner.max_order);       r++) {    /* here we precalculate various bits and pieces       which aren't going to change during this iteration */    upz = recalculate_reference_measure(r);    if( r > 1 ) {      /* calculate extra bits for divergence score */      z = 0.0;      for(i = 0; i < learner.max_tokens; i++) {	if( FILLEDP(&learner.hash[i]) &&	    (learner.hash[i].order == (r - 1)) ) {	  z += (learner.hash[i].lam) * learner.hash[i].count;	}      }      div_extra_bits += (learner.fixed_order_token_count[r - 1] > 0) ? 	(z/learner.fixed_order_token_count[r-1]) : 0;    }    if( options & (1<<OPTION_REFMODEL) ) {      break;    }    if( options & (1<<OPTION_VERBOSE) ) {      fprintf(stdout, "* optimizing order %d weights (%i tokens, %i unique)\n", 	      r, learner.fixed_order_token_count[r],	      learner.fixed_order_unique_token_count[r]);    }    z = learner_Z(r, upz);    dd = learner_divergence(z, r);    do {      d = dd;     /* save old divergence */      thresh = 0.0;      /* change each lambda value one at a time,	 since we only rarely recalculate Z,	 we randomly decide to update a given lambda to	 prevent large errors and periodic behaviour */      for(i = 0; i < learner.max_tokens; i++) {	if( FILLEDP(&learner.hash[i]) &&	    (learner.hash[i].order == r) ) {	  if( rand() > (RAND_MAX>>3)) {	    old_lam = (learner.hash[i].lam);#define FIRST_METHOD#if defined FIRST_METHOD	    learner.hash[i].lam = 	      ( (log((learner.hash[i].count * z) / 		     learner.fixed_order_token_count[r]) /		 ((weight_t)r) ) -		UNPACK_LWEIGHTS(learner.hash[i].ltrms) - 		UNPACK_LWEIGHTS(learner.hash[i].dref) / 		((weight_t)r) );#else	    learner.hash[i].lam = 	      ( log((learner.hash[i].count * 		     (z - 		      exp(r * 			  ((learner.hash[i].lam) + 			   UNPACK_LWEIGHTS(learner.hash[i].ltrms)) +			  UNPACK_LWEIGHTS(learner.hash[i].dref)) )) /		    (learner.fixed_order_token_count[r] -		     learner.hash[i].count) ) / 		((weight_t)r) -		UNPACK_LWEIGHTS(learner.hash[i].ltrms) -		UNPACK_LWEIGHTS(learner.hash[i].dref) / 		((weight_t)r) );#endif	    if( isnan((learner.hash[i].lam)) ) {	      /* precision problem, just ignore */	      learner.hash[i].lam = (old_lam);	      z = learner_Z(r, upz);	      thresh = 0.0;	    } else {	      /* this ad hoc bit prevents really big changes in lambda,		 should help stability and prevent too many threshold 		 recalculations of Z */#define MAX_LAMBDA_JUMP 5	      if( (learner.hash[i].lam) > 		  (old_lam + MAX_LAMBDA_JUMP) ) {		learner.hash[i].lam = (old_lam + MAX_LAMBDA_JUMP);	      } else if( (learner.hash[i].lam) < 			 (old_lam - MAX_LAMBDA_JUMP) ) {		learner.hash[i].lam = (old_lam - MAX_LAMBDA_JUMP);	      }	      /* if the changes so far add up to a lot, we should recalculate Z */	      thresh += fabs(old_lam - (learner.hash[i].lam)) * 		learner.hash[i].count;	    }	  }	  if( thresh > learner.fixed_order_token_count[r] ) {	    z = learner_Z(r, upz);	    thresh = 0.0;	  }	  if( ++c >= learner.fixed_order_unique_token_count[r] ) {	    c = 0;	    break;	  }	}      }      /* update values */      z = learner_Z(r, upz);      dd = learner_divergence(z, r);      if( options & (1<<OPTION_VERBOSE) ) {	fprintf(stdout, "entropy change %" FMT_printf_score_t \	                " --> %" FMT_printf_score_t "\n", 		d + div_extra_bits, 		dd + div_extra_bits);      }    } while( fabs(d - dd) > TOL );    learner.logZ = log(z)/((weight_t)r);    learner.divergence = dd + div_extra_bits;  }}/* dumps readable model weights to the output */void dump_model(FILE *out, FILE *in) {  hash_value_t id;  char buf[BUFLEN];  char tok[MAX_TOKEN_LEN];  char smb[MAX_SUBMATCH+1];  token_order_t s;  regex_count_t c;  char *p, *q;  l_item *k;  /* preamble - this is copied from save_learner */  fprintf(out, MAGIC1, learner.filename, 	  (options & (1<<OPTION_REFMODEL)) ? "(ref)" : "");  fprintf(out, MAGIC2_o, learner.divergence, learner.logZ, learner.max_order,	  (options & (1<<OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );  fprintf(out, MAGIC3, 	  (short int)learner.max_hash_bits, 	  (long int)learner.full_token_count, 	  (long int)learner.unique_token_count,	  (long int)learner.num_docs);  /* print out any regexes we might need */  for(c = 0; c < regex_count; c++) {    /* write the bitmap */    for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) {      if( re[c].submatches & (1<<s) ) {	*p++ = s + '0';      }    }    *p = '\0';#if defined HAVE_LIBBOOST_REGEX    /* does this work? */    fprintf(out, MAGIC5_wo, re[c].string, smb);#else    fprintf(out, MAGIC5_o, re[c].string, smb);#endif  }  /* this is optional too */  if( options & (1<<OPTION_CASEN) ) {    fprintf(out, MAGIC4);  }  if( options & (1<<OPTION_I18N) ) {    fprintf(out, MAGIC7);  }  fprintf(out, MAGIC6);   fprintf(out, "# lambda | dig_ref | count | token\n");  /* now go through hash printing values */  rewind(learner.tmp);  q = tok;  while( !feof(learner.tmp) && fgets(buf, BUFLEN, learner.tmp) ) {    p = buf;    while( *p ) {      if( *p != -1) {	*q++ = *p; /* copy into tok */      } else { /* interword space */ 	*q = 0; /* append NUL to tok */	/* now write weight in hash */	id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);	k = find_in_learner(id); /* guaranteed to be found */        fprintf(out, "%9.3f %9.3f %7d %s\n",                 (weight_t)k->lam, UNPACK_LWEIGHTS(k->dref), k->count, tok);	q = tok; /* reset q */      }      p++;    }  }  /* finish last word */  *q = 0; /* append NUL to tok */  /* now write weight in hash */  id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);  k = find_in_learner(id); /* guaranteed to be found */  fprintf(out, "%9.3f %9.3f %7d %s\n", 	  (weight_t)k->lam, UNPACK_LWEIGHTS(k->dref), k->count, tok);}void optimize_learner_and_save() {  hash_count_t i;  token_order_t c;  if(100 * learner.unique_token_count >= HASH_FULL * learner.max_tokens) {     fprintf(stderr, 	    "warning: table full, some tokens ignored - "	    "try with option -h %i\n",	    learner.max_hash_bits + 1);  } else if( learner.unique_token_count <= 0 ) {    fprintf(stderr, 	    "warning: no tokens matched - have I learned nothing?\n");  }  if( skewed_constraints_warning ) {    fprintf(stderr,	    "warning: ran out of integers (too much data) constraints will be skewed.\n");  }  if( options & (1<<OPTION_VERBOSE) ) {    fprintf(stdout, 	    "picked up %i (%i distinct) tokens\n", 	    learner.full_token_count, learner.unique_token_count);    fprintf(stdout, 	    "calculating reference word weights\n");  }  /* find best Dirichlet parameters */  init_dirichlet();  optimize_dirichlet();  /* estimate the conditional digram probabilities */  compute_digram_probabilities();  if( learner.fixed_order_token_count[1] == 0 ) {    /* it's a higher order model but there are no first       order tokens! We can't handle that! Go through the        hash converting everything to first order.       This will result in incorrect calculations, unless       the higher order tokens don't overlap. Suitable only for       geniuses and fools. */    fprintf(stderr, 	    "\n"	    "warning: you have not defined any unigrams, so in this model\n"	    "         features will be treated independently, which is quite\n"	    "         likely incorrect. I hope you know what you're doing,\n"	    "         because I don't!\n\n");    options |= (1<<OPTION_MULTINOMIAL);    for(c = 2; c <= learner.max_order; c++) {      learner.fixed_order_token_count[1] += learner.fixed_order_token_count[c];      learner.fixed_order_unique_token_count[1] += 	learner.fixed_order_unique_token_count[c];    }    for(i = 0; i < learner.max_tokens; i++) {      if( FILLEDP(&learner.hash[i]) ) {	learner.hash[i].order = 1;      }    }  }  /* policy: always treat order 1 models multinomially */  if( learner.max_order == 1 ) {    options |= (1<<OPTION_MULTINOMIAL);  }  minimize_learner_divergence();  if( options & (1<<OPTION_DUMP) ) {    rewind(learner.tmp);    dump_model(stdout, learner.tmp);  }  fclose(learner.tmp);  /* now save the model to a file */  save_learner();}/*********************************************************** * MULTIBYTE FILE HANDLING FUNCTIONS                       * * this is suitable for any locale whose character set     * * encoding doesn't include NUL bytes inside characters    * ***********************************************************//* this code executed before processing each line of input.   - handles indents and appends via -Aa switches  */char *handle_indents_and_appends(char *textbuf) {  char *pptextbuf = textbuf; /* default */  if( options & (1<<OPTION_INDENTED) ) {    if( textbuf[0] == ' ' ) {      pptextbuf = textbuf + 1; /* processing should ignore indent */    } else {      if( options & (1<<OPTION_APPEND) ) {	fprintf(stdout, "%s", textbuf);      }      pptextbuf = NULL; /* no further processing */    }  }  /* if appending, print the lines as they      come in before they are processed */  if( options & (1<<OPTION_APPEND) ) {    fprintf(stdout, " %s", pptextbuf);  }  return pptextbuf;}/*********************************************************** * WIDE CHARACTER FILE HANDLING FUNCTIONS                  * * this is needed for any locale whose character set       * * encoding can include NUL bytes inside characters        * ***********************************************************//*********************************************************** * MAIN FUNCTIONS                                          * ***********************************************************/int main(int argc, char **argv) {  category_count_t c;  regex_count_t k;  FILE *input;  char op;  char *p;#if defined HAVE_LIBBOOST_REGEX  wchar_t *rstring;#endif  void (*preprocess_fun)(void) = NULL;  void (*word_fun)(char *, token_order_t, regex_count_t) = NULL;  char *(*pre_line_fun)(char *) = NULL;  void (*post_line_fun)(char *) = NULL;  void (*postprocess_fun)(void) = NULL;  /* set up internationalization */  if( !setlocale(LC_ALL, "") ) {    fprintf(stderr, 	    "warning: could not set locale, internationalization disabled\n");  } else {    if( options & (1<<OPTION_VERBOSE) ) {      fprintf(stderr, 	      "warning: international locales not supported\n");    }  }  /* parse the options */  while( (op = getopt(argc, argv, "Aac:Ddf:g:H:h:ij:l:MNnRrT:Vvw:x:")) > -1 ) {    switch(op) {    case 'A':      options |= (1<<OPTION_INDENTED);    case 'a':      options |= (1<<OPTION_APPEND);      break;    case 'D':      options |= (1<<OPTION_DEBUG);      break;    case 'M':      options |= (1<<OPTION_MULTINOMIAL);      break;    case 'N':      options |= (1<<OPTION_POSTERIOR);      break;    case 'R':      if( cat_count >= MAX_CAT ) {	fprintf(stderr, 		"warning: maximum reached, random text category omitted\n");      } else if( options & (1<<OPTION_LEARN) ) {	fprintf(stderr, 		"error: cannot use options -l and -R together\n");	usage(argv);	exit(0);      } else {	options |= (1<<OPTION_CLASSIFY);	cat[cat_count].filename = "random";	init_category(&cat[cat_count]);	init_purely_random_text_category(&cat[cat_count]);	cat_count++;      }      break;    case 'T':      if( !strncasecmp(optarg, "email", 5) ) {	options |= (1<<OPTION_MBOX_FORMAT);	options |= (1<<OPTION_XML); /* filter out HTML from messages */      } else if( !strncasecmp(optarg, "xml", 3) ) {	options |= (1<<OPTION_XML);      } else { /* default */	options |= (1<<OPTION_TEXT_FORMAT);      }      break;    case 'V':      fprintf(stdout, "dbacl version %s\n", VERSION);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -