📄 dbacl.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/*  * Copyright (C) 2002 Laird Breyer *   * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *  * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. *  * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *  * Author:   Laird Breyer <laird@lbreyer.com> *//*  * Note to regenerate the Makefile and configure script: * aclocal * autoconf * touch NEWS README AUTHORS ChangeLog * automake -a */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <math.h>#include <ctype.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <locale.h>#if defined HAVE_LANGINFO_H#include <langinfo.h>#endif#include "dbacl.h" /* make sure this is last *//* global variables */hash_bit_count_t default_max_hash_bits = 15;hash_count_t default_max_tokens = (1<<15);hash_bit_count_t default_max_grow_hash_bits = 16;hash_count_t default_max_grow_tokens = (1<<16);hash_bit_count_t decimation;Dirichlet dirichlet;Learner learner;Category cat[MAX_CAT];category_count_t cat_count = 0;int filter[MAX_CAT];category_count_t filter_count = 0;Regex re[MAX_RE];regex_count_t regex_count = 0;Empirical empirical;extern token_order_t ngram_order; /* defaults to 1 *//* for counting emails */bool_t not_header; extern MBOX_State mbox;/* for option processing */extern char *optarg;extern int optind, opterr, optopt;options_t options = 0;char *title = "";int exit_code = 0; /* default */int overflow_warning = 0;int digramic_overflow_warning = 0;int skewed_constraints_warning = 0;/*********************************************************** * MISCELLANEOUS FUNCTIONS                                 * ***********************************************************/void usage(char **argv) {  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "dbacl [-vniNR] [-T type] -c CATEGORY [-c CATEGORY]...\n");  fprintf(stderr, 	  "      [-f KEEP]... [FILE]...\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "      classifies FILE or STDIN using CATEGORY, optionally\n");  fprintf(stderr, 	  "      removing all lines which don't fit the category KEEP.\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "dbacl [-vnirND] [-h size] [-T type] -l CATEGORY \n");  fprintf(stderr, 	  "      [-g regex]... [FILE]...\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "      builds a maximum entropy model from the words in FILE or STDIN\n");  fprintf(stderr, 	  "      or concatenated regex submatches if using the -g option.\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "dbacl -V\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "      prints program version.\n");}/*********************************************************** * CATEGORY FUNCTIONS                                      * ***********************************************************/void line_score_single_category(char *textbuf) {    score_t score = 0.0;    if( *textbuf ) {      if( options & (1<<OPTION_POSTERIOR) ) {	/* score = empirical_entropy(); */	fprintf(stdout, "%s", textbuf);      }      if( options & (1<<OPTION_SCORES) ) {	if( options & (1<<OPTION_VERBOSE) ) {	  score = -(cat[0].score/cat[0].complexity)/log(2.0);	  fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4d %s", 		  cat[0].filename, score, 		  cat[0].complexity, textbuf);	} else {	  fprintf(stdout, "%s %7.2" FMT_printf_score_t " %s", 		  cat[0].filename, -cat[0].score, textbuf); 	}      }    }    cat[0].score = 0.0;    cat[0].complexity = 0;    if( options & (1<<OPTION_CALCENTROPY) ) {      clear_empirical(&empirical);    }}void line_score_multiple_categories(char *textbuf) {  category_count_t i;  int map;  score_t c, cmax;  /* find MAP */  cmax = cat[0].score;   map = 0;  for(i = 1; i < cat_count; i++) {    if(cmax < cat[i].score) {      cmax = cat[i].score;      map = i;    }  }  if(options & (1<<OPTION_POSTERIOR) ) {    /* compute probabilities given exclusive choices */    c = 0.0;    for(i = 0; i < cat_count; i++) {      c += exp((cat[i].score - cmax));    }          if( *textbuf ) {      for(i = 0; i < cat_count; i++) {	fprintf(stdout, "%s %6.2" FMT_printf_score_t "%% ", 		cat[i].filename, 100 * exp((cat[i].score - cmax))/c);      }      fprintf(stdout, "%s", textbuf);    }  } else if( options & (1<<OPTION_SCORES) ) {    /* display normalized divergence scores */    if( *textbuf ) {      for(i = 0; i < cat_count; i++) {	if( options & (1<<OPTION_VERBOSE) ) {	  fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4d ", 		  cat[i].filename, 		  -cat[i].score/cat[i].complexity, 		  cat[i].complexity);	} else {	  fprintf(stdout, "%s %6.2" FMT_printf_score_t " ", 		  cat[i].filename, -cat[i].score);	}      }      fprintf(stdout, "%s", textbuf);    }  } else {    /* prune the text which doesn't fit */    for(i = 0; i < filter_count; i++) {      if( cat[map].score <= cat[filter[i]].score ) {	fprintf(stdout, "%s", textbuf);	break;      }    }  }      /* clean up for next line */  for(i = 0; i < cat_count; i++) {    cat[i].score = 0.0;    cat[i].complexity = 0;  }  if( options & (1<<OPTION_CALCENTROPY) ) {    clear_empirical(&empirical);  }}void score_single_category() {  if( options & (1<<OPTION_POSTERIOR) ) {    /* score = empirical_entropy(); */    /* fprintf(stdout, "n/a\n"); */  }  if( options & (1<<OPTION_APPEND) ) {    fprintf(stdout, "\nscores ");  }  if( options & (1<<OPTION_SCORES) ) {    if( options & (1<<OPTION_VERBOSE) ) {      fprintf(stdout, "cross_entropy %.2" FMT_printf_score_t \                      " bits complexity %d\n", 	      -(cat[0].score/cat[0].complexity) / log(2.0), cat[0].complexity);    } else {      fprintf(stdout, "score %.2" FMT_printf_score_t "\n", -cat[0].score);    }    if( options & (1<<OPTION_APPEND) ) {      if( cat[0].model_num_docs > 0 ) {	fprintf(stdout, "mean_complexity %s %5.2" FMT_printf_score_t "\n", 		cat[0].filename, 		(double)cat[0].model_full_token_count/cat[0].model_num_docs);      }    }  }  exit_code = 1;}void score_multiple_categories() {  bool_t no_title;  category_count_t i;  score_t c, cmax;  /* find MAP */  cmax = cat[0].score;   exit_code = 0;  for(i = 1; i < cat_count; i++) {    if(cmax < cat[i].score) {      cmax = cat[i].score;      exit_code = i;    }  }  if( options & (1<<OPTION_POSTERIOR) ) {    if( options & (1<<OPTION_APPEND) ) {      fprintf(stdout, "\nprobabilities ");    }    /* here we compute probabilities given exclusive choices */    c = 0.0;    for(i = 0; i < cat_count; i++) {      c += exp((cat[i].score - cmax));    }    for(i = 0; i < cat_count; i++) {      fprintf(stdout, "%s %5.2" FMT_printf_score_t "%% ", 	      cat[i].filename, 100 * exp((cat[i].score - cmax))/c);    }    fprintf(stdout, "\n");  } else if( options & (1<<OPTION_SCORES) ) {    if( options & (1<<OPTION_APPEND) ) {      fprintf(stdout, "\nscores ");    }    /* display logarithmic score */    for(i = 0; i < cat_count; i++) {      if( options & (1<<OPTION_VERBOSE) ) {	fprintf(stdout, "%s %5.2" FMT_printf_score_t " * %-d ", 		cat[i].filename, (-cat[i].score/cat[i].complexity)/log(2.0), 		cat[i].complexity);      } else {	fprintf(stdout, "%s %5.2" FMT_printf_score_t " ", 		cat[i].filename, -cat[i].score/log(2.0));      }    }    fprintf(stdout, "\n");    if( options & (1<<OPTION_APPEND) ) {      no_title = 1;      for(i = 0; i < cat_count; i++) {	if( cat[i].model_num_docs > 0 ) {	  if( no_title ) {	    fprintf(stdout, "mean_complexity ");	    no_title = 0;	  }	  fprintf(stdout, "%s %5.2" FMT_printf_score_t " ", 		  cat[i].filename, 		  (double)cat[i].model_full_token_count/cat[i].model_num_docs);	}      }      if( !no_title ) { fprintf(stdout, "\n"); }    }  } else if( options & (1<<OPTION_VERBOSE) ) {    if( options & (1<<OPTION_APPEND) ) {      fprintf(stdout, "\ncategory ");    }    fprintf(stdout, "%s\n", cat[exit_code].filename);   }  exit_code++; /* make number between 1 and cat_count+1 */}/*********************************************************** * FILE MANAGEMENT FUNCTIONS                               * ***********************************************************//* writes the learner to a file for easily readable category */error_code_t save_learner() {  alphabet_size_t i, j;  hash_count_t t;  regex_count_t c;  FILE *output;  c_item ci;  char buf[MAGIC_BUFSIZE];  char smb[MAX_SUBMATCH+1];  token_order_t s;  char *p;#if defined DIGITIZE_DIGRAMS  short int shval;#else  weight_t shval;#endif  if( options & (1<<OPTION_VERBOSE) ) {    fprintf(stdout, "saving category to file %s\n", learner.filename);  }    /* don't overwrite data files */  if( (output = fopen(learner.filename, "r")) ) {    /* output file exists already */    fgets(buf, MAGIC_BUFSIZE, output);    if( strncmp(buf, MAGIC1, 10) != 0 ) {      fprintf(stderr,	      "error: the file %s is already used for something, "	      "use another filename. Nothing written.\n", learner.filename);      fclose(output);      exit(0);    } else {      /* it's an existing category file */      fclose(output);    }  }    if( (output = fopen(learner.filename, "wb")) ) {    /* print out standard headers */    fprintf(output, MAGIC1, learner.filename, 	    (options & (1<<OPTION_REFMODEL)) ? "(ref)" : "");    fprintf(output, 	    MAGIC2_o, learner.divergence, learner.logZ, learner.max_order,	    (options & (1<<OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );    fprintf(output, MAGIC3, 	    (short int)learner.max_hash_bits, 	    (long int)learner.full_token_count, 	    (long int)learner.unique_token_count,	    (long int)learner.num_docs);    /* print out any regexes we might need */    for(c = 0; c < regex_count; c++) {      /* write the bitmap */      for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) {	if( re[c].submatches & (1<<s) ) {	  *p++ = s + '0';	}      }      *p = '\0';#if defined HAVE_LIBBOOST_REGEX      /* does this work? */      fprintf(output, MAGIC5_wo, re[c].string, smb);#else      fprintf(output, MAGIC5_o, re[c].string, smb);#endif    }    /* this is optional too */    if( options & (1<<OPTION_CASEN) ) {      fprintf(output, MAGIC4);    }    if( options & (1<<OPTION_I18N) ) {      fprintf(output, MAGIC7);    }    fprintf(output, MAGIC6);     /* end of readable stuff */    /* character frequencies */    for(i = 0; i < ASIZE; i++) {      for(j = 0; j < ASIZE; j++) {	shval = PACK_DIGRAMS(learner.dig[i][j]);	while(fwrite(&shval, SIZEOF_DIGRAMS, 1, output) < 1);      }    }    /* token/feature weights */    for(t = 0; t < learner.max_tokens; t++) {      /* write each element so that it's easy to read back in a c_item array */      SET(ci.id,learner.hash[t].id);#if defined DIGITIZE_LAMBDA      ci.lam = (digitized_weight_t)((learner.hash[t].lam) * 100);      if( fabs((learner.hash[t].lam)) > 320 ) {	fprintf(stderr,		"warning: digitized an extreme lambda value - results may be unusable\n");      } #else      ci.lam = (learner.hash[t].lam);#endif      while(fwrite(&ci, sizeof(ci), 1, output) < 1);     }    fclose(output);  } else {    fprintf(stderr, "error: cannot open file for writing %s\n", 	    learner.filename);    return 0;  }  return 1;}/*********************************************************** * LEARNER FUNCTIONS                                       * ***********************************************************/void reset_mbox_messages() {  not_header = 1;  learner.num_docs = 0;}void count_mbox_messages(char *textbuf) {  switch(mbox.state) {  case HEADER:    if(not_header) {      learner.num_docs++;    }    not_header = 0;    break;  default:    not_header = 1;    break;  }}l_item *find_in_learner(hash_value_t id) {    register l_item *i, *loop;    /* start at id */    i = loop = &learner.hash[id & (learner.max_tokens - 1)];    while( FILLEDP(i) ) {	if( EQUALP(i->id,id) ) {	    return i; /* found id */	} else {
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -