⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bayesol.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
/*  * Copyright (C) 2002 Laird Breyer *   * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *  * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. *  * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *  * Author:   Laird Breyer <laird@lbreyer.com> *//*  * Note to regenerate the Makefile and configure script: * aclocal * autoconf * touch NEWS README AUTHORS ChangeLog * automake -a */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <assert.h>#include <math.h>#include <limits.h>#include <ctype.h>#include <string.h>#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <locale.h>#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H#include <wctype.h>#include <wchar.h>#else#undef HAVE_LIBBOOST_REGEX#endif#if defined HAVE_LIBBOOST_REGEX#define UNICODE#include <boost/regex.h>#else #include <sys/types.h>#include <regex.h>#endif#include "bayesol.h" /* global variables */Spec spec; /* used in risk-parser */char *textbuf = NULL;charbuf_len_t textbuf_len = 0;wchar_t *wc_textbuf = NULL;charbuf_len_t wc_textbuf_len = 0;char *aux_textbuf = NULL;charbuf_len_t aux_textbuf_len = 0;/* for option processing */extern char *optarg;extern int optind, opterr, optopt;options_t options = 0;char *title = "";/* parser interface */extern int parse_loss_vec(category_count_t i, char *buf);extern int parse_risk_spec(FILE *input);bool_t found_scores = 0;int exit_code = 0; /* default */#if defined OS_SUNint isinf(double x) { return !finite(x) && x==x; }#endif/*********************************************************** * MISCELLANEOUS FUNCTIONS                                 * ***********************************************************/void usage(char **argv) {  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "bayesol [-vi] -c RISKSPEC [FILE]...\n");  fprintf(stderr, 	  "      calculates the optimal Bayes solution using RISKSPEC, with\n");  fprintf(stderr, 	  "      input from FILE or STDIN.\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "bayesol -V\n");  fprintf(stderr, 	  "\n");  fprintf(stderr, 	  "      prints program version.\n");}void print_line() {  fprintf(stdout, "%s", textbuf);}void init_spec() {  category_count_t i;  spec.num_cats = 0;  spec.num_priors = 0;  for( i = 0; i < MAX_CAT; spec.loss_list[i++] = NULL);  for( i = 0; i < MAX_CAT; spec.cross_entropy[i++] = 0.0);  for( i = 0; i < MAX_CAT; spec.complexity[i++] = 0.0);}/*********************************************************** * PARSING RELATED FUNCTIONS                               * ***********************************************************//* this function is called before processing each file.   It constructs a linked list of regular expressions if   there are any. Each regex is associated with the string    representation of the corresponding loss vector */void setup_regexes() {  category_count_t i;  bool_t has_empty_string;  LossVector *p;  RegMatch *q, *r;#if defined HAVE_LIBBOOST_REGEX  charbuf_len_t rr;  wchar_t *ws;#endif  for( i = 0; i < spec.num_cats; i++ ) {    /* if spec.loss_list[i] does not contain a        LossVector whose regex is the empty string,        then this specification is not complete and we exit */    if( spec.loss_list[i] == NULL ) {      fprintf(stderr, "error: missing loss_matrix entry for category %s\n", 	      spec.catname[i]);      exit(0);    } else {      has_empty_string = 0;      /* step through the list of vectors looking for regexes */      for(p = spec.loss_list[i]; p != NULL; p = p->next) {	if( !(p->re[0]) ) {	  has_empty_string = 1;	  p->found = 1;	} else {	  /* it's a genuine regex, try compiling it */	  q = malloc(sizeof(RegMatch));	  if(q) {#if defined HAVE_LIBBOOST_REGEX	    /* boost regexes accept wide char strings */	    rr = strlen(p->re);	    ws = malloc((rr+1) * sizeof(wchar_t));	    if( !ws || (rr = mbstowcs(ws, p->re, rr)) <= 0) {	      fprintf(stderr, 		      "error: couldn't convert regular expression '%s'.\n",		      p->re);	      free(q);	      if(ws) { free(ws); }	      q = NULL;	      ws = NULL;	    } else {	      ws[rr] = L'\0';	      if( regcomp(&(q->reg), ws, REG_EXTENDED) == 0 ) {		p->found = 0;		q->lv = p;		q->next = NULL;	      } else {		fprintf(stderr,			"warning: couldn't compile regex '%s', ignoring\n",			p->re);		free(q);		q = NULL;	      }	    }#else	    /* GNU regexes accept ordinary strings */	    if( regcomp(&(q->reg), p->re, REG_EXTENDED) == 0 ) {	      p->found = 0;	      q->lv = p;	      q->next = NULL;	    } else {	      fprintf(stderr,		      "warning: couldn't compile regex '%s', ignoring\n",		      p->re);	      free(q);	      q = NULL;	    }#endif	  } else {	    fprintf(stderr,		    "warning: no memory for regex '%s', ignoring\n",		    p->re);	  }	  /* add this regex to the list we have to check online */	  	  if( spec.regs == NULL ) {	    spec.regs = q;	  } else {	    for(r = spec.regs; r->next != NULL; r = r->next);	    r->next = q;	  }	}      }      /* consistency check */      if( !has_empty_string ) {	fprintf(stderr,		"error: missing loss_matrix entry for category %s (need \"\" case)\n", 		spec.catname[i]);	exit(0);      }    }  }}/* parses the dbacl scores. Format used is    scores [cat s * c]...    Note: destroys the buf */int parse_dbacl_scores(char *buf) {  char *tok;  category_count_t index, counter;    /* first token is scores */  tok = strtok(buf, " \n");  counter = 0;  /* next gobble up tokens four at a time */  while(tok) {    /* first we get the category name/index */    if( !(tok = strtok(NULL, " \n")) ) {      break;    } else {      for(index = 0; index < spec.num_cats; index++) {	if( strcmp(spec.catname[index], tok) == 0 ) {	  break;	}      }      if( index == spec.num_cats ) {	return 0;      }    }    /* now that we have the index, associate the cross entropy */    if( !(tok = strtok(NULL, " \n")) ) {      fprintf(stderr, "error: scores in wrong format\n");      return 0;    } else {      spec.cross_entropy[index] = strtod(tok, NULL);    }    /* now skip the '*' */    if( !(tok = strtok(NULL, " \n")) ) {      fprintf(stderr, "error: scores in wrong format\n");      return 0;    }    /* now get the complexity */    if( !(tok = strtok(NULL, " \n")) ) {      fprintf(stderr, "error: scores in wrong format\n");      return 0;    } else {      spec.complexity[index] = strtod(tok, NULL);    }    counter++;  }  /* last consistency check */  if( counter > spec.num_cats ) {    return 0;  } else {    found_scores = 1;    return 1;  }}void finish_parsing() {  category_count_t i, j;  LossVector *p;  real_value_t min_complexity, max_complexity;  /* consistency checks */  if( !found_scores ) {    fprintf(stderr,	    "error: no scores found. Did you use dbacl with the -a switch?\n");    exit(0);  } else {    min_complexity = spec.complexity[0];    max_complexity = spec.complexity[0];    for(i = 0; i < spec.num_cats; i++) {      if( !spec.catname[i] ) {	fprintf(stderr,		"error: too few categories scored. Is your risk specification correct?\n");	exit(0);      }      min_complexity = (min_complexity < spec.complexity[i]) ? min_complexity : 	spec.complexity[i];      max_complexity = (max_complexity > spec.complexity[i]) ? max_complexity : 	spec.complexity[i];    }    if( min_complexity < MEANINGLESS_THRESHOLD * max_complexity ) {      fprintf(stderr,	      "\n"	      "warning: There is a significant disparity between the complexities\n"	      "         reported under each category. This most likely indicates that\n"	      "         you've chosen categories whose features aren't comparable.\n"	      "\n"	      "         The Bayes solution calculations will be meaningless.\n\n");    }   }  /* first, we must finish building the loss matrix     We pick for each category the first lossvector     which reported a match */  for(i = 0; i < spec.num_cats; i++) {    for(p = spec.loss_list[i]; p != NULL; p = p->next) {      if( p->found ) {	break;      }    }    if( p == NULL ) {      fprintf(stderr, "error: something's wrong with loss_list, sorry\n");    } else {      /* now build the ith row of the loss matrix */      if( options & (1<<OPTION_DEBUG) ) {	fprintf(stdout,		"category %s\t risk spec: %s\n", 		spec.catname[i], p->ve);      }      if( parse_loss_vec(i, p->ve) != 0 ) {	fprintf(stderr,		"error: couldn't parse spec for (%s, \"%s\")\n", 		spec.catname[i], p->re);	exit(0);      }    }  }  if( options & (1<<OPTION_DEBUG) ) {    fprintf(stdout, "\nfinal loss_matrix (log scale)\n");    for(i = 0; i < spec.num_cats; i++) {      fprintf(stdout, "%s\t", spec.catname[i]);      for(j = 0; j < spec.num_cats; j++) {	fprintf(stdout, "%8.2f ", spec.loss_matrix[i][j]);      }      fprintf(stdout, "\n");    }    fprintf(stdout, "\n");  }  }/* now we are ready to score the losses    This is a simple matrix multiplication */category_count_t score_losses() {  category_count_t i, j;  real_value_t tmp[MAX_CAT];  real_value_t norm, score, min_score;  category_count_t min_cat;  min_score = 1.0/0.0;  min_cat = 0;  for(i = 0; i < spec.num_cats; i++) {    /* calculate terms of the sum on exponential scale        and find normalization. Note -inf + inf = -inf */    norm = -1.0/0.0;    for(j = 0; j < spec.num_cats; j++) {      tmp[j] = 	- spec.complexity[j] * spec.cross_entropy[j] +	spec.prior[j];      if( !isinf(tmp[j]) ) {	tmp[j] += spec.loss_matrix[j][i]; 	if( !isinf(tmp[j]) && (norm < tmp[j]) ) {	  norm = tmp[j];	}      }    }    if( isinf(norm) ) { norm = 0.0; }     /* now calculate the ith score */    score = 0.0;    for(j = 0; j < spec.num_cats; j++) {      score += exp( tmp[j] - norm );    }    score = log(score) + norm;    if( options & (1<<OPTION_DEBUG) ) {      fprintf(stdout,	      "decision score %s\t %10.2f\n", spec.catname[i], score);    } else if( options & (1<<OPTION_SCORES) ) {      fprintf(stdout,	      "%s %10.2f ", spec.catname[i], score);    }    /* and minimize */    if( min_score > score ) {      min_score = score;      min_cat = i;    }  }  if( options & (1<<OPTION_SCORES) ) {    fprintf(stdout,"\n");  }  return min_cat;}void finish_parsing_and_score() {  category_count_t best;  finish_parsing();  best = score_losses();  if( spec.num_cats == 1 ) {    fprintf(stderr, "warning: only one category specified, this is trivial!\n");  }  if( (options & (1<<OPTION_VERBOSE)) &&      !(options & (1<<OPTION_SCORES)) ) {    fprintf(stdout, "%s\n", spec.catname[best]);  }  exit_code = (best + 1);}/*********************************************************** * FILE MANAGEMENT FUNCTIONS                               * ***********************************************************//* this function parses the risk specification */int read_riskspec(char *filename) {  FILE *input;  /* parse the spec */  if( (input = fopen(filename, "r")) ) {    if( parse_risk_spec(input) != 0 ) {      fclose(input);      exit(0);    }    fclose(input);  } else {    return 0;  }  /* do some consistency checks */  if( spec.num_cats == 0 ) {    fprintf(stderr, "error: you need at least one category\n");    exit(0);  } else if( spec.num_cats != spec.num_priors ) {    fprintf(stderr, "error: prior doesn't match number of categories\n");    exit(0);  }  return 1;}/*********************************************************** * MULTIBYTE FILE HANDLING FUNCTIONS                       * * this is suitable for any locale whose character set     * * encoding doesn't include NUL bytes inside characters    * ***********************************************************/#define MAGIC "scores "#define MULTIBYTE_EPSILON 10 /* enough for a multibyte char and a null char */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -