📄 bayesol.c
字号:
/* * Copyright (C) 2002 Laird Breyer * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Laird Breyer <laird@lbreyer.com> *//* * Note to regenerate the Makefile and configure script: * aclocal * autoconf * touch NEWS README AUTHORS ChangeLog * automake -a */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <assert.h>#include <math.h>#include <limits.h>#include <ctype.h>#include <string.h>#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <locale.h>#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H#include <wctype.h>#include <wchar.h>#else#undef HAVE_LIBBOOST_REGEX#endif#if defined HAVE_LIBBOOST_REGEX#define UNICODE#include <boost/regex.h>#else #include <sys/types.h>#include <regex.h>#endif#include "bayesol.h" /* global variables */Spec spec; /* used in risk-parser */char *textbuf = NULL;charbuf_len_t textbuf_len = 0;wchar_t *wc_textbuf = NULL;charbuf_len_t wc_textbuf_len = 0;char *aux_textbuf = NULL;charbuf_len_t aux_textbuf_len = 0;/* for option processing */extern char *optarg;extern int optind, opterr, optopt;options_t options = 0;char *title = "";/* parser interface */extern int parse_loss_vec(category_count_t i, char *buf);extern int parse_risk_spec(FILE *input);bool_t found_scores = 0;int exit_code = 0; /* default */#if defined OS_SUNint isinf(double x) { return !finite(x) && x==x; }#endif/*********************************************************** * MISCELLANEOUS FUNCTIONS * ***********************************************************/void usage(char **argv) { fprintf(stderr, "\n"); fprintf(stderr, "bayesol [-vi] -c RISKSPEC [FILE]...\n"); fprintf(stderr, " calculates the optimal Bayes solution using RISKSPEC, with\n"); fprintf(stderr, " input from FILE or STDIN.\n"); fprintf(stderr, "\n"); fprintf(stderr, "bayesol -V\n"); fprintf(stderr, "\n"); fprintf(stderr, " prints program version.\n");}void print_line() { fprintf(stdout, "%s", textbuf);}void init_spec() { category_count_t i; spec.num_cats = 0; spec.num_priors = 0; for( i = 0; i < MAX_CAT; spec.loss_list[i++] = NULL); for( i = 0; i < MAX_CAT; spec.cross_entropy[i++] = 0.0); for( i = 0; i < MAX_CAT; spec.complexity[i++] = 0.0);}/*********************************************************** * PARSING RELATED FUNCTIONS * ***********************************************************//* this function is called before processing each file. It constructs a linked list of regular expressions if there are any. Each regex is associated with the string representation of the corresponding loss vector */void setup_regexes() { category_count_t i; bool_t has_empty_string; LossVector *p; RegMatch *q, *r;#if defined HAVE_LIBBOOST_REGEX charbuf_len_t rr; wchar_t *ws;#endif for( i = 0; i < spec.num_cats; i++ ) { /* if spec.loss_list[i] does not contain a LossVector whose regex is the empty string, then this specification is not complete and we exit */ if( spec.loss_list[i] == NULL ) { fprintf(stderr, "error: missing loss_matrix entry for category %s\n", spec.catname[i]); exit(0); } else { has_empty_string = 0; /* step through the list of vectors looking for regexes */ for(p = spec.loss_list[i]; p != NULL; p = p->next) { if( !(p->re[0]) ) { has_empty_string = 1; p->found = 1; } else { /* it's a genuine regex, try compiling it */ q = malloc(sizeof(RegMatch)); if(q) {#if defined HAVE_LIBBOOST_REGEX /* boost regexes accept wide char strings */ rr = strlen(p->re); ws = malloc((rr+1) * sizeof(wchar_t)); if( !ws || (rr = mbstowcs(ws, p->re, rr)) <= 0) { fprintf(stderr, "error: couldn't convert regular expression '%s'.\n", p->re); free(q); if(ws) { free(ws); } q = NULL; ws = NULL; } else { ws[rr] = L'\0'; if( regcomp(&(q->reg), ws, REG_EXTENDED) == 0 ) { p->found = 0; q->lv = p; q->next = NULL; } else { fprintf(stderr, "warning: couldn't compile regex '%s', ignoring\n", p->re); free(q); q = NULL; } }#else /* GNU regexes accept ordinary strings */ if( regcomp(&(q->reg), p->re, REG_EXTENDED) == 0 ) { p->found = 0; q->lv = p; q->next = NULL; } else { fprintf(stderr, "warning: couldn't compile regex '%s', ignoring\n", p->re); free(q); q = NULL; }#endif } else { fprintf(stderr, "warning: no memory for regex '%s', ignoring\n", p->re); } /* add this regex to the list we have to check online */ if( spec.regs == NULL ) { spec.regs = q; } else { for(r = spec.regs; r->next != NULL; r = r->next); r->next = q; } } } /* consistency check */ if( !has_empty_string ) { fprintf(stderr, "error: missing loss_matrix entry for category %s (need \"\" case)\n", spec.catname[i]); exit(0); } } }}/* parses the dbacl scores. Format used is scores [cat s * c]... Note: destroys the buf */int parse_dbacl_scores(char *buf) { char *tok; category_count_t index, counter; /* first token is scores */ tok = strtok(buf, " \n"); counter = 0; /* next gobble up tokens four at a time */ while(tok) { /* first we get the category name/index */ if( !(tok = strtok(NULL, " \n")) ) { break; } else { for(index = 0; index < spec.num_cats; index++) { if( strcmp(spec.catname[index], tok) == 0 ) { break; } } if( index == spec.num_cats ) { return 0; } } /* now that we have the index, associate the cross entropy */ if( !(tok = strtok(NULL, " \n")) ) { fprintf(stderr, "error: scores in wrong format\n"); return 0; } else { spec.cross_entropy[index] = strtod(tok, NULL); } /* now skip the '*' */ if( !(tok = strtok(NULL, " \n")) ) { fprintf(stderr, "error: scores in wrong format\n"); return 0; } /* now get the complexity */ if( !(tok = strtok(NULL, " \n")) ) { fprintf(stderr, "error: scores in wrong format\n"); return 0; } else { spec.complexity[index] = strtod(tok, NULL); } counter++; } /* last consistency check */ if( counter > spec.num_cats ) { return 0; } else { found_scores = 1; return 1; }}void finish_parsing() { category_count_t i, j; LossVector *p; real_value_t min_complexity, max_complexity; /* consistency checks */ if( !found_scores ) { fprintf(stderr, "error: no scores found. Did you use dbacl with the -a switch?\n"); exit(0); } else { min_complexity = spec.complexity[0]; max_complexity = spec.complexity[0]; for(i = 0; i < spec.num_cats; i++) { if( !spec.catname[i] ) { fprintf(stderr, "error: too few categories scored. Is your risk specification correct?\n"); exit(0); } min_complexity = (min_complexity < spec.complexity[i]) ? min_complexity : spec.complexity[i]; max_complexity = (max_complexity > spec.complexity[i]) ? max_complexity : spec.complexity[i]; } if( min_complexity < MEANINGLESS_THRESHOLD * max_complexity ) { fprintf(stderr, "\n" "warning: There is a significant disparity between the complexities\n" " reported under each category. This most likely indicates that\n" " you've chosen categories whose features aren't comparable.\n" "\n" " The Bayes solution calculations will be meaningless.\n\n"); } } /* first, we must finish building the loss matrix We pick for each category the first lossvector which reported a match */ for(i = 0; i < spec.num_cats; i++) { for(p = spec.loss_list[i]; p != NULL; p = p->next) { if( p->found ) { break; } } if( p == NULL ) { fprintf(stderr, "error: something's wrong with loss_list, sorry\n"); } else { /* now build the ith row of the loss matrix */ if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "category %s\t risk spec: %s\n", spec.catname[i], p->ve); } if( parse_loss_vec(i, p->ve) != 0 ) { fprintf(stderr, "error: couldn't parse spec for (%s, \"%s\")\n", spec.catname[i], p->re); exit(0); } } } if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "\nfinal loss_matrix (log scale)\n"); for(i = 0; i < spec.num_cats; i++) { fprintf(stdout, "%s\t", spec.catname[i]); for(j = 0; j < spec.num_cats; j++) { fprintf(stdout, "%8.2f ", spec.loss_matrix[i][j]); } fprintf(stdout, "\n"); } fprintf(stdout, "\n"); } }/* now we are ready to score the losses This is a simple matrix multiplication */category_count_t score_losses() { category_count_t i, j; real_value_t tmp[MAX_CAT]; real_value_t norm, score, min_score; category_count_t min_cat; min_score = 1.0/0.0; min_cat = 0; for(i = 0; i < spec.num_cats; i++) { /* calculate terms of the sum on exponential scale and find normalization. Note -inf + inf = -inf */ norm = -1.0/0.0; for(j = 0; j < spec.num_cats; j++) { tmp[j] = - spec.complexity[j] * spec.cross_entropy[j] + spec.prior[j]; if( !isinf(tmp[j]) ) { tmp[j] += spec.loss_matrix[j][i]; if( !isinf(tmp[j]) && (norm < tmp[j]) ) { norm = tmp[j]; } } } if( isinf(norm) ) { norm = 0.0; } /* now calculate the ith score */ score = 0.0; for(j = 0; j < spec.num_cats; j++) { score += exp( tmp[j] - norm ); } score = log(score) + norm; if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "decision score %s\t %10.2f\n", spec.catname[i], score); } else if( options & (1<<OPTION_SCORES) ) { fprintf(stdout, "%s %10.2f ", spec.catname[i], score); } /* and minimize */ if( min_score > score ) { min_score = score; min_cat = i; } } if( options & (1<<OPTION_SCORES) ) { fprintf(stdout,"\n"); } return min_cat;}void finish_parsing_and_score() { category_count_t best; finish_parsing(); best = score_losses(); if( spec.num_cats == 1 ) { fprintf(stderr, "warning: only one category specified, this is trivial!\n"); } if( (options & (1<<OPTION_VERBOSE)) && !(options & (1<<OPTION_SCORES)) ) { fprintf(stdout, "%s\n", spec.catname[best]); } exit_code = (best + 1);}/*********************************************************** * FILE MANAGEMENT FUNCTIONS * ***********************************************************//* this function parses the risk specification */int read_riskspec(char *filename) { FILE *input; /* parse the spec */ if( (input = fopen(filename, "r")) ) { if( parse_risk_spec(input) != 0 ) { fclose(input); exit(0); } fclose(input); } else { return 0; } /* do some consistency checks */ if( spec.num_cats == 0 ) { fprintf(stderr, "error: you need at least one category\n"); exit(0); } else if( spec.num_cats != spec.num_priors ) { fprintf(stderr, "error: prior doesn't match number of categories\n"); exit(0); } return 1;}/*********************************************************** * MULTIBYTE FILE HANDLING FUNCTIONS * * this is suitable for any locale whose character set * * encoding doesn't include NUL bytes inside characters * ***********************************************************/#define MAGIC "scores "#define MULTIBYTE_EPSILON 10 /* enough for a multibyte char and a null char */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -