📄 dbacl.c
字号:
/* * Copyright (C) 2002 Laird Breyer * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Laird Breyer <laird@lbreyer.com> *//* * Note to regenerate the Makefile and configure script: * aclocal * autoconf * touch NEWS README AUTHORS ChangeLog * automake -a */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <math.h>#include <ctype.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <locale.h>#if defined HAVE_LANGINFO_H#include <langinfo.h>#endif#include "dbacl.h" /* make sure this is last *//* global variables */hash_bit_count_t default_max_hash_bits = 15;hash_count_t default_max_tokens = (1<<15);hash_bit_count_t default_max_grow_hash_bits = 16;hash_count_t default_max_grow_tokens = (1<<16);hash_bit_count_t decimation;Dirichlet dirichlet;Learner learner;Category cat[MAX_CAT];category_count_t cat_count = 0;int filter[MAX_CAT];category_count_t filter_count = 0;Regex re[MAX_RE];regex_count_t regex_count = 0;Empirical empirical;extern token_order_t ngram_order; /* defaults to 1 *//* for counting emails */bool_t not_header; extern MBOX_State mbox;/* for option processing */extern char *optarg;extern int optind, opterr, optopt;options_t options = 0;char *title = "";int exit_code = 0; /* default */int overflow_warning = 0;int digramic_overflow_warning = 0;int skewed_constraints_warning = 0;/*********************************************************** * MISCELLANEOUS FUNCTIONS * ***********************************************************/void usage(char **argv) { fprintf(stderr, "\n"); fprintf(stderr, "dbacl [-vniNR] [-T type] -c CATEGORY [-c CATEGORY]...\n"); fprintf(stderr, " [-f KEEP]... [FILE]...\n"); fprintf(stderr, "\n"); fprintf(stderr, " classifies FILE or STDIN using CATEGORY, optionally\n"); fprintf(stderr, " removing all lines which don't fit the category KEEP.\n"); fprintf(stderr, "\n"); fprintf(stderr, "dbacl [-vnirND] [-h size] [-T type] -l CATEGORY \n"); fprintf(stderr, " [-g regex]... [FILE]...\n"); fprintf(stderr, "\n"); fprintf(stderr, " builds a maximum entropy model from the words in FILE or STDIN\n"); fprintf(stderr, " or concatenated regex submatches if using the -g option.\n"); fprintf(stderr, "\n"); fprintf(stderr, "dbacl -V\n"); fprintf(stderr, "\n"); fprintf(stderr, " prints program version.\n");}/*********************************************************** * CATEGORY FUNCTIONS * ***********************************************************/void line_score_single_category(char *textbuf) { score_t score = 0.0; if( *textbuf ) { if( options & (1<<OPTION_POSTERIOR) ) { /* score = empirical_entropy(); */ fprintf(stdout, "%s", textbuf); } if( options & (1<<OPTION_SCORES) ) { if( options & (1<<OPTION_VERBOSE) ) { score = -(cat[0].score/cat[0].complexity)/log(2.0); fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4d %s", cat[0].filename, score, cat[0].complexity, textbuf); } else { fprintf(stdout, "%s %7.2" FMT_printf_score_t " %s", cat[0].filename, -cat[0].score, textbuf); } } } cat[0].score = 0.0; cat[0].complexity = 0; if( options & (1<<OPTION_CALCENTROPY) ) { clear_empirical(&empirical); }}void line_score_multiple_categories(char *textbuf) { category_count_t i; int map; score_t c, cmax; /* find MAP */ cmax = cat[0].score; map = 0; for(i = 1; i < cat_count; i++) { if(cmax < cat[i].score) { cmax = cat[i].score; map = i; } } if(options & (1<<OPTION_POSTERIOR) ) { /* compute probabilities given exclusive choices */ c = 0.0; for(i = 0; i < cat_count; i++) { c += exp((cat[i].score - cmax)); } if( *textbuf ) { for(i = 0; i < cat_count; i++) { fprintf(stdout, "%s %6.2" FMT_printf_score_t "%% ", cat[i].filename, 100 * exp((cat[i].score - cmax))/c); } fprintf(stdout, "%s", textbuf); } } else if( options & (1<<OPTION_SCORES) ) { /* display normalized divergence scores */ if( *textbuf ) { for(i = 0; i < cat_count; i++) { if( options & (1<<OPTION_VERBOSE) ) { fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4d ", cat[i].filename, -cat[i].score/cat[i].complexity, cat[i].complexity); } else { fprintf(stdout, "%s %6.2" FMT_printf_score_t " ", cat[i].filename, -cat[i].score); } } fprintf(stdout, "%s", textbuf); } } else { /* prune the text which doesn't fit */ for(i = 0; i < filter_count; i++) { if( cat[map].score <= cat[filter[i]].score ) { fprintf(stdout, "%s", textbuf); break; } } } /* clean up for next line */ for(i = 0; i < cat_count; i++) { cat[i].score = 0.0; cat[i].complexity = 0; } if( options & (1<<OPTION_CALCENTROPY) ) { clear_empirical(&empirical); }}void score_single_category() { if( options & (1<<OPTION_POSTERIOR) ) { /* score = empirical_entropy(); */ /* fprintf(stdout, "n/a\n"); */ } if( options & (1<<OPTION_APPEND) ) { fprintf(stdout, "\nscores "); } if( options & (1<<OPTION_SCORES) ) { if( options & (1<<OPTION_VERBOSE) ) { fprintf(stdout, "cross_entropy %.2" FMT_printf_score_t \ " bits complexity %d\n", -(cat[0].score/cat[0].complexity) / log(2.0), cat[0].complexity); } else { fprintf(stdout, "score %.2" FMT_printf_score_t "\n", -cat[0].score); } if( options & (1<<OPTION_APPEND) ) { if( cat[0].model_num_docs > 0 ) { fprintf(stdout, "mean_complexity %s %5.2" FMT_printf_score_t "\n", cat[0].filename, (double)cat[0].model_full_token_count/cat[0].model_num_docs); } } } exit_code = 1;}void score_multiple_categories() { bool_t no_title; category_count_t i; score_t c, cmax; /* find MAP */ cmax = cat[0].score; exit_code = 0; for(i = 1; i < cat_count; i++) { if(cmax < cat[i].score) { cmax = cat[i].score; exit_code = i; } } if( options & (1<<OPTION_POSTERIOR) ) { if( options & (1<<OPTION_APPEND) ) { fprintf(stdout, "\nprobabilities "); } /* here we compute probabilities given exclusive choices */ c = 0.0; for(i = 0; i < cat_count; i++) { c += exp((cat[i].score - cmax)); } for(i = 0; i < cat_count; i++) { fprintf(stdout, "%s %5.2" FMT_printf_score_t "%% ", cat[i].filename, 100 * exp((cat[i].score - cmax))/c); } fprintf(stdout, "\n"); } else if( options & (1<<OPTION_SCORES) ) { if( options & (1<<OPTION_APPEND) ) { fprintf(stdout, "\nscores "); } /* display logarithmic score */ for(i = 0; i < cat_count; i++) { if( options & (1<<OPTION_VERBOSE) ) { fprintf(stdout, "%s %5.2" FMT_printf_score_t " * %-d ", cat[i].filename, (-cat[i].score/cat[i].complexity)/log(2.0), cat[i].complexity); } else { fprintf(stdout, "%s %5.2" FMT_printf_score_t " ", cat[i].filename, -cat[i].score/log(2.0)); } } fprintf(stdout, "\n"); if( options & (1<<OPTION_APPEND) ) { no_title = 1; for(i = 0; i < cat_count; i++) { if( cat[i].model_num_docs > 0 ) { if( no_title ) { fprintf(stdout, "mean_complexity "); no_title = 0; } fprintf(stdout, "%s %5.2" FMT_printf_score_t " ", cat[i].filename, (double)cat[i].model_full_token_count/cat[i].model_num_docs); } } if( !no_title ) { fprintf(stdout, "\n"); } } } else if( options & (1<<OPTION_VERBOSE) ) { if( options & (1<<OPTION_APPEND) ) { fprintf(stdout, "\ncategory "); } fprintf(stdout, "%s\n", cat[exit_code].filename); } exit_code++; /* make number between 1 and cat_count+1 */}/*********************************************************** * FILE MANAGEMENT FUNCTIONS * ***********************************************************//* writes the learner to a file for easily readable category */error_code_t save_learner() { alphabet_size_t i, j; hash_count_t t; regex_count_t c; FILE *output; c_item ci; char buf[MAGIC_BUFSIZE]; char smb[MAX_SUBMATCH+1]; token_order_t s; char *p;#if defined DIGITIZE_DIGRAMS short int shval;#else weight_t shval;#endif if( options & (1<<OPTION_VERBOSE) ) { fprintf(stdout, "saving category to file %s\n", learner.filename); } /* don't overwrite data files */ if( (output = fopen(learner.filename, "r")) ) { /* output file exists already */ fgets(buf, MAGIC_BUFSIZE, output); if( strncmp(buf, MAGIC1, 10) != 0 ) { fprintf(stderr, "error: the file %s is already used for something, " "use another filename. Nothing written.\n", learner.filename); fclose(output); exit(0); } else { /* it's an existing category file */ fclose(output); } } if( (output = fopen(learner.filename, "wb")) ) { /* print out standard headers */ fprintf(output, MAGIC1, learner.filename, (options & (1<<OPTION_REFMODEL)) ? "(ref)" : ""); fprintf(output, MAGIC2_o, learner.divergence, learner.logZ, learner.max_order, (options & (1<<OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" ); fprintf(output, MAGIC3, (short int)learner.max_hash_bits, (long int)learner.full_token_count, (long int)learner.unique_token_count, (long int)learner.num_docs); /* print out any regexes we might need */ for(c = 0; c < regex_count; c++) { /* write the bitmap */ for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) { if( re[c].submatches & (1<<s) ) { *p++ = s + '0'; } } *p = '\0';#if defined HAVE_LIBBOOST_REGEX /* does this work? */ fprintf(output, MAGIC5_wo, re[c].string, smb);#else fprintf(output, MAGIC5_o, re[c].string, smb);#endif } /* this is optional too */ if( options & (1<<OPTION_CASEN) ) { fprintf(output, MAGIC4); } if( options & (1<<OPTION_I18N) ) { fprintf(output, MAGIC7); } fprintf(output, MAGIC6); /* end of readable stuff */ /* character frequencies */ for(i = 0; i < ASIZE; i++) { for(j = 0; j < ASIZE; j++) { shval = PACK_DIGRAMS(learner.dig[i][j]); while(fwrite(&shval, SIZEOF_DIGRAMS, 1, output) < 1); } } /* token/feature weights */ for(t = 0; t < learner.max_tokens; t++) { /* write each element so that it's easy to read back in a c_item array */ SET(ci.id,learner.hash[t].id);#if defined DIGITIZE_LAMBDA ci.lam = (digitized_weight_t)((learner.hash[t].lam) * 100); if( fabs((learner.hash[t].lam)) > 320 ) { fprintf(stderr, "warning: digitized an extreme lambda value - results may be unusable\n"); } #else ci.lam = (learner.hash[t].lam);#endif while(fwrite(&ci, sizeof(ci), 1, output) < 1); } fclose(output); } else { fprintf(stderr, "error: cannot open file for writing %s\n", learner.filename); return 0; } return 1;}/*********************************************************** * LEARNER FUNCTIONS * ***********************************************************/void reset_mbox_messages() { not_header = 1; learner.num_docs = 0;}void count_mbox_messages(char *textbuf) { switch(mbox.state) { case HEADER: if(not_header) { learner.num_docs++; } not_header = 0; break; default: not_header = 1; break; }}l_item *find_in_learner(hash_value_t id) { register l_item *i, *loop; /* start at id */ i = loop = &learner.hash[id & (learner.max_tokens - 1)]; while( FILLEDP(i) ) { if( EQUALP(i->id,id) ) { return i; /* found id */ } else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -