📄 catfun.c
字号:
/* * Copyright (C) 2002 Laird Breyer * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Laird Breyer <laird@lbreyer.com> */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <string.h>#include <stdlib.h>#include <math.h>#include "dbacl.h"/* look for these where main() is defined *//* these globals will have to be refactored once I decide on an API for libdbacl ... */extern options_t options; extern Empirical empirical;extern Category cat[MAX_CAT];extern category_count_t cat_count;extern Regex re[MAX_RE];extern regex_count_t regex_count;int hashfull_warning;/*********************************************************** * MISCELLANEOUS FUNCTIONS * ***********************************************************/char *sanitize_path(char *in) { char *q; char *path; charbuf_len_t l; /* this bit likely fails in DOS ;-) */ if( (*in != '/') && (*in != '.') && (path = getenv("DBACL_PATH")) ) { l = strlen(path); q = malloc(l + strlen(in) + 3); strcpy(q, path); if( q[l - 1] != '/' ) { q[l] = '/'; q[l + 1] = 0; } strcat(q, in); return q; } else { return in; }}digitized_weight_t digitize_a_weight(weight_t w) { if( w < -320 ) { return DIGITIZED_WEIGHT_MIN; } else if( 320 < w ) { return DIGITIZED_WEIGHT_MAX; } else { return 100 * w; }}/*********************************************************** * EMPIRICAL DISTRIBUTION OF TOKENS * ***********************************************************//* initialize global learner object */void init_empirical(Empirical *emp, hash_count_t dmt, hash_bit_count_t dmhb) { /* some constants */ emp->max_tokens = dmt; emp->max_hash_bits = dmhb; emp->full_token_count = 0; emp->unique_token_count = 0; emp->track_features = 0; emp->feature_stack_top = 0; /* allocate room for hash */ emp->hash = calloc(emp->max_tokens, sizeof(h_item)); if( !emp->hash ) { fprintf(stderr, "error: not enough memory? I couldn't allocate %li bytes\n", (sizeof(h_item) * ((long int)emp->max_tokens))); exit(0); }}void clear_empirical(Empirical *emp) { token_stack_t i; if( emp->track_features ) { /* this may actually be slower than a global memset */ for(i = 0; i < emp->feature_stack_top; i++) { memset(emp->feature_stack[i], 0, sizeof(h_item)); } } else { memset(emp->hash, 0, sizeof(h_item) * emp->max_tokens); } emp->full_token_count = 0; emp->unique_token_count = 0; emp->feature_stack_top = 0; if( options & (1<<OPTION_FASTEMP) ) { emp->track_features = 1; }}h_item *find_in_empirical(Empirical *emp, hash_value_t id) { register h_item *i, *loop; /* start at id */ i = loop = &emp->hash[id & (empirical.max_tokens - 1)]; while( FILLEDP(i) ) { if( EQUALP(i->id,id) ) { return i; /* found id */ } else { i++; /* not found */ /* wrap around */ i = (i >= &emp->hash[emp->max_tokens]) ? emp->hash : i; if( i == loop ) { return NULL; /* when hash table is full */ } } } /* empty slot, so not found */ return i; }/* calculates the entropy of the empirical measure */score_t empirical_entropy(Empirical *emp) { hash_count_t i; score_t e = 0.0; if( emp->track_features && (emp->feature_stack_top < MAX_TOKEN_LINE_STACK) ) { for(i = 0; i < emp->feature_stack_top; i++) { e += ((score_t)emp->feature_stack[i]->count) * log((score_t)emp->feature_stack[i]->count); } } else { for(i = 0; i < emp->max_tokens; i++) { if( FILLEDP(&emp->hash[i]) ) { e += ((score_t)emp->hash[i].count) * log((score_t)emp->hash[i].count); } } } e = e/emp->full_token_count - log((score_t)emp->full_token_count); return -e;}/*********************************************************** * CATEGORY FUNCTIONS * ***********************************************************//* initialize to zero. Filename specified elsewhere */void init_category(Category *cat) { char *p; cat->model_full_token_count = 0; cat->model_unique_token_count = 0; cat->score = 0.0; cat->complexity = 0; cat->max_order = 0; if( (p = strrchr(cat->filename, '/')) ) { cat->filename = p + 1; /* only keep basename */ } cat->retype = 0; cat->model_type = simple; cat->hash = NULL;}/* frees the resrouces associated with a category */void free_category(Category *cat) { if( cat->hash ) { free(cat->hash); }}/* turns purely random text into a category of its own */void init_purely_random_text_category(Category *cat) { alphabet_size_t i, j; weight_t z = -log((double)ASIZE - 1.0);#if defined DIGITIZE_DIGRAMS digitized_weight_t zz = PACK_DIGRAMS(z);#endif for(i = 1; i < ASIZE; i++) { for(j = 1; j < ASIZE; j++) {#if defined DIGITIZE_DIGRAMS cat->dig[i][j] = zz;#else cat->dig[i][j] = z;#endif } } z = -log((double)ASIZE - 2.0);#if defined DIGITIZE_DIGRAMS zz = PACK_DIGRAMS(z);#endif for(j = 1; j < ASIZE; j++) {#if defined DIGITIZE_DIGRAMS cat->dig[DIAMOND][j] = zz;#else cat->dig[DIAMOND][j] = z;#endif } /* not needed: set DIAMOND-DIAMOND score for completeness only */#if defined DIGITIZE_DIGRAMS cat->dig[DIAMOND][DIAMOND] = DIGITIZED_WEIGHT_MIN;#else cat->dig[DIAMOND][DIAMOND] = log(0.0);#endif cat->logZ = 0.0; cat->hash = NULL; cat->model_type = simple; cat->max_order = 1; options |= (1<<OPTION_NOREGEX); }c_item *find_in_category(Category *cat, hash_value_t id) { register c_item *i, *loop; if( cat->hash ) { /* start at id */ i = loop = &cat->hash[id & (cat->max_tokens - 1)]; while( FILLEDP(i) ) { if( EQUALP(i->id,id) ) { return i; /* found id */ } else { i++; /* not found */ /* wrap around */ i = (i >= &cat->hash[cat->max_tokens]) ? cat->hash : i; if( i == loop ) { return NULL; /* when hash table is full */ } } } return i; } else { return NULL; }}/* for each loaded category, this calculates the score */void score_word(char *tok, token_order_t r, regex_count_t re) { category_count_t i; alphabet_size_t pp, pc; hash_value_t id; char *q; register c_item *k; h_item *h = NULL; if( *(tok + 2) ) { /* DIAMOND-DIAMOND is empty string */ id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0); if( (options & (1<<OPTION_CALCENTROPY)) && (r == 1) ) { /* add the token to the hash */ h = find_in_empirical(&empirical, id); if( h ) { if( FILLEDP(h) ) { h->count += ( h->count < K_TOKEN_COUNT_MAX ) ? 1 : 0; } else { if( /* !FILLEDP(i) && */ ((100 * empirical.unique_token_count) < (HASH_FULL * empirical.max_tokens) )) { /* fill the empirical hash */ SET(h->id,id); empirical.unique_token_count += ( empirical.unique_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0; h->count += ( h->count < K_TOKEN_COUNT_MAX ) ? 1 : 0; } else { /* hash full */ h = NULL; if( !hashfull_warning ) { fprintf(stderr,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -