📄 catfun.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*  * Copyright (C) 2002 Laird Breyer *   * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *  * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. *  * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *  * Author:   Laird Breyer <laird@lbreyer.com> */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <string.h>#include <stdlib.h>#include <math.h>#include "dbacl.h"/* look for these where main() is defined *//* these globals will have to be refactored once   I decide on an API for libdbacl ... */extern options_t options; extern Empirical empirical;extern Category cat[MAX_CAT];extern category_count_t cat_count;extern Regex re[MAX_RE];extern regex_count_t regex_count;int hashfull_warning;/*********************************************************** * MISCELLANEOUS FUNCTIONS                                 * ***********************************************************/char *sanitize_path(char *in) {  char *q;  char *path;  charbuf_len_t l;  /* this bit likely fails in DOS ;-) */  if( (*in != '/') && (*in != '.') && (path = getenv("DBACL_PATH")) ) {    l = strlen(path);    q = malloc(l + strlen(in) + 3);    strcpy(q, path);    if( q[l - 1] != '/' ) {      q[l] = '/';       q[l + 1] = 0;    }    strcat(q, in);    return q;  } else {    return in;  }}digitized_weight_t digitize_a_weight(weight_t w) {  if( w < -320 ) {    return DIGITIZED_WEIGHT_MIN;  } else if( 320 < w ) {    return DIGITIZED_WEIGHT_MAX;  } else {    return 100 * w;  }}/*********************************************************** * EMPIRICAL DISTRIBUTION OF TOKENS                        * ***********************************************************//* initialize global learner object */void init_empirical(Empirical *emp, hash_count_t dmt, hash_bit_count_t dmhb) {    /* some constants */    emp->max_tokens = dmt;    emp->max_hash_bits = dmhb;    emp->full_token_count = 0;    emp->unique_token_count = 0;    emp->track_features = 0;    emp->feature_stack_top = 0;    /* allocate room for hash */    emp->hash = calloc(emp->max_tokens, sizeof(h_item));    if( !emp->hash ) {	fprintf(stderr, 		"error: not enough memory? I couldn't allocate %li bytes\n", 		(sizeof(h_item) * ((long int)emp->max_tokens)));	exit(0);    }}void clear_empirical(Empirical *emp) {    token_stack_t i;    if( emp->track_features ) {	/* this may actually be slower than a global memset */ 	for(i = 0; i < emp->feature_stack_top; i++) {	    memset(emp->feature_stack[i], 0, sizeof(h_item));	}    } else {	memset(emp->hash, 0, sizeof(h_item) * emp->max_tokens);    }    emp->full_token_count = 0;    emp->unique_token_count = 0;    emp->feature_stack_top = 0;    if( options & (1<<OPTION_FASTEMP) ) {      emp->track_features = 1;    }}h_item *find_in_empirical(Empirical *emp, hash_value_t id) {  register h_item *i, *loop;  /* start at id */  i = loop = &emp->hash[id & (empirical.max_tokens - 1)];  while( FILLEDP(i) ) {    if( EQUALP(i->id,id) ) {      return i; /* found id */    } else {      i++; /* not found */      /* wrap around */      i = (i >= &emp->hash[emp->max_tokens]) ? emp->hash : i;       if( i == loop ) {	return NULL; /* when hash table is full */      }    }  }  /* empty slot, so not found */  return i; }/* calculates the entropy of the empirical measure */score_t empirical_entropy(Empirical *emp) {    hash_count_t i;    score_t e = 0.0;    if( emp->track_features && 	(emp->feature_stack_top < MAX_TOKEN_LINE_STACK) ) {	for(i = 0; i < emp->feature_stack_top; i++) {	    e += ((score_t)emp->feature_stack[i]->count) * 	      log((score_t)emp->feature_stack[i]->count);	}    } else {	for(i = 0; i < emp->max_tokens; i++) {	    if( FILLEDP(&emp->hash[i]) ) {		e += ((score_t)emp->hash[i].count) * 		  log((score_t)emp->hash[i].count);	    }	}    }    e = e/emp->full_token_count - log((score_t)emp->full_token_count);    return -e;}/*********************************************************** * CATEGORY FUNCTIONS                                      * ***********************************************************//* initialize to zero. Filename specified elsewhere */void init_category(Category *cat) {    char *p;    cat->model_full_token_count = 0;    cat->model_unique_token_count = 0;    cat->score = 0.0;    cat->complexity = 0;    cat->max_order = 0;    if( (p = strrchr(cat->filename, '/')) ) {	cat->filename = p + 1; /* only keep basename */    }    cat->retype = 0;    cat->model_type = simple;    cat->hash = NULL;}/* frees the resrouces associated with a category */void free_category(Category *cat) {  if( cat->hash ) {    free(cat->hash);  }}/* turns purely random text into a category of its own */void init_purely_random_text_category(Category *cat) {  alphabet_size_t i, j;  weight_t z = -log((double)ASIZE - 1.0);#if defined DIGITIZE_DIGRAMS  digitized_weight_t zz = PACK_DIGRAMS(z);#endif      for(i = 1; i < ASIZE; i++) {    for(j = 1; j < ASIZE; j++) {#if defined DIGITIZE_DIGRAMS      cat->dig[i][j] = zz;#else      cat->dig[i][j] = z;#endif    }  }  z = -log((double)ASIZE - 2.0);#if defined DIGITIZE_DIGRAMS  zz = PACK_DIGRAMS(z);#endif  for(j = 1; j < ASIZE; j++) {#if defined DIGITIZE_DIGRAMS    cat->dig[DIAMOND][j] = zz;#else    cat->dig[DIAMOND][j] = z;#endif  }   /* not needed: set DIAMOND-DIAMOND score for completeness only */#if defined DIGITIZE_DIGRAMS  cat->dig[DIAMOND][DIAMOND] = DIGITIZED_WEIGHT_MIN;#else  cat->dig[DIAMOND][DIAMOND] = log(0.0);#endif  cat->logZ = 0.0;  cat->hash = NULL;  cat->model_type = simple;  cat->max_order = 1;  options |= (1<<OPTION_NOREGEX); }c_item *find_in_category(Category *cat, hash_value_t id) {    register c_item *i, *loop;    if( cat->hash ) {	/* start at id */	i = loop = &cat->hash[id & (cat->max_tokens - 1)];	while( FILLEDP(i) ) {	    if( EQUALP(i->id,id) ) {		return i; /* found id */	    } else {		i++; /* not found */		/* wrap around */		i = (i >= &cat->hash[cat->max_tokens]) ? cat->hash : i; 		if( i == loop ) {		    return NULL; /* when hash table is full */		}	    }	}	return i;    } else {	return NULL;    }}/* for each loaded category, this calculates the score */void score_word(char *tok, token_order_t r, regex_count_t re) {  category_count_t i;  alphabet_size_t pp, pc;  hash_value_t id;  char *q;  register c_item *k;  h_item *h = NULL;  if( *(tok + 2) ) { /* DIAMOND-DIAMOND is empty string */    id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);    if( (options & (1<<OPTION_CALCENTROPY)) && (r == 1) ) {      /* add the token to the hash */      h = find_in_empirical(&empirical, id);      if( h ) { 	if( FILLEDP(h) ) {	  h->count += ( h->count < K_TOKEN_COUNT_MAX ) ? 1 : 0; 	} else { 	  if( /* !FILLEDP(i) && */ 	     ((100 * empirical.unique_token_count) < 	      (HASH_FULL * empirical.max_tokens) )) { 	    /* fill the empirical hash */ 	    SET(h->id,id);	    empirical.unique_token_count += 	      ( empirical.unique_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;	    h->count += ( h->count < K_TOKEN_COUNT_MAX ) ? 1 : 0; 	  } else { 	    /* hash full */ 	    h = NULL;  	    if( !hashfull_warning ) { 	      fprintf(stderr,
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -