📄 dbacl.h

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 H
字号:
/*
 * Copyright (C) 2002 Laird Breyer
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Author:   Laird Breyer <laird@lbreyer.com>
 */

/*
 * dbacl.h - shared configuration switches, scalar typedefs, constants,
 * macros, data structures and function prototypes.
 */

#ifndef DBACL_H
#define DBACL_H

/* we define several memory models, which differ basically
   in the number of bytes used for the hash tables. Adjust to taste */

/* use this for 64-bit hashes */
#undef HUGE_MEMORY_MODEL
/* use this for 32-bit hashes */
#define NORMAL_MEMORY_MODEL
/* use this for 16-bit hashes */
#undef SMALL_MEMORY_MODEL
/* use this for 8-bit hashes */
#undef TINY_MEMORY_MODEL

/* the following defines set up a tradeoff between
   modelling accuracy and memory requirements - season to taste */

/* digram digitization: avg loss of precision = 0.01 * token size */
#define DIGITIZE_DIGRAMS
/* lambda digitization: avg loss of precision = 0.01 */
#define DIGITIZE_LAMBDA
/* learner.hash digitization: avg loss of precision = 0.01 */
#define DIGITIZE_LWEIGHTS

/* tolerance for the error in divergence -
   more accuracy means slower learning. */
#define TOL 0.05

/* wide character support is all-or-nothing: without both headers the
   boost regex backend is switched off as well */
#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H
#include <wctype.h>
#include <wchar.h>
#else
#undef HAVE_LIBBOOST_REGEX
#endif

/* wchar_t appears in the w_* prototypes below even when <wchar.h>
   was not included above */
#include <stddef.h>
#include <limits.h>
#include <stdio.h>
/* the u_intN_t types are needed by every configuration, so do not rely
   on the regex header to pull this in */
#include <sys/types.h>

#if defined HAVE_LIBBOOST_REGEX
#define UNICODE
#include <boost/regex.h>
#else
#include <regex.h>
#endif

/* some systems seem to have broken sys/types */
#if defined OS_SUN
#include <ieeefp.h>
typedef uint8_t u_int8_t;
typedef uint16_t u_int16_t;
typedef uint32_t u_int32_t;
typedef uint64_t u_int64_t;
#endif

/* below, FMT_* macros are used in printf/scanf format strings */

#if defined HUGE_MEMORY_MODEL

typedef u_int64_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int64_t hash_value_t;
typedef u_int64_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int32_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef long double score_t;
#define FMT_printf_score_t "Lf"
#define FMT_scanf_score_t "Lf"
/* must be wider than 8 bits in this model: MAX_TOKEN_LINE_STACK is 256
   below, and casting 256 to an 8-bit type would truncate it to 0 */
typedef u_int16_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)18446744073709551615U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)64)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)256)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined NORMAL_MEMORY_MODEL

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int32_t hash_value_t;
typedef u_int32_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int32_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)30)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined SMALL_MEMORY_MODEL

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int16_t hash_value_t;
typedef u_int16_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int16_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)15)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined TINY_MEMORY_MODEL

/* not tested, this model probably doesn't work ;-) */
#undef DIGITIZE_DIGRAMS

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int8_t hash_value_t;
typedef u_int8_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int8_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)8)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)512)

#endif

/* this is common to all memory models */

#if defined OS_DARWIN
/* the system I tested this on didn't seem to like packed structures */
#define PACK_STRUCTS
#else
/* disable this if speed is paramount  */
#define PACK_STRUCTS __attribute__ ((packed))
#endif

/* when digitizing transitions, this stands for -infinity */
#define DIGITIZED_WEIGHT_MIN ((digitized_weight_t)SHRT_MIN)
#define DIGITIZED_WEIGHT_MAX ((digitized_weight_t)SHRT_MAX)

/* maximum number of categories we can handle simultaneously */
#define MAX_CAT ((category_count_t)64)

/* percentage of hash we use */
#define HASH_FULL ((hash_percentage_t)95)

/* alphabet size */
#define ASIZE ((alphabet_size_t)256)

#define DIAMOND '\001'

/* options */
#define OPTION_CLASSIFY     1
#define OPTION_LEARN        2
#define OPTION_FASTEMP      3
#define OPTION_CUTOFF       4
#define OPTION_VERBOSE      5
#define INPUT_FROM_CMDLINE  6
#define OPTION_SCORES       7
#define OPTION_POSTERIOR    8
#define OPTION_IDENTIFY     9
#define OPTION_FILTER       10
#define OPTION_REFMODEL     11
#define OPTION_TEXT_FORMAT  12
#define OPTION_MBOX_FORMAT  13
#define OPTION_XML          14
#define OPTION_DEBUG        15
#define OPTION_I18N         16
#define OPTION_NOREGEX      17
#define OPTION_CASEN        18
#define OPTION_CALCENTROPY  19
#define OPTION_MULTINOMIAL  20
#define OPTION_DUMP         21
#define OPTION_APPEND       22
#define OPTION_DECIMATE     23
#define OPTION_GROWHASH     24
#define OPTION_INDENTED     25

typedef u_int32_t options_t; /* make sure big enough for all options */

typedef long int re_bitfield;

/* maximum number of regular expressions we can handle */
#define MAX_RE ((regex_count_t)(8 * sizeof(re_bitfield)))

/* maximum number of tagged subexpressions we can handle for each regex */
#define MAX_SUBMATCH ((token_order_t)10)

/* macros */

/* used for digitizing: PACK_* goes through digitize_a_weight() (see
   prototype below), UNPACK_* divides the stored value by 100.00; when the
   matching DIGITIZE_* switch is off both directions are plain casts */
#if defined DIGITIZE_LWEIGHTS
#define PACK_LWEIGHTS(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_LWEIGHTS(a) ((weight_t)(a)/100.00)
#else
#define PACK_LWEIGHTS(a) ((weight_t)(a))
#define UNPACK_LWEIGHTS(a) ((weight_t)(a))
#endif

#if defined DIGITIZE_LAMBDA
#define PACK_LAMBDA(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_LAMBDA(a) ((weight_t)(a)/100.00)
#else
#define PACK_LAMBDA(a) ((weight_t)(a))
#define UNPACK_LAMBDA(a) ((weight_t)(a))
#endif

#if defined DIGITIZE_DIGRAMS
#define PACK_DIGRAMS(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_DIGRAMS(a) ((weight_t)(a)/100.00)
#define SIZEOF_DIGRAMS (sizeof(digitized_weight_t))
#else
#define PACK_DIGRAMS(a) ((weight_t)(a))
#define UNPACK_DIGRAMS(a) ((weight_t)(a))
#define SIZEOF_DIGRAMS (sizeof(weight_t))
#endif

/* used in hash code */
#define FILLEDP(a) ((a)->id)
#define EQUALP(a,b) ((a)==(b))
#define SET(a,b) ((a) = (b))
#define SETMARK(a) ((a)->ltrms = 1.0)
#define UNSETMARK(a) ((a)->ltrms = 0.0)
#define MARKEDP(a) ((a)->ltrms == 1.0)

/* used by both category load and learner save functions */
/* VERSION is not defined in this header; it must be a string literal macro
   supplied by the including translation unit or the build. The spaces
   around it keep the concatenation valid when compiled as C++11 or later. */
/* NOTE(review): the %hd and %ld conversions below expect short / long
   arguments - confirm callers convert the narrower typedef'd fields */
#define MAGIC_BUFSIZE 256
#define MAGIC1    "# dbacl version " VERSION " category %s %s\n"
#define MAGIC2_i  "# entropy %" FMT_scanf_score_t \
                  " logZ %" FMT_scanf_score_t " max_order %hd" \
                  " type %s\n"
#define MAGIC2_o  "# entropy %" FMT_printf_score_t \
                  " logZ %" FMT_printf_score_t " max_order %hd" \
                  " type %s\n"
#define MAGIC3    "# hash_size %hd" \
                  " features %ld unique_features %ld" \
                  " documents %ld\n"
#define MAGIC4    "# case sensitive\n"
#define MAGIC5_i  "# regex %s\n"
#define MAGIC5_o  "# regex %s||%s\n"
#define MAGIC5_wo "# regex %ls||%s\n"
#define RESTARTPOS 8
#define MAGIC6    "#\n"
#define MAGIC7    "# wide characters\n"

/* data structures */

/* one slot of the Empirical hash table: a token id and its count */
typedef struct {
  hash_value_t id;
  token_count_t count;
} h_item;

/* hash table of h_item slots with token totals, plus a bounded stack of
   pointers into the table (at most MAX_TOKEN_LINE_STACK entries) */
typedef struct {
  hash_count_t max_tokens;
  hash_bit_count_t max_hash_bits;
  token_count_t full_token_count;
  token_count_t unique_token_count;
  h_item *hash;
  bool_t track_features;
  h_item *feature_stack[MAX_TOKEN_LINE_STACK];
  token_stack_t feature_stack_top;
} Empirical;

/* one slot of a Category hash table; lam is stored digitized when
   DIGITIZE_LAMBDA is defined */
typedef struct {
  hash_value_t id;
#if defined DIGITIZE_LAMBDA
  digitized_weight_t lam;
#else
  weight_t lam;
#endif
} PACK_STRUCTS c_item;

/* a category: file name, model parameters, scores, a hash table of
   c_item slots and an ASIZE x ASIZE digram table (digitized when
   DIGITIZE_DIGRAMS is defined) */
typedef struct {
  char *filename;
  enum {simple, sequential} model_type;
  token_order_t max_order;
  token_count_t complexity;
  token_count_t model_unique_token_count;
  token_count_t model_full_token_count;
  document_count_t model_num_docs;
  hash_count_t max_tokens;
  hash_bit_count_t max_hash_bits;
  re_bitfield retype;
  score_t logZ;
  score_t divergence;
  score_t score;
  c_item *hash;
#if defined DIGITIZE_DIGRAMS
  digitized_weight_t dig[ASIZE][ASIZE];
#else
  weight_t dig[ASIZE][ASIZE];
#endif
} Category;

/* one slot of the Learner hash table; ltrms doubles as the mark flag
   used by SETMARK/UNSETMARK/MARKEDP above */
typedef struct {
  token_count_t count;
  weight_t lam;
#if defined DIGITIZE_LWEIGHTS
  digitized_weight_t ltrms;
  digitized_weight_t dref;
#else
  weight_t ltrms;
  weight_t dref;
#endif
  hash_value_t id;
  token_order_t order;
} PACK_STRUCTS l_item;

/* learner state: output file name, a FILE handle, per-order and overall
   token counts, a hash table of l_item slots, an undigitized digram
   table, per-regex token counts and a document count */
typedef struct {
  char *filename;
  FILE *tmp;
  token_order_t max_order;
  token_count_t fixed_order_token_count[MAX_SUBMATCH];
  token_count_t fixed_order_unique_token_count[MAX_SUBMATCH];
  hash_bit_count_t max_hash_bits;
  hash_count_t max_tokens;
  token_count_t full_token_count;
  token_count_t unique_token_count;
  score_t logZ;
  score_t divergence;
  l_item *hash;
  weight_t dig[ASIZE][ASIZE];
  long int regex_token_count[MAX_RE + 1];
  document_count_t num_docs;
} Learner;

/* a scalar alpha and one double per alphabet symbol */
typedef struct {
  double alpha;
  double u[ASIZE];
} Dirichlet;

/* a compiled regex, its source string (wide when the boost backend is
   in use) and a bitmap of submatches */
typedef struct {
  regex_t regex;
#if defined HAVE_LIBBOOST_REGEX
  wchar_t *string;
#else
  char *string;
#endif
  smbitmap_t submatches;
} Regex;

/* state carried between calls of the mbox line filter */
typedef struct {
  enum { UNDEF, HEADER, BODY} state;
  bool_t prev_line_empty;
  bool_t mime_type;
  bool_t checked_content_type;
} MBOX_State;

/* state carried between calls of the xml character filter */
typedef struct {
  enum {TEXT, TAG, COMMENT, SPECIAL} state;
} XML_State;

#ifdef __cplusplus
extern "C"
{
#endif

  /* these are defined in catfun.c */
  char *sanitize_path(char *in);
  digitized_weight_t digitize_a_weight(weight_t w);

  void init_empirical(Empirical *emp, hash_count_t dmt, hash_bit_count_t dmhb);
  void clear_empirical(Empirical *emp);
  h_item *find_in_empirical(Empirical *emp, hash_value_t id);
  score_t empirical_entropy(Empirical *emp);

  void init_category(Category *cat);
  void free_category(Category *cat);
  c_item *find_in_category(Category *cat, hash_value_t id);
  void init_purely_random_text_category(Category *cat);
  error_code_t load_category(Category *cat);

  void score_word(char *tok, token_order_t r, regex_count_t re);

  /* string hash function in jenkins.c */
  unsigned long int hash(unsigned char *k,
                         unsigned long int length,
                         unsigned long int initval);

  /* file format handling in fh.c */
  void init_file_handling(void);
  void cleanup_file_handling(void);
  void reset_mbox_line_filter(void);
  void reset_xml_character_filter(void);

  bool_t mbox_line_filter(char *line);
  void xml_character_filter(char *line);
  void process_file(FILE *input,
                    int (*line_filter)(char *),
                    void (*character_filter)(char *),
                    void (*word_fun)(char *, token_order_t, regex_count_t),
                    char *(*pre_line_fun)(char *),
                    void (*post_line_fun)(char *));

  bool_t w_mbox_line_filter(wchar_t *line);
  void w_xml_character_filter(wchar_t *line);
  void w_process_file(FILE *input,
                      int (*line_filter)(wchar_t *),
                      void (*character_filter)(wchar_t *),
                      void (*word_fun)(char *, token_order_t, regex_count_t),
                      char *(*pre_line_fun)(char *),
                      void (*post_line_fun)(char *));

  /* probabilities in probs.c */
  double log_poisson(int k, double lambda);

#ifdef __cplusplus
}
#endif

#endif /* DBACL_H */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -