📄 dbacl.h
字号:
/* 
 * Copyright (C) 2002 Laird Breyer
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * 
 * Author:   Laird Breyer <laird@lbreyer.com>
 */

/* include guard: this header defines typedefs and anonymous structs,
   which must not be seen twice by the same translation unit */
#ifndef DBACL_H
#define DBACL_H

/* we define several memory models, which differ basically
   in the number of bytes used for the hash tables. Adjust to taste */

/* use this for 64-bit hashes */
#undef HUGE_MEMORY_MODEL
/* use this for 32-bit hashes */
#define NORMAL_MEMORY_MODEL
/* use this for 16-bit hashes */
#undef SMALL_MEMORY_MODEL
/* use this for 8-bit hashes */
#undef TINY_MEMORY_MODEL

/* the following defines set up a tradeoff between
   modelling accuracy and memory requirements - season to taste */

/* digram digitization: avg loss of precision = 0.01 * token size */
#define DIGITIZE_DIGRAMS
/* lambda digitization: avg loss of precision = 0.01 */
#define DIGITIZE_LAMBDA
/* learner.hash digitization: avg loss of precision = 0.01 */
#define DIGITIZE_LWEIGHTS
/* tolerance for the error in divergence - more accuracy means
   slower learning. */
#define TOL 0.05

#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H
#include <wctype.h>
#include <wchar.h>
#else
#undef HAVE_LIBBOOST_REGEX
#endif

#include <limits.h>
/* wchar_t appears in the w_* prototypes below even when <wchar.h>
   was not included above, so make sure it is always declared */
#include <stddef.h>
#include <stdio.h>

#if defined HAVE_LIBBOOST_REGEX
#define UNICODE
#include <boost/regex.h>
#else
#include <sys/types.h>
#include <regex.h>
#endif

/* some systems seem to have broken sys/types */
#if defined OS_SUN
#include <ieeefp.h>
typedef uint8_t u_int8_t;
typedef uint16_t u_int16_t;
typedef uint32_t u_int32_t;
typedef uint64_t u_int64_t;
#endif

/* below, FMT_* macros are used in printf/scanf format strings */
#if defined HUGE_MEMORY_MODEL

typedef u_int64_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int64_t hash_value_t;
typedef u_int64_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int32_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef long double score_t;
#define FMT_printf_score_t "Lf"
#define FMT_scanf_score_t "Lf"
/* must be wider than 8 bits in this model: MAX_TOKEN_LINE_STACK is 256,
   which an 8-bit type would truncate to 0 */
typedef u_int16_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)18446744073709551615U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)64)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)256)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined NORMAL_MEMORY_MODEL

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int32_t hash_value_t;
typedef u_int32_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int32_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)30)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined SMALL_MEMORY_MODEL

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int16_t hash_value_t;
typedef u_int16_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int16_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

/* where token counts wrap around */
#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)15)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)1024)

#elif defined TINY_MEMORY_MODEL
/* not tested, this model probably doesn't work ;-) */
#undef DIGITIZE_DIGRAMS

typedef u_int32_t token_count_t;
typedef u_int8_t token_order_t;
typedef u_int8_t hash_bit_count_t;
typedef u_int8_t hash_value_t;
typedef u_int8_t hash_count_t;
typedef unsigned int hash_percentage_t;
typedef u_int8_t category_count_t;
typedef u_int8_t regex_count_t;
typedef u_int8_t document_count_t;
typedef int16_t digitized_weight_t;
typedef float weight_t;
typedef double score_t;
#define FMT_printf_score_t "f"
#define FMT_scanf_score_t "lf"
typedef u_int8_t token_stack_t;
typedef int charbuf_len_t;
typedef u_int16_t alphabet_size_t;
typedef u_int16_t smbitmap_t;
typedef int error_code_t;
typedef int bool_t;

#define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U)
/* size of hash in bits */
#define MAX_HASH_BITS ((hash_bit_count_t)8)
/* maximum size of a token, beyond that rest is ignored */
#define MAX_TOKEN_LEN ((charbuf_len_t)100)
/* for line filtering: maximum number of tokens allowed on a single line */
#define MAX_TOKEN_LINE_STACK ((token_stack_t)128)
#define BUFLEN ((charbuf_len_t)512)

#endif

/* this is common to all memory models */

#if defined OS_DARWIN
/* the system I tested this on didn't seem to like packed structures */
#define PACK_STRUCTS
#else
/* disable this if speed is paramount */
#define PACK_STRUCTS __attribute__ ((packed))
#endif

/* when digitizing transitions, this stands for -infinity */
#define DIGITIZED_WEIGHT_MIN ((digitized_weight_t)SHRT_MIN)
#define DIGITIZED_WEIGHT_MAX ((digitized_weight_t)SHRT_MAX)
/* maximum number of categories we can handle simultaneously */
#define MAX_CAT ((category_count_t)64)
/* percentage of hash we use */
#define HASH_FULL ((hash_percentage_t)95)
/* alphabet size */
#define ASIZE ((alphabet_size_t)256)

#define DIAMOND '\001'

/* options */
#define OPTION_CLASSIFY 1
#define OPTION_LEARN 2
#define OPTION_FASTEMP 3
#define OPTION_CUTOFF 4
#define OPTION_VERBOSE 5
#define INPUT_FROM_CMDLINE 6
#define OPTION_SCORES 7
#define OPTION_POSTERIOR 8
#define OPTION_IDENTIFY 9
#define OPTION_FILTER 10
#define OPTION_REFMODEL 11
#define OPTION_TEXT_FORMAT 12
#define OPTION_MBOX_FORMAT 13
#define OPTION_XML 14
#define OPTION_DEBUG 15
#define OPTION_I18N 16
#define OPTION_NOREGEX 17
#define OPTION_CASEN 18
#define OPTION_CALCENTROPY 19
#define OPTION_MULTINOMIAL 20
#define OPTION_DUMP 21
#define OPTION_APPEND 22
#define OPTION_DECIMATE 23
#define OPTION_GROWHASH 24
#define OPTION_INDENTED 25

typedef u_int32_t options_t; /* make sure big enough for all options */

typedef long int re_bitfield;
/* maximum number of regular expressions we can handle */
#define MAX_RE ((regex_count_t)(8 * sizeof(re_bitfield)))
/* maximum number of tagged subexpressions we can handle for each regex */
#define MAX_SUBMATCH ((token_order_t)10)

/* macros */

/* used for digitizing: PACK_* goes through digitize_a_weight(),
   UNPACK_* divides the stored value by 100 */
#if defined DIGITIZE_LWEIGHTS
#define PACK_LWEIGHTS(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_LWEIGHTS(a) ((weight_t)(a)/100.00)
#else
#define PACK_LWEIGHTS(a) ((weight_t)(a))
#define UNPACK_LWEIGHTS(a) ((weight_t)(a))
#endif

#if defined DIGITIZE_LAMBDA
#define PACK_LAMBDA(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_LAMBDA(a) ((weight_t)(a)/100.00)
#else
#define PACK_LAMBDA(a) ((weight_t)(a))
#define UNPACK_LAMBDA(a) ((weight_t)(a))
#endif

#if defined DIGITIZE_DIGRAMS
#define PACK_DIGRAMS(a) ((digitized_weight_t)(digitize_a_weight(a)))
#define UNPACK_DIGRAMS(a) ((weight_t)(a)/100.00)
#define SIZEOF_DIGRAMS (sizeof(digitized_weight_t))
#else
#define PACK_DIGRAMS(a) ((weight_t)(a))
#define UNPACK_DIGRAMS(a) ((weight_t)(a))
#define SIZEOF_DIGRAMS (sizeof(weight_t))
#endif

/* used in hash code */
#define FILLEDP(a) ((a)->id)
#define EQUALP(a,b) ((a)==(b))
/* both arguments are parenthesized so any lvalue expression is safe */
#define SET(a,b) ((a) = (b))

#define SETMARK(a) ((a)->ltrms = 1.0)
#define UNSETMARK(a) ((a)->ltrms = 0.0)
#define MARKEDP(a) ((a)->ltrms == 1.0)

/* used by both category load and learner save functions */
#define MAGIC_BUFSIZE 256
/* VERSION is not defined in this header; it must be a string literal
   macro supplied by the includer.  The spaces around it keep C++11
   from reading it as a literal suffix; the resulting string is the same */
#define MAGIC1 "# dbacl version " VERSION " category %s %s\n"
#define MAGIC2_i "# entropy %" FMT_scanf_score_t \
                 " logZ %" FMT_scanf_score_t " max_order %hd" \
                 " type %s\n"
#define MAGIC2_o "# entropy %" FMT_printf_score_t \
                 " logZ %" FMT_printf_score_t " max_order %hd" \
                 " type %s\n"
#define MAGIC3 "# hash_size %hd" \
               " features %ld unique_features %ld" \
               " documents %ld\n"
#define MAGIC4 "# case sensitive\n"
#define MAGIC5_i "# regex %s\n"
#define MAGIC5_o "# regex %s||%s\n"
#define MAGIC5_wo "# regex %ls||%s\n"
#define RESTARTPOS 8
#define MAGIC6 "#\n"
#define MAGIC7 "# wide characters\n"

/* data structures */

/* element type of Empirical.hash and Empirical.feature_stack */
typedef struct {
  hash_value_t id;
  token_count_t count;
} h_item;

typedef struct {
  hash_count_t max_tokens;
  hash_bit_count_t max_hash_bits;
  token_count_t full_token_count;
  token_count_t unique_token_count;
  h_item *hash;
  bool_t track_features;
  h_item *feature_stack[MAX_TOKEN_LINE_STACK];
  token_stack_t feature_stack_top;
} Empirical;

/* element type of Category.hash; lam is stored digitized
   when DIGITIZE_LAMBDA is defined */
typedef struct {
  hash_value_t id;
#if defined DIGITIZE_LAMBDA
  digitized_weight_t lam;
#else
  weight_t lam;
#endif
} PACK_STRUCTS c_item;

typedef struct {
  char *filename;
  enum {simple, sequential} model_type;
  token_order_t max_order;
  token_count_t complexity;
  token_count_t model_unique_token_count;
  token_count_t model_full_token_count;
  document_count_t model_num_docs;
  hash_count_t max_tokens;
  hash_bit_count_t max_hash_bits;
  re_bitfield retype;
  score_t logZ;
  score_t divergence;
  score_t score;
  c_item *hash;
#if defined DIGITIZE_DIGRAMS
  digitized_weight_t dig[ASIZE][ASIZE];
#else
  weight_t dig[ASIZE][ASIZE];
#endif
} Category;

/* element type of Learner.hash; ltrms doubles as the mark flag
   used by SETMARK/UNSETMARK/MARKEDP */
typedef struct {
  token_count_t count;
  weight_t lam;
#if defined DIGITIZE_LWEIGHTS
  digitized_weight_t ltrms;
  digitized_weight_t dref;
#else
  weight_t ltrms;
  weight_t dref;
#endif
  hash_value_t id;
  token_order_t order;
} PACK_STRUCTS l_item;

typedef struct {
  char *filename;
  FILE *tmp;
  token_order_t max_order;
  token_count_t fixed_order_token_count[MAX_SUBMATCH];
  token_count_t fixed_order_unique_token_count[MAX_SUBMATCH];
  hash_bit_count_t max_hash_bits;
  hash_count_t max_tokens;
  token_count_t full_token_count;
  token_count_t unique_token_count;
  score_t logZ;
  score_t divergence;
  l_item *hash;
  weight_t dig[ASIZE][ASIZE];
  long int regex_token_count[MAX_RE + 1];
  document_count_t num_docs;
} Learner;

typedef struct {
  double alpha;
  double u[ASIZE];
} Dirichlet;

/* a compiled regex together with its source string; the string is
   wide only when built against libboost_regex */
typedef struct {
  regex_t regex;
#if defined HAVE_LIBBOOST_REGEX
  wchar_t *string;
#else
  char *string;
#endif
  smbitmap_t submatches;
} Regex;

typedef struct {
  enum { UNDEF, HEADER, BODY} state;
  bool_t prev_line_empty;
  bool_t mime_type;
  bool_t checked_content_type;
} MBOX_State;

typedef struct {
  enum {TEXT, TAG, COMMENT, SPECIAL} state;
} XML_State;

#ifdef __cplusplus
extern "C"
{
#endif

  /* these are defined in catfun.c */
  char *sanitize_path(char *in);
  digitized_weight_t digitize_a_weight(weight_t w);

  void init_empirical(Empirical *emp, hash_count_t dmt, hash_bit_count_t dmhb);
  void clear_empirical(Empirical *emp);
  h_item *find_in_empirical(Empirical *emp, hash_value_t id);
  score_t empirical_entropy(Empirical *emp);

  void init_category(Category *cat);
  void free_category(Category *cat);
  c_item *find_in_category(Category *cat, hash_value_t id);
  void init_purely_random_text_category(Category *cat);
  error_code_t load_category(Category *cat);

  void score_word(char *tok, token_order_t r, regex_count_t re);

  /* string hash function in jenkins.c */
  unsigned long int hash(unsigned char *k,
                         unsigned long int length,
                         unsigned long int initval);

  /* file format handling in fh.c */
  /* (void) makes these real prototypes in C; an empty () would leave
     the argument list unchecked */
  void init_file_handling(void);
  void cleanup_file_handling(void);

  void reset_mbox_line_filter(void);
  void reset_xml_character_filter(void);

  bool_t mbox_line_filter(char *line);
  void xml_character_filter(char *line);
  void process_file(FILE *input,
                    int (*line_filter)(char *),
                    void (*character_filter)(char *),
                    void (*word_fun)(char *, token_order_t, regex_count_t),
                    char *(*pre_line_fun)(char *),
                    void (*post_line_fun)(char *));

  /* wide character counterparts of the three functions above */
  bool_t w_mbox_line_filter(wchar_t *line);
  void w_xml_character_filter(wchar_t *line);
  void w_process_file(FILE *input,
                      int (*line_filter)(wchar_t *),
                      void (*character_filter)(wchar_t *),
                      void (*word_fun)(char *, token_order_t, regex_count_t),
                      char *(*pre_line_fun)(char *),
                      void (*post_line_fun)(char *));

  /* probabilities in probs.c */
  double log_poisson(int k, double lambda);

#ifdef __cplusplus
}
#endif

#endif /* DBACL_H */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -