📄 libbow.h
字号:
void bow_stoplist_replace_with_file (const char *filename);

/* Add WORD to the stop list. */
void bow_stoplist_add_word (const char *word);


/* Arrays of C struct's that can grow. */

typedef struct _bow_array {
  int length;                 /* number of elements in the array */
  int size;                   /* number of elts for which alloc'ed space */
  int entry_size;             /* number of bytes in each entry */
  void (*free_func)(void*);   /* call this with each entry when freeing */
  int growth_factor;          /* mult, then divide by 1-this when realloc */
  void *entries;              /* the malloc'ed space for the entries */
} bow_array;

extern int bow_array_default_capacity;
extern int bow_array_default_growth_factor;

/* Allocate, initialize and return a new array structure. */
bow_array *bow_array_new (int capacity, int entry_size,
                          void (*free_func)());

/* Initialize an already allocated array structure. */
void bow_array_init (bow_array *array, int capacity, int entry_size,
                     void (*free_func)());

/* Append an entry to the array.  Return its index. */
int bow_array_append (bow_array *array, void *entry);

/* Append an entry to the array by reading from fp.  Return its index,
   or -1 if there are no more entries to be read. */
int bow_array_append_from_fp_inc (bow_array *array,
                                  int (*read_func)(void*,FILE*),
                                  FILE *fp);

/* Return what will be the index of the next entry to be appended */
int bow_array_next_index (bow_array *array);

/* Return a pointer to the array entry at index INDEX. */
void *bow_array_entry_at_index (bow_array *array, int index);

/* Write the array ARRAY to the file-pointer FP, using the function
   WRITE_FUNC to write each of the entries in ARRAY. */
void bow_array_write (bow_array *array, int (*write_func)(void*,FILE*),
                      FILE *fp);

/* Write the incremental format header to the file-pointer FP */
void bow_array_write_header_inc (bow_array *array, FILE *fp);

/* Write one entry in incremental format to the file-pointer FP, using
   the function WRITE_FUNC.  It will fseek to the appropriate location
   to write. */
void bow_array_write_entry_inc (bow_array *array, int i,
                                int (*write_func)(void*,FILE*),
                                FILE *fp);

/* Return a new array, created by reading file-pointer FP, and using
   the function READ_FUNC to read each of the array entries.  The
   returned array will have entry-freeing-function FREE_FUNC. */
bow_array *bow_array_new_from_fp_inc (int (*read_func)(void*,FILE*),
                                      void (*free_func)(),
                                      FILE *fp);

/* Return a new array, created by reading file-pointer FP, and using
   the function READ_FUNC to read each of the array entries.  The
   returned array will have entry-freeing-function FREE_FUNC. */
bow_array *bow_array_new_from_data_fp (int (*read_func)(void*,FILE*),
                                       void (*free_func)(),
                                       FILE *fp);

/* Return a new array, created by reading file-pointer FP, and using
   the function READ_FUNC to read each of the array entries.  The
   array entries will have size MIN_ENTRY_SIZE, or larger, if
   indicated by the data in FP; this is useful when a structure is
   re-defined to be larger.  The returned array will have
   entry-freeing-function FREE_FUNC. */
bow_array *bow_array_new_with_entry_size_from_data_fp
  (int min_entry_size,
   int (*read_func)(void*,FILE*),
   void (*free_func)(),
   FILE *fp);

/* Free the memory held by the array ARRAY. */
void bow_array_free (bow_array *array);


/* Sparse arrays of C struct's that can grow. */

typedef struct _bow_sparray {
  void **entry_root;
  int entry_size;             /* number of bytes in each entry */
  void (*free_func)(void*);   /* call this with each entry when freeing */
} bow_sparray;


/* Managing int->string and string->int mappings. */

typedef struct _bow_int4str {
  const char **str_array;
  int str_array_length;
  int str_array_size;
  int *str_hash;
  int str_hash_size;
} bow_int4str;

/* Allocate, initialize and return a new int/string mapping structure.
   The parameter CAPACITY is used as a hint about the number of words
   to expect; if you don't know or don't care about a CAPACITY value,
   pass 0, and a default value will be used. */
bow_int4str *bow_int4str_new (int capacity);

/* Given an integer INDEX, return its corresponding string. */
const char *bow_int2str (bow_int4str *map, int index);

/* Given the char-pointer STRING, return its integer index.  If this is
   the first time we're seeing STRING, add it to the mapping, assign
   it a new index, and return the new index. */
int bow_str2int (bow_int4str *map, const char *string);

/* Just like BOW_STR2INT, except assume that the STRING's ID has
   already been calculated. */
int _bow_str2int (bow_int4str *map, const char *string, unsigned id);

/* Given the char-pointer STRING, return its integer index.  If STRING
   is not yet in the mapping, return -1. */
int bow_str2int_no_add (bow_int4str *map, const char *string);

/* Create a new int-str mapping by lexing words from FILE. */
bow_int4str *bow_int4str_new_from_text_file (const char *filename);

/* Create a new int-str mapping from words fscanf'ed from FILE using %s. */
bow_int4str *bow_int4str_new_from_string_file (const char *filename);

/* Write the int-str mapping to file-pointer FP. */
void bow_int4str_write (bow_int4str *map, FILE *fp);

/* Return a new int-str mapping, created by reading file-pointer FP. */
bow_int4str *bow_int4str_new_from_fp (FILE *fp);

/* Same as above, but in incremental format. */
bow_int4str *bow_int4str_new_from_fp_inc (FILE *fp);

/* Return a new int-str mapping, created by reading FILENAME. */
bow_int4str *bow_int4str_new_from_file (const char *filename);

/* Free the memory held by the int-word mapping MAP. */
void bow_int4str_free (bow_int4str *map);


/* Arrays of C struct's that can grow.  Entries can be retrieved
   either by integer index, or by string key. */

typedef struct _bow_sarray {
  bow_array *array;
  bow_int4str *i4k;
} bow_sarray;

extern int bow_sarray_default_capacity;

/* Allocate, initialize and return a new sarray structure. */
bow_sarray *bow_sarray_new (int capacity, int entry_size,
                            void (*free_func)());

/* Initialize a newly allocated sarray structure. */
void bow_sarray_init (bow_sarray *sa, int capacity, int entry_size,
                      void (*free_func)());

/* Append a new entry to the array.  Also make the entry accessible by
   the string KEYSTR.  Returns the index of the new entry. */
int bow_sarray_add_entry_with_keystr (bow_sarray *sa, void *entry,
                                      const char *keystr);

/* Append a new entry to the array.  Also make the entry accessible by
   the string KEYSTR.  Reflect changes on disk.  Returns the index of
   the new entry. */
int bow_sarray_add_entry_with_keystr_inc (bow_sarray *sa, void *entry,
                                          const char *keystr,
                                          int (*write_func)(void*,FILE*),
                                          FILE *i4k_fp, FILE *array_fp);

/* Return a pointer to the entry at index INDEX. */
void *bow_sarray_entry_at_index (bow_sarray *sa, int index);

/* Return a pointer to the entry associated with string KEYSTR. */
void *bow_sarray_entry_at_keystr (bow_sarray *sa, const char *keystr);

/* Return the string KEYSTR associated with the entry at index INDEX. */
const char *bow_sarray_keystr_at_index (bow_sarray *sa, int index);

/* Return the index of the entry associated with the string KEYSTR. */
int bow_sarray_index_at_keystr (bow_sarray *sa, const char *keystr);

/* Write the sarray SARRAY to the file-pointer FP, using the function
   WRITE_FUNC to write each of the entries in SARRAY. */
void bow_sarray_write (bow_sarray *sarray, int (*write_func)(void*,FILE*),
                       FILE *fp);

/* Return a new sarray, created by reading file-pointer FP, and using
   the function READ_FUNC to read each of the sarray entries.  The
   returned sarray will have entry-freeing-function FREE_FUNC. */
bow_sarray *bow_sarray_new_from_data_fp (int (*read_func)(void*,FILE*),
                                         void (*free_func)(),
                                         FILE *fp);

/* Return a new sarray, created by reading file-pointers I4K_FP and
   ARRAY_FP, and using the function READ_FUNC to read each of the
   incremental-format array entries from ARRAY_FP.  The returned sarray
   will have entry-freeing-function FREE_FUNC. */
bow_sarray *bow_sarray_new_from_data_fps_inc (int (*read_func)(void*,FILE*),
                                              void (*free_func)(),
                                              FILE *i4k_fp, FILE *array_fp);

/* Free the memory held by the bow_sarray SA. */
void bow_sarray_free (bow_sarray *sa);


/* Bit vectors, indexed by multiple dimensions.  They can grow
   automatically in the last dimension. */

typedef struct _bow_bitvec {
  int num_dimensions;         /* the number of dimensions by which indexed */
  int *dimension_sizes;       /* the sizes of each index dimension */
  int vector_size;            /* size of VECTOR in bytes */
  int bits_set;               /* number of bits set to 1 */
  unsigned char *vector;      /* the memory for the bit vector */
} bow_bitvec;

/* Allocate, initialize and return a new "bit vector" that is indexed
   by NUM_DIMENSIONS different dimensions.  The size of each dimension
   is given in DIMENSION_SIZES.  The size of the last dimension is
   used as hint for allocating initial memory for the vector, but in
   practice, higher indices for the last dimension can be used later,
   and the bit vector will grow automatically.  Initially, the bit
   vector contains all zeros. */
bow_bitvec *bow_bitvec_new (int num_dimensions, int *dimension_sizes);

/* Set all the bits in the bit vector BV to 0 if value is zero, to 1
   otherwise. */
void bow_bitvec_set_all_to_value (bow_bitvec *bv, int value);

/* If VALUE is non-zero, set the bit at indices INDICES to 1,
   otherwise set it to zero.  Returns the previous value of that
   bit. */
int bow_bitvec_set (bow_bitvec *bv, int *indices, int value);

/* Return the value of the bit at indices INDICES. */
int bow_bitvec_value (bow_bitvec *bv, int *indices);

/* Free the memory held by the "bit vector" BV. */
void bow_bitvec_free (bow_bitvec *bv);


/* A trie for testing membership in a set of lowercase alphabetic
   strings */

typedef struct _bow_strtrie {
  struct _bow_strtrie *next[27];
} bow_strtrie;

/* Return a new strtrie */
bow_strtrie *bow_strtrie_new (void);

/* Add the string STR to the trie STRIE */
void bow_strtrie_add (bow_strtrie *strie, const char *str);

/* Return non-zero if the string STR is present in the trie STRIE */
int bow_strtrie_present (bow_strtrie *strie, const char *str);

/* Free the memory occupied by STRIE */
void bow_strtrie_free (bow_strtrie *strie);


/* A convenient interface to a specific instance of the above
   int/string mapping; this one is intended for all the words
   encountered in all documents. */

/* Given a "word index" WI, return its WORD, according to the global
   word-int mapping. */
const char *bow_int2word (int wi);

/* Given a WORD, return its "word index", WI, according to the global
   word-int mapping; if it's not yet in the mapping, add it. */
int bow_word2int (const char *word);

/* Given a WORD, return its "word index", WI, according to the global
   word-int mapping; if it's not yet in the mapping, add it, and write
   the word to file-pointer fp as well. */
int bow_word2int_inc (const char *word, FILE *fp);

/* Given a WORD, return its "word index", WI, according to the global
   word-int mapping; if it's not yet in the mapping, return -1. */
int bow_word2int_no_add (const char *word);

/* Like bow_word2int(), except it also increments the occurrence count
   associated with WORD. */
int bow_word2int_add_occurrence (const char *word);

/* The int/string mapping for bow's vocabulary words. */
extern bow_int4str *word_map;

/* If this is non-zero, then bow_word2int() will return -1 when asked
   for the index of a word that is not already in the mapping.
   Essentially, setting this to non-zero makes bow_word2int() and
   bow_word2int_add_occurrence() behave like bow_str2int_no_add(). */
extern int bow_word2int_do_not_add;

/* If this is non-zero and bow_word2int_do_not_add is non-zero, then
   bow_word2int() will return the index of the "<unknown>" token when
   asked for the index of a word that is not already in the mapping. */
extern int bow_word2int_use_unknown_word;
#define BOW_UNKNOWN_WORD "<unknown>"

/* Add to the word occurrence counts from the documents in FILENAME. */
int bow_words_add_occurrences_from_file (const char *filename);

/* Add to the word occurrence counts by recursively descending directory
   DIRNAME and lexing all the text files; skip any files matching
   EXCEPTION_NAME. */
int bow_words_add_occurrences_from_text_dir (const char *dirname,
                                             const char *exception_name);

/* Add to the word occurrence counts reading all entries in HDB
   database DIRNAME and parsing all the text files; skip any files
   matching EXCEPTION_NAME. */
int bow_words_add_occurrences_from_hdb (const char *dirname,
                                        const char *exception_name);

/* Return the number of times bow_word2int_add_occurrence() was
   called with the word whose index is WI. */
int bow_words_occurrences_for_wi (int wi);

/* Replace the current word/int mapping with MAP. */
void bow_words_set_map (bow_int4str *map, int free_old_map);

/* Modify the int/word mapping by removing all words that occurred
   less than OCCUR number of times.  WARNING: This totally changes
   the word/int mapping; any WV's, WI2DVF's or BARREL's you build with
   the old mapping will have bogus WI's afterward. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -