📄 libbow.h
字号:
void bow_words_remove_occurrences_less_than (int occur);

/* Return the total number of unique words in the int/word map. */
int bow_num_words (void);

/* Save the int/word map to file-pointer FP. */
void bow_words_write (FILE *fp);

/* Same as above, but with a filename instead of a FILE* */
void bow_words_write_to_file (const char *filename);

/* Read the int/word map from file-pointer FP. */
void bow_words_read_from_fp (FILE *fp);

/* Same as above, but with a filename instead of a FILE* */
void bow_words_read_from_file (const char *filename);

/* Same as above, but don't bother rereading unless filename is different
   from the last one, or FORCE_UPDATE is non-zero. */
void bow_words_reread_from_file (const char *filename, int force_update);

/* Read the int/word map (in incremental format) from file-pointer FP. */
void bow_words_read_from_fp_inc (FILE *fp);


/* Lists of words sorted by some score, for example, infogain */

/* An entry for a word and its score */
typedef struct _bow_ws {
  int wi;                       /* word index */
  float weight;                 /* the score attached to WI */
} bow_ws;

/* An array of words and their scores. */
typedef struct _bow_wa {
  int size;                     /* NOTE(review): presumably the allocated
                                   capacity of ENTRY -- confirm */
  int length;                   /* NOTE(review): presumably the number of
                                   entries in use -- confirm */
  bow_ws *entry;
} bow_wa;

/* Create a new, empty array of word/score entries, with CAPACITY entries. */
bow_wa *bow_wa_new (int capacity);

/* Add a new word and score to the array */
int bow_wa_append (bow_wa *wa, int wi, float score);

/* Add a score to the array.  If there is already an entry for WI, the
   SCORE gets added to WI's current score.  If WI is not already in
   the array, then this function behaves like bow_wa_append(). */
int bow_wa_add (bow_wa *wa, int wi, float score);

/* Add a score to the array.  If there is already an entry for WI at
   the end, the SCORE gets added to WI's current score.  If WI is
   greater than the WI at the end, then this function behaves like
   bow_wa_append(), otherwise an error is raised. */
int bow_wa_add_to_end (bow_wa *wa, int wi, float score);

/* Remove the entry corresponding to word WI.  Return the new length
   of the word array. */
int bow_wa_remove (bow_wa *wa, int wi);

/* Add to WA all the WI/WEIGHT entries from WA2.  Uses bow_wa_add(). */
int bow_wa_union (bow_wa *wa, bow_wa *wa2);

/* Return a new array containing only WI entries that are in both
   WA1 and WA2. */
bow_wa *bow_wa_intersection (bow_wa *wa1, bow_wa *wa2);

/* Add weights to WA1 for those entries appearing in WA2 */
int bow_wa_overlay (bow_wa *wa1, bow_wa *wa2);

/* Return a new array containing only WI entries that are in WA1 but
   not in WA2. */
bow_wa *bow_wa_diff (bow_wa *wa1, bow_wa *wa2);

/* Sort the word array with high values first. */
void bow_wa_sort (bow_wa *wa);

/* Sort the word array with high values last. */
void bow_wa_sort_reverse (bow_wa *wa);

/* Print the first N entries of the word array WA to stream FP. */
void bow_wa_fprintf (bow_wa *wa, FILE *fp, int n);

/* Remove all entries from the word array */
void bow_wa_empty (bow_wa *wa);

/* Free the word array */
void bow_wa_free (bow_wa *wa);


/* Word vectors.  A "word vector" is a sorted array of words, with count
   information attached to each word.  Typically, there would be one
   "word vector" associated with a document, or with a concept. */

/* A "word entry"; these are the elements of a "word vector" */
typedef struct _bow_we {
  int wi;
  int count;
  float weight;
} bow_we;

/* A "word vector", containing an array of words with their statistics.
   ENTRY is a zero-length trailing array (a GNU C extension). */
typedef struct _bow_wv {
  int num_entries;              /* the number of unique words in the vector */
  float normalizer;             /* multiply weights by this for normalizing */
  bow_we entry[0];
} bow_wv;

/* Create and return a new "word vector" from a file. */
bow_wv *bow_wv_new_from_text_fp (FILE *fp, const char *filename);

/* Create and return a new "word vector" from a string. */
bow_wv *bow_wv_new_from_text_string (char *the_string);

/* Create and return a new "word vector" from a document buffer LEX. */
bow_wv *bow_wv_new_from_lex (bow_lex *lex);

/* Create and return a new "word vector" that is the sum of all the
   "word vectors" in WV_ARRAY.  The second parameter, WV_ARRAY_LENGTH,
   is the number of "word vectors" in WV_ARRAY. */
bow_wv *bow_wv_add (bow_wv **wv_array, int wv_array_length);

/* Create and return a new "word vector" with uninitialized contents. */
bow_wv *bow_wv_new (int capacity);

/* Allocate and return a copy of WV */
bow_wv *bow_wv_copy (bow_wv *wv);

/* Return the number of word occurrences in the WV */
int bow_wv_word_count (bow_wv *wv);

/* Return a pointer to the "word entry" with index WI in "word vector" WV */
bow_we *bow_wv_entry_for_wi (bow_wv *wv, int wi);

/* Return the count entry of "word" with index WI in "word vector" WV */
int bow_wv_count_for_wi (bow_wv *wv, int wi);

/* Print "word vector" WV on stream FP in a human-readable format. */
void bow_wv_fprintf (FILE *fp, bow_wv *wv);

/* Print "word vector" WV to a string in a human-readable format. */
char *bow_wv_sprintf (bow_wv *wv, unsigned int max_size_for_string);

/* Print "word vector"'s actual words to a string. */
char *bow_wv_sprintf_words (bow_wv *wv, unsigned int max_size_for_string);

/* Assign a value to the "word vector's" NORMALIZER field, such that
   when all the weights in the vector are multiplied by the
   NORMALIZER, the Euclidean length of the vector will be one. */
void bow_wv_normalize_weights_by_vector_length (bow_wv *wv);

/* Assign a value to the "word vector's" NORMALIZER field, such that
   when all the weights in the vector are multiplied by the
   NORMALIZER, all the vector entries will sum to one. */
void bow_wv_normalize_weights_by_summing (bow_wv *wv);

/* Return the sum of the weight entries. */
float bow_wv_weight_sum (bow_wv *wv);

/* Return the number of bytes required for writing the "word vector" WV. */
int bow_wv_write_size (bow_wv *wv);

/* Write "word vector" WV to the stream FP. */
void bow_wv_write (bow_wv *wv, FILE *fp);

/* Return a new "word vector" read from a pointer into a data file, FP. */
bow_wv *bow_wv_new_from_data_fp (FILE *fp);

/* Free the memory held by the "word vector" WV. */
void bow_wv_free (bow_wv *wv);


/* Collections of "word vectors". */

/* An array that maps "document indices" to "word vectors".
   ENTRY is a zero-length trailing array (a GNU C extension). */
typedef struct _bow_di2wv {
  int length;
  int size;
  bow_wv *entry[0];
} bow_di2wv;


/* Documents */

/* We want a nice way of saying this is a training or test document, or do
   we ignore it for now. */
typedef enum {
  bow_doc_train,        /* Use this to calculate P(w|C) */
  bow_doc_test,         /* Classify these for test results */
  bow_doc_unlabeled,    /* the "unlabeled" docs in EM and active learning */
  bow_doc_untagged,     /* Not yet assigned a tag */
  bow_doc_validation,   /* docs used for a validation set */
  bow_doc_ignore,       /* docs left unused */
  bow_doc_pool,         /* the cotraining candidate pool */
  bow_doc_waiting       /* the "unlabeled" docs not used by cotraining yet */
} bow_doc_type;

/* Map the name string STR to the corresponding bow_doc_type value.
   Evaluates to -1 when STR matches none of the recognized names.
   There is no case for "untagged", so that string also yields -1.
   STR is evaluated once per comparison until a match is found, so it
   must not have side effects.  Uses strcmp(). */
#define bow_str2type(STR) \
((strcmp ((STR), "train") == 0) \
 ? bow_doc_train \
 : ((strcmp ((STR), "test") == 0) \
    ? bow_doc_test \
    : ((strcmp ((STR), "unlabeled") == 0) \
       ? bow_doc_unlabeled \
       : ((strcmp ((STR), "validation") == 0) \
          ? bow_doc_validation \
          : ((strcmp ((STR), "ignore") == 0) \
             ? bow_doc_ignore \
             : ((strcmp ((STR), "pool") == 0) \
                ? bow_doc_pool \
                : ((strcmp ((STR), "waiting") == 0) \
                   ? bow_doc_waiting \
                   : -1)))))))

/* Map the bow_doc_type value T to its name as a string literal.
   Evaluates to "UNKNOWN DOC TYPE" when T is none of the enumerators.
   T is evaluated once per comparison until a match is found, so it
   must not have side effects. */
#define bow_type2str(T) \
(((T) == bow_doc_train) \
 ? "train" \
 : (((T) == bow_doc_test) \
    ? "test" \
    : (((T) == bow_doc_unlabeled) \
       ? "unlabeled" \
       : (((T) == bow_doc_untagged) \
          ? "untagged" \
          : (((T) == bow_doc_validation) \
             ? "validation" \
             : (((T) == bow_doc_ignore) \
                ? "ignore" \
                : (((T) == bow_doc_pool) \
                   ? "pool" \
                   : (((T) == bow_doc_waiting) \
                      ? "waiting" \
                      : "UNKNOWN DOC TYPE"))))))))

/* A generic "document" entry, useful for setting document types.
   All other "document" entries should begin the same as this one. */
typedef struct _bow_doc {
  bow_doc_type type;
  int class;
  const char *filename;
} bow_doc;

/* These are defined in split.c */
int bow_doc_is_train (bow_doc *doc);
int bow_doc_is_test (bow_doc *doc);
int bow_doc_is_unlabeled (bow_doc *doc);
int bow_doc_is_untagged (bow_doc *doc);
int bow_doc_is_validation (bow_doc *doc);
int bow_doc_is_ignore (bow_doc *doc);
int bow_doc_is_pool (bow_doc *doc);
int bow_doc_is_waiting (bow_doc *doc);

/* A "document" entry useful for standard classification tasks.
   Its first three fields match those of bow_doc. */
typedef struct _bow_cdoc {
  bow_doc_type type;            /* Is this document part of the model to be
                                   built, a test document, or to be ignored */
  int class;                    /* A classification label. */
  const char *filename;         /* Where to find the original document */
  int word_count;               /* Total number of words in this document */
  float normalizer;             /* Multiply weights by this for normalizing */
  float prior;                  /* Prior probability of this class/doc */
  float *class_probs;           /* Probabilistic classification labels */
} bow_cdoc;

/* A convenient interface to bow_array that is specific to bow_cdoc. */
#define bow_cdocs bow_array
#define bow_cdocs_new(CAPACITY) bow_array_new (CAPACITY, sizeof (bow_cdoc), 0)
#define bow_cdocs_register_doc(CDOCS,CDOC) bow_array_append (CDOCS, CDOC)
#define bow_cdocs_di2doc(CDOCS, DI) bow_array_entry_at_index (CDOCS, DI)


/* Traversing directories to get filenames. */

/* A list of document names. */
/* xxx We might change this someday to allow for multiple documents
   per file, e.g. for "mbox" files containing many email messages. */
typedef struct _bow_doc_list {
  struct _bow_doc_list *next;
  char filename[0];             /* zero-length trailing array (GNU C
                                   extension) */
} bow_doc_list;

/* Return a non-zero value if the file FP contains mostly text. */
int bow_fp_is_text (FILE *fp);

/* Return a non-zero value if the char array BUF contains mostly text. */
int bow_str_is_text (char *buf);

/* bow_*_is_text() always returns `yes'.  This is useful for Japanese
   byte codes. */
extern int bow_is_text_always_yes;

/* Calls the function CALLBACK for each of the filenames encountered
   when recursively descending the directory named DIRNAME.  CALLBACK
   should be a pointer to function that takes a filename char-pointer,
   and a void-pointer as arguments and returns an integer.  Currently
   the return value is ignored, but it may be used in the future to
   cut short, causing bow_map_filenames_from_dir to return
   immediately.  The value CONTEXT will be passed as the second
   argument to the CALLBACK function; it provides you with a way to
   transfer context you may need inside the implementation of the
   callback function.  EXCLUDE_PATTERNS is currently ignored. */
int
bow_map_filenames_from_dir (int (*callback)(const char *filename,
                                            void *context),
                            void *context,
                            const char *dirname,
                            const char *exclude_patterns);

/* Calls the function CALLBACK for each of the files found in the
   database DIRNAME.  See bow_map_filenames_from_dir for more info */
int
bow_map_filenames_from_hdb (int (*callback)(const char *filename,
                                            char *data,
                                            void *context),
                            void *context,
                            const char *dirname,
                            const char *exclude_patterns);

/* Create a linked list of filenames, and append the file list pointed
   to by FL to it; return the new concatenated lists in *FL.  The
   function returns the total number of filenames.  When creating the
   list, look for files (and recursively descend directories) among
   those matching INCLUDE_PATTERNS, but don't include those matching
   EXCLUDE_PATTERNS; don't include files that aren't text files. */
/* xxx For now, this only works with a single directory name in
   INCLUDE_PATTERNS, and it ignores EXCLUDE_PATTERNS. */
int bow_doc_list_append (bow_doc_list **list,
                         const char *include_patterns,
                         const char *exclude_patterns);

/* Print the file list FL to the output stream FP. */
void bow_doc_list_fprintf (FILE *fp, bow_doc_list *fl);

/* Return the number of entries in the "docname list" DL. */
int bow_doc_list_length (bow_doc_list *dl);

/* Free the memory held by the file list FL. */
void bow_doc_list_free (bow_doc_list *fl);


/* A convenient interface to a specific instance of the above
   int/string mapping; this one is intended for all the documents
   encountered. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -