📄 libbow.h
字号:
/* Given a "word index" WI, return its WORD, according to the global word-int mapping. */const char *bow_int2docname (int wi);/* Given a WORD, return its "word index", WI, according to the global word-int mapping; if it's not yet in the mapping, add it. */int bow_docname2int (const char *word);/* Return the total number of unique words in the int/word map. */int bow_num_docnames ();/* Save the docname map to file-pointer FP. */void bow_docnames_write (FILE *fp);/* Read the docname from file-pointer FP. */void bow_docnames_read_from_fp (FILE *fp);/* xxx Perhaps the name should be changed from "dv" to "cv", for "class vector", or "concept vector", or "codebook vector". *//* Document vectors. A "document vector" is a sorted array of documents, with count information attached to each document. Typically, there would be one "document vector" associated with a word. If "word vectors" are the rows of a large matrix, "document vectors" are the columns. It can be more efficient to search just the docment vectors of the words in the query document, than it is to search the word vectors of all documents. *//* A "document entry"; these are the elements of a "document vector". */typedef struct _bow_de { int di; /* a "document index" */ int count; /* number of times X appears in document DI */ float weight;} bow_de;/* A "document vector" */ typedef struct _bow_dv { int length; /* xxx Rename this to num_entries */ int size; float idf; /* The idf factor for this word. */ bow_de entry[0];} bow_dv;/* Create a new, empty "document vector". */bow_dv *bow_dv_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_dv_default_capacity; /* Add a new entry to the "document vector" *DV. */void bow_dv_add_di_count_weight (bow_dv **dv, int di, int count, float weight);/* Set the count & weight of the "document vector" *DV. */void bow_dv_set_di_count_weight (bow_dv **dv, int di, int count, float weight);/* Sum the WEIGHT into the document vector DV at document index DI, creating a new entry in the document vector if necessary. */void bow_dv_add_di_weight (bow_dv **dv, int di, float weight);/* Return a pointer to the BOW_DE for a particular document, or return NULL if there is no entry for that document. */bow_de *bow_dv_entry_at_di (bow_dv *dv, int di);/* Write "document vector" DV to the stream FP. */void bow_dv_write (bow_dv *dv, FILE *fp);/* Return the number of bytes required for writing the "document vector" DV. */int bow_dv_write_size (bow_dv *dv);/* Return a new "document vector" read from a pointer into a data file, FP. */bow_dv *bow_dv_new_from_data_fp (FILE *fp);/* Free the memory held by the "document vector" DV. */void bow_dv_free (bow_dv *dv);/* A "document vector with file info (file storage information)" */typedef struct _bow_dvf { int seek_start; bow_dv *dv;} bow_dvf;/* xxx Perhaps these should be generalized and renamed to `bow_i2v'? *//* An array that maps "word indices" to "document vectors with file info" */typedef struct _bow_wi2dvf { int size; /* the number of ENTRY's allocated */ int num_words; /* number of non-NULL dv's in this wi2dvf */ FILE *fp; /* where to get DVF's that aren't cached yet */ bow_dvf entry[0]; /* array of info about each word */} bow_wi2dvf;/* Create an empty `wi2dvf' */bow_wi2dvf *bow_wi2dvf_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_wi2dvf_default_capacity;/* Create a `wi2dvf' by reading data from file-pointer FP. This doesn't actually read in all the "document vectors"; it only reads in the DVF information, and lazily loads the actual "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_fp (FILE *fp);/* Create a `wi2dvf' by reading data from a file. This doesn't actually read in all the "document vectors"; it only reads in the DVF information, and lazily loads the actually "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_file (const char *filename);/* Return the "document vector" corresponding to "word index" WI. If is hasn't been read already, this function will read the "document vector" out of the file passed to bow_wi2dvf_new_from_data_file(). If the DV has been "hidden" (by feature selection, for example) it will return NULL.*/bow_dv *bow_wi2dvf_dv (bow_wi2dvf *wi2dvf, int wi);/* Return the "document vector" corresponding to "word index" WI. This function will read the "document vector" out of the file passed to bow_wi2dvf_new_from_file() if is hasn't been read already. If the DV has been "hidden" (by feature selection, for example) it will not be returned unless EVEN_IF_HIDDEN is non-zero. */bow_dv *bow_wi2dvf_dv_hidden (bow_wi2dvf *wi2dvf, int wi, int even_if_hidden);/* Return a pointer to the BOW_DE for a particular word/document pair, or return NULL if there is no entry for that pair. */bow_de *bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di);/* Read all the words from file pointer FP, and add them to the map WI2DVF, such that they are associated with document index DI. */int bow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp, const char *filename);/* Read all the words from file pointer FP, and add them to the map WI2DVF, such that they are associated with document index DI. */int bow_wi2dvf_add_di_text_str (bow_wi2dvf **wi2dvf, int di, char *data, const char *filename);/* Add a "word vector" WV, associated with "document index" DI, to the map WI2DVF. */ void bow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv);/* Write WI2DVF to file-pointer FP, in a machine-independent format. This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi, int di, int count, float weight);/* set the count and weight of the appropriate entry in the wi2dvf */void bow_wi2dvf_set_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi, int di, int count, float weight);/* Remove the word with index WI from the vocabulary of the map WI2DVF */void bow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi);/* Temporarily hide the word with index WI from the vocabulary of the map WI2DVF. The function BOW_WI2DVF_DV() will no longer see the entry for this WI, but */void bow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi);/* hide all the words that exist */void bow_wi2dvf_hide_all_wi (bow_wi2dvf *wi2dvf);/* unhide a specific word index */void bow_wi2dvf_unhide_wi (bow_wi2dvf *wi2dvf, int wi);/* Hide all words occuring in only COUNT or fewer number of documents. Return the number of words hidden. */int bow_wi2dvf_hide_words_by_doc_count (bow_wi2dvf *wi2dvf, int count);/* Hide all words occuring in only COUNT or fewer times. Return the number of words hidden. */int bow_wi2dvf_hide_words_by_occur_count (bow_wi2dvf *wi2dvf, int count);/* hide all words where the prefix of the word matches the given prefix */int bow_wi2dvf_hide_words_with_prefix (bow_wi2dvf *wi2dvf, char *prefix);/* hide all words where the prefix of the word doesn't match the given prefix */int bow_wi2dvf_hide_words_without_prefix (bow_wi2dvf *wi2dvf, char *prefix);/* Make visible all DVF's that were hidden with BOW_WI2DVF_HIDE_WI(). */void bow_wi2dvf_unhide_all_wi (bow_wi2dvf *wi2dvf);/* Set the WI2DVF->ENTRY[WI].IDF to the sum of the COUNTS for the given WI. */void bow_wi2dvf_set_idf_to_count (bow_wi2dvf *wi2dvf);/* Write WI2DVF to file-pointer FP, in a machine-independent format. This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_write (bow_wi2dvf *wi2dvf, FILE *fp);/* Write WI2DVF to a file, in a machine-independent format. This is the format expected by bow_wi2dvf_new_from_file(). */void bow_wi2dvf_write_data_file (bow_wi2dvf *wi2dvf, const char *filename);/* Compare two maps, and return 0 if they are equal. This function was written for debugging. */int bow_wi2dvf_compare (bow_wi2dvf *map1, bow_wi2dvf *map2);/* Print statistics about the WI2DVF map to STDOUT. */void bow_wi2dvf_print_stats (bow_wi2dvf *map);/* Free the memory held by the map WI2DVF. */void bow_wi2dvf_free (bow_wi2dvf *wi2dvf);/* Remove words that don't occur in WI2DVF */void bow_wv_prune_words_not_in_wi2dvf (bow_wv *wv, bow_wi2dvf *wi2dvf);/* xxx Move these to prind.c *//* If this is non-zero, use uniform class priors. */extern int bow_prind_uniform_priors;/* If this is non-zero, scale PrInd P(w|d) by information gain */extern int bow_prind_scale_by_infogain;/* If this is zero, do not normalize the PrInd classification scores. */extern int bow_prind_normalize_scores;typedef enum { bow_smoothing_goodturing, bow_smoothing_laplace, bow_smoothing_mestimate, bow_smoothing_wittenbell, bow_smoothing_dirichlet} bow_smoothing;/* A wrapper around a wi2dvf/cdocs combination. */typedef struct _bow_barrel { struct _rainbow_method *method; /* TFIDF, NaiveBayes, PrInd, others. */ bow_array *cdocs; /* The documents (or classes, for VPC) */ bow_wi2dvf *wi2dvf; /* The matrix of words vs documents */ bow_int4str *classnames; /* A map between classnames and indices */ int is_vpc; /* non-zero if each `document' is a `class' */} bow_barrel;/* An array of these is filled in by the method's scoring function. */typedef struct _bow_score { int di; /* The "document index" for this document */ double weight; /* Its score */ const char *name;} bow_score;typedef struct _bow_method { /* String identifer for the method, used for selection. */ const char *name;} bow_method;/* The parameters of weighting and scoring in barrel's. */typedef struct _rainbow_method { /* String identifer for the method, used for selection. */ const char *name; /* Functions for implementing parts of the method. */ void (*set_weights)(bow_barrel *barrel); void (*scale_weights)(bow_barrel *barrel, bow_barrel *doc_barrel); void (*normalize_weights)(bow_barrel *barrel); bow_barrel* (*vpc_with_weights)(bow_barrel *doc_barrel); void (*vpc_set_priors)(bow_barrel *barrel, bow_barrel *doc_barrel); int (*score)(bow_barrel *barrel, bow_wv *query_wv, bow_score *scores, int num_scores, int loo_class); void (*wv_set_weights)(bow_wv *wv, bow_barrel *barrel); void (*wv_normalize_weights)(bow_wv *wv); void (*free_barrel)(bow_barrel *barrel); /* Parameters of the method. */ void *params;} rainbow_method;/* Macros that make it easier to call the RAINBOW_METHOD functions */#define bow_barrel_set_weights(BARREL) \if ((*(BARREL)->method->set_weights)) \ ((*(BARREL)->method->set_weights)(BARREL))#define bow_barrel_scale_weights(BARREL, DOC_BARREL) \if ((*(BARREL)->method->scale_weights)) \ ((*(BARREL)->method->scale_weights)(BARREL, DOC_BARREL))#define bow_barrel_normalize_weights(BARREL) \if ((*(BARREL)->method->normalize_weights)) \ ((*(BARREL)->method->normalize_weights)(BARREL))#define bow_barrel_new_vpc_with_weights(BARREL) \((*(BARREL)->method->vpc_with_weights)(BARREL))#define bow_barrel_score(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS) \((*(BARREL)->method->score)(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS))#define bow_wv_set_weights(WV,BARREL) \if ((*(BARREL)->method->wv_set_weights)) \ ((*(BARREL)->method->wv_set_weights)(WV, BARREL))#define bow_wv_normalize_weights(WV,BARREL) \if (((*(BARREL)->method->wv_normalize_weights))) \ ((*(BARREL)->method->wv_normalize_weights)(WV))#define bow_free_barrel(BARREL) \((*(BARREL)->method->free_barrel)(BARREL))#define bow_barrel_num_classes(BARREL) \(((BARREL)->classnames) \ ? ((BARREL)->classnames->str_array_length) \ : ((BARREL)->cdocs->length))#define bow_barrel_classname_at_index(BARREL, INDEX) \(bow_int2str ((BARREL)->classnames, INDEX))#define bow_barrel_add_classname(BARREL, NAME) \(bow_str2int ((BARREL)->classnames, NAME))#include <bow/tfidf.h>#include <bow/naivebayes.h>#include <bow/prind.h>#include <bow/kl.h>#include <bow/em.h>#include <bow/knn.h>struct argp_child; /* forward declare this type *//* Associate method M with the string NAME, so the method structure can be retrieved later with BOW_METHOD_AT_NAME(). Set the group number of the CHILD so the command-line options for this option will appear separately. If there is no argp_child for this method, pass NULL for CHILD. */int bow_method_register_with_name (bow_method *m, const char *name, int size, struct argp_child *child);/* Return a pointer to a method structure that was previous registered with string NAME using BOW_METHOD_REGISTER_WITH_NAME(). */bow_method *bow_method_at_name (const char *name);/* The mapping from names to BOW_METHOD's. */extern bow_sarray *bow_methods;#define bow_default_method_name "naivebayes"/* Create a new, empty `bow_barrel', with cdoc's of size ENTRY_SIZE and cdoc free function FREE_FUNC. WORD_CAPACITY and CLASS_CAPACITY are just hints. */bow_barrel *bow_barrel_new (int word_capacity, int class_capacity, int entry_size, void (*free_func)());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -