⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 libbow.h

📁 机器学习作者tom mitchell的书上代码
💻 H
📖 第 1 页 / 共 5 页
字号:
/* Given a "word index" WI, return its WORD, according to the global   word-int mapping. */const char *bow_int2docname (int wi);/* Given a WORD, return its "word index", WI, according to the global   word-int mapping; if it's not yet in the mapping, add it. */int bow_docname2int (const char *word);/* Return the total number of unique words in the int/word map. */int bow_num_docnames ();/* Save the docname map to file-pointer FP. */void bow_docnames_write (FILE *fp);/* Read the docname from file-pointer FP. */void bow_docnames_read_from_fp (FILE *fp);/* xxx Perhaps the name should be changed from "dv" to "cv", for   "class vector", or "concept vector", or "codebook vector". *//* Document vectors.  A "document vector" is a sorted array of   documents, with count information attached to each document.   Typically, there would be one "document vector" associated with a   word.  If "word vectors" are the rows of a large matrix, "document   vectors" are the columns.  It can be more efficient to search just   the docment vectors of the words in the query document, than it is   to search the word vectors of all documents. *//* A "document entry"; these are the elements of a "document vector". */typedef struct _bow_de {  int di;			/* a "document index" */  int count;			/* number of times X appears in document DI */  float weight;} bow_de;/* A "document vector" */ typedef struct _bow_dv {  int length;			/* xxx Rename this to num_entries */  int size;  float idf;                    /* The idf factor for this word. */  bow_de entry[0];} bow_dv;/* Create a new, empty "document vector". */bow_dv *bow_dv_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_dv_default_capacity; /* Add a new entry to the "document vector" *DV. */void bow_dv_add_di_count_weight (bow_dv **dv, int di, int count, float weight);/* Set the count & weight of the "document vector" *DV. */void bow_dv_set_di_count_weight (bow_dv **dv, int di, int count, float weight);/* Sum the WEIGHT into the document vector DV at document index DI,   creating a new entry in the document vector if necessary. */void bow_dv_add_di_weight (bow_dv **dv, int di, float weight);/* Return a pointer to the BOW_DE for a particular document, or return   NULL if there is no entry for that document. */bow_de *bow_dv_entry_at_di (bow_dv *dv, int di);/* Write "document vector" DV to the stream FP. */void bow_dv_write (bow_dv *dv, FILE *fp);/* Return the number of bytes required for writing the "document vector" DV. */int bow_dv_write_size (bow_dv *dv);/* Return a new "document vector" read from a pointer into a data file, FP. */bow_dv *bow_dv_new_from_data_fp (FILE *fp);/* Free the memory held by the "document vector" DV. */void bow_dv_free (bow_dv *dv);/* A "document vector with file info (file storage information)" */typedef struct _bow_dvf {  int seek_start;  bow_dv *dv;} bow_dvf;/* xxx Perhaps these should be generalized and renamed to `bow_i2v'? *//* An array that maps "word indices" to "document vectors with file info" */typedef struct _bow_wi2dvf {  int size;			/* the number of ENTRY's allocated */  int num_words;		/* number of non-NULL dv's in this wi2dvf */  FILE *fp;			/* where to get DVF's that aren't cached yet */  bow_dvf entry[0];		/* array of info about each word */} bow_wi2dvf;/* Create an empty `wi2dvf' */bow_wi2dvf *bow_wi2dvf_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_wi2dvf_default_capacity;/* Create a `wi2dvf' by reading data from file-pointer FP.  This   doesn't actually read in all the "document vectors"; it only reads   in the DVF information, and lazily loads the actual "document   vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_fp (FILE *fp);/* Create a `wi2dvf' by reading data from a file.  This doesn't actually    read in all the "document vectors"; it only reads in the DVF    information, and lazily loads the actually "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_file (const char *filename);/* Return the "document vector" corresponding to "word index" WI.  If   is hasn't been read already, this function will read the "document   vector" out of the file passed to bow_wi2dvf_new_from_data_file().   If the DV has been "hidden" (by feature selection, for example) it   will return NULL.*/bow_dv *bow_wi2dvf_dv (bow_wi2dvf *wi2dvf, int wi);/* Return the "document vector" corresponding to "word index" WI.  This   function will read the "document vector" out of the file passed to   bow_wi2dvf_new_from_file() if is hasn't been read already.  If the    DV has been "hidden" (by feature selection, for example) it will not   be returned unless EVEN_IF_HIDDEN is non-zero. */bow_dv *bow_wi2dvf_dv_hidden (bow_wi2dvf *wi2dvf, int wi, int even_if_hidden);/* Return a pointer to the BOW_DE for a particular word/document pair,    or return NULL if there is no entry for that pair. */bow_de *bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di);/* Read all the words from file pointer FP, and add them to the map   WI2DVF, such that they are associated with document index DI. */int bow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp,			       const char *filename);/* Read all the words from file pointer FP, and add them to the map   WI2DVF, such that they are associated with document index DI. */int bow_wi2dvf_add_di_text_str (bow_wi2dvf **wi2dvf, int di, char *data,				const char *filename);/* Add a "word vector" WV, associated with "document index" DI, to    the map WI2DVF. */ void bow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv);/* Write WI2DVF to file-pointer FP, in a machine-independent format.   This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi,					int di, int count, float weight);/* set the count and weight of the appropriate entry in the wi2dvf */void bow_wi2dvf_set_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi,					int di, int count, float weight);/* Remove the word with index WI from the vocabulary of the map WI2DVF */void bow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi);/* Temporarily hide the word with index WI from the vocabulary of the   map WI2DVF. The function BOW_WI2DVF_DV() will no longer see the entry   for this WI, but */void bow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi);/* hide all the words that exist */void bow_wi2dvf_hide_all_wi (bow_wi2dvf *wi2dvf);/* unhide a specific word index */void bow_wi2dvf_unhide_wi (bow_wi2dvf *wi2dvf, int wi);/* Hide all words occuring in only COUNT or fewer number of documents.   Return the number of words hidden. */int bow_wi2dvf_hide_words_by_doc_count (bow_wi2dvf *wi2dvf, int count);/* Hide all words occuring in only COUNT or fewer times.   Return the number of words hidden. */int bow_wi2dvf_hide_words_by_occur_count (bow_wi2dvf *wi2dvf, int count);/* hide all words where the prefix of the word matches the given   prefix */int bow_wi2dvf_hide_words_with_prefix (bow_wi2dvf *wi2dvf, char *prefix);/* hide all words where the prefix of the word doesn't match the given   prefix */int bow_wi2dvf_hide_words_without_prefix (bow_wi2dvf *wi2dvf, char *prefix);/* Make visible all DVF's that were hidden with BOW_WI2DVF_HIDE_WI(). */void bow_wi2dvf_unhide_all_wi (bow_wi2dvf *wi2dvf);/* Set the WI2DVF->ENTRY[WI].IDF to the sum of the COUNTS for the   given WI. */void bow_wi2dvf_set_idf_to_count (bow_wi2dvf *wi2dvf);/* Write WI2DVF to file-pointer FP, in a machine-independent format.   This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_write (bow_wi2dvf *wi2dvf, FILE *fp);/* Write WI2DVF to a file, in a machine-independent format.  This   is the format expected by bow_wi2dvf_new_from_file(). */void bow_wi2dvf_write_data_file (bow_wi2dvf *wi2dvf, const char *filename);/* Compare two maps, and return 0 if they are equal.  This function was   written for debugging. */int bow_wi2dvf_compare (bow_wi2dvf *map1, bow_wi2dvf *map2);/* Print statistics about the WI2DVF map to STDOUT. */void bow_wi2dvf_print_stats (bow_wi2dvf *map);/* Free the memory held by the map WI2DVF. */void bow_wi2dvf_free (bow_wi2dvf *wi2dvf);/* Remove words that don't occur in WI2DVF */void bow_wv_prune_words_not_in_wi2dvf (bow_wv *wv, bow_wi2dvf *wi2dvf);/* xxx Move these to prind.c *//* If this is non-zero, use uniform class priors. */extern int bow_prind_uniform_priors;/* If this is non-zero, scale PrInd P(w|d) by information gain */extern int bow_prind_scale_by_infogain;/* If this is zero, do not normalize the PrInd classification scores. */extern int bow_prind_normalize_scores;typedef enum {  bow_smoothing_goodturing,  bow_smoothing_laplace,  bow_smoothing_mestimate,  bow_smoothing_wittenbell,  bow_smoothing_dirichlet} bow_smoothing;/* A wrapper around a wi2dvf/cdocs combination. */typedef struct _bow_barrel {  struct _rainbow_method *method; /* TFIDF, NaiveBayes, PrInd, others. */  bow_array *cdocs;		/* The documents (or classes, for VPC) */  bow_wi2dvf *wi2dvf;		/* The matrix of words vs documents */  bow_int4str *classnames;	/* A map between classnames and indices */  int is_vpc;			/* non-zero if each `document' is a `class' */} bow_barrel;/* An array of these is filled in by the method's scoring function. */typedef struct _bow_score {  int di;			/* The "document index" for this document */  double weight;		/* Its score */  const char *name;} bow_score;typedef struct _bow_method {  /* String identifer for the method, used for selection. */  const char *name;} bow_method;/* The parameters of weighting and scoring in barrel's. */typedef struct _rainbow_method {  /* String identifer for the method, used for selection. */  const char *name;  /* Functions for implementing parts of the method. */  void (*set_weights)(bow_barrel *barrel);  void (*scale_weights)(bow_barrel *barrel, bow_barrel *doc_barrel);  void (*normalize_weights)(bow_barrel *barrel);  bow_barrel* (*vpc_with_weights)(bow_barrel *doc_barrel);  void (*vpc_set_priors)(bow_barrel *barrel, bow_barrel *doc_barrel);  int (*score)(bow_barrel *barrel, bow_wv *query_wv, 	       bow_score *scores, int num_scores, int loo_class);  void (*wv_set_weights)(bow_wv *wv, bow_barrel *barrel);  void (*wv_normalize_weights)(bow_wv *wv);  void (*free_barrel)(bow_barrel *barrel);  /* Parameters of the method. */  void *params;} rainbow_method;/* Macros that make it easier to call the RAINBOW_METHOD functions */#define bow_barrel_set_weights(BARREL)		\if ((*(BARREL)->method->set_weights))           \  ((*(BARREL)->method->set_weights)(BARREL))#define bow_barrel_scale_weights(BARREL, DOC_BARREL)		\if ((*(BARREL)->method->scale_weights))				\  ((*(BARREL)->method->scale_weights)(BARREL, DOC_BARREL))#define bow_barrel_normalize_weights(BARREL)		\if ((*(BARREL)->method->normalize_weights))		\  ((*(BARREL)->method->normalize_weights)(BARREL))#define bow_barrel_new_vpc_with_weights(BARREL) \((*(BARREL)->method->vpc_with_weights)(BARREL))#define bow_barrel_score(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS) \((*(BARREL)->method->score)(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS))#define bow_wv_set_weights(WV,BARREL)		\if ((*(BARREL)->method->wv_set_weights))	\  ((*(BARREL)->method->wv_set_weights)(WV, BARREL))#define bow_wv_normalize_weights(WV,BARREL)		\if (((*(BARREL)->method->wv_normalize_weights)))	\  ((*(BARREL)->method->wv_normalize_weights)(WV))#define bow_free_barrel(BARREL)			\((*(BARREL)->method->free_barrel)(BARREL))#define bow_barrel_num_classes(BARREL)		\(((BARREL)->classnames)				\ ? ((BARREL)->classnames->str_array_length)	\ : ((BARREL)->cdocs->length))#define bow_barrel_classname_at_index(BARREL, INDEX) \(bow_int2str ((BARREL)->classnames, INDEX))#define bow_barrel_add_classname(BARREL, NAME) \(bow_str2int ((BARREL)->classnames, NAME))#include <bow/tfidf.h>#include <bow/naivebayes.h>#include <bow/prind.h>#include <bow/kl.h>#include <bow/em.h>#include <bow/knn.h>struct argp_child;		/* forward declare this type *//* Associate method M with the string NAME, so the method structure   can be retrieved later with BOW_METHOD_AT_NAME().  Set the group   number of the CHILD so the command-line options for this option   will appear separately.  If there is no argp_child for this    method, pass NULL for CHILD. */int bow_method_register_with_name (bow_method *m, const char *name, int size,				   struct argp_child *child);/* Return a pointer to a method structure that was previous registered    with string NAME using BOW_METHOD_REGISTER_WITH_NAME(). */bow_method *bow_method_at_name (const char *name);/* The mapping from names to BOW_METHOD's. */extern bow_sarray *bow_methods;#define bow_default_method_name "naivebayes"/* Create a new, empty `bow_barrel', with cdoc's of size ENTRY_SIZE   and cdoc free function FREE_FUNC.  WORD_CAPACITY and CLASS_CAPACITY   are just hints. */bow_barrel *bow_barrel_new (int word_capacity, 			    int class_capacity,			    int entry_size, void (*free_func)());

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -