⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 libbow.h

📁 良好的代码实现
💻 H
📖 第 1 页 / 共 4 页
字号:
typedef struct _bow_wi2dvf {  int size;			/* the number of ENTRY's allocated */  int num_words;		/* number of non-NULL dv's in this wi2dvf */  FILE *fp;			/* where to get DVF's that aren't cached yet */  bow_dvf entry[0];		/* array of info about each word */} bow_wi2dvf;/* Create an empty `wi2dvf' */bow_wi2dvf *bow_wi2dvf_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_wi2dvf_default_capacity;/* Create a `wi2dvf' by reading data from file-pointer FP.  This   doesn't actually read in all the "document vectors"; it only reads   in the DVF information, and lazily loads the actually "document   vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_fp (FILE *fp);/* Create a `wi2dvf' by reading data from a file.  This doesn't actually    read in all the "document vectors"; it only reads in the DVF    information, and lazily loads the actually "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_file (const char *filename);/* Return the "document vector" corresponding to "word index" WI.  If   is hasn't been read already, this function will read the "document   vector" out of the file passed to bow_wi2dvf_new_from_data_file(). */bow_dv *bow_wi2dvf_dv (bow_wi2dvf *wi2dvf, int wi);/* Return a pointer to the BOW_DE for a particular word/document pair,    or return NULL if there is no entry for that pair. */bow_de *bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di);/* Read all the words from file pointer FP, and add them to the map   WI2DVF, such that they are associated with document index DI. */void bow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp);/* Add a "word vector" WV, associated with "document index" DI, to    the map WI2DVF. */ void bow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv);/* Write WI2DVF to file-pointer FP, in a machine-independent format.   This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi,					int di, int count, float weight);/* Remove the word with index WI from the vocabulary of the map WI2DVF */void bow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi);/* Temporarily hide the word with index WI from the vocabulary of the   map WI2DVF. The function BOW_WI2DVF_DV() will no longer see the entry   for this WI, but */void bow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi);/* Make visible all DVF's that were hidden with BOW_WI2DVF_HIDE_WI(). */void bow_wi2dvf_unhide_all_wi (bow_wi2dvf *wi2dvf);/* Write WI2DVF to file-pointer FP, in a machine-independent format.   This is the format expected by bow_wi2dvf_new_from_fp(). */void bow_wi2dvf_write (bow_wi2dvf *wi2dvf, FILE *fp);/* Write WI2DVF to a file, in a machine-independent format.  This   is the format expected by bow_wi2dvf_new_from_file(). */void bow_wi2dvf_write_data_file (bow_wi2dvf *wi2dvf, const char *filename);/* Compare two maps, and return 0 if they are equal.  This function was   written for debugging. */int bow_wi2dvf_compare (bow_wi2dvf *map1, bow_wi2dvf *map2);/* Print statistics about the WI2DVF map to STDOUT. */void bow_wi2dvf_print_stats (bow_wi2dvf *map);/* Free the memory held by the map WI2DVF. */void bow_wi2dvf_free (bow_wi2dvf *wi2dvf);#if 0typedef enum {  bow_method_tfidf_words,	/* TFIDF with DF=`word-count' */  bow_method_tfidf_log_words,	/* TFIDF with DF=`log-word-count' */  bow_method_tfidf_log_occur,	/* TFIDF with DF=`log-occurances' */  bow_method_tfidf_prtfidf,	/* Joachim's PrTFIDF */  bow_method_naivebayes,	/* Naive Bayes */  bow_method_prind,		/* Fuhr's Probabilistic Indexing */} bow_method;#endif/* If this is non-zero, use uniform class priors. */extern int bow_prind_uniform_priors;/* If this is non-zero, scale PrInd P(w|d) by information gain */extern int bow_prind_scale_by_infogain;/* If this is zero, do not normalize the PrInd classification scores. */extern int bow_prind_normalize_scores;/* A wrapper around a wi2dvf/cdocs combination. */typedef struct _bow_barrel {  struct _bow_method *method;	/* TFIDF, NaiveBayes, PrInd, or others. */  bow_array *cdocs;		/* The documents */  bow_wi2dvf *wi2dvf;		/* The matrix of words vs documents */  int is_vpc;			/* non-zero if each `document' is a `class' */} bow_barrel;/* An array of these is filled in by the method's scoring function. */typedef struct _bow_score {  int di;			/* The "document index" for this document */  float weight;			/* Its score */  const char *name;} bow_score;/* The parameters of weighting and scoring in barrel's. */typedef struct _bow_method {  /* String identifer for the method, used for archiving. */  const char *name;  /* Functions for implementing parts of the method. */  void (*set_weights)(bow_barrel *barrel);  void (*scale_weights)(bow_barrel *barrel, bow_barrel *doc_barrel);  void (*normalize_weights)(bow_barrel *barrel);  bow_barrel* (*vpc_with_weights)(bow_barrel *doc_barrel, 				  const char **classnames, int num_classes);  void (*vpc_set_priors)(bow_barrel *barrel, bow_barrel *doc_barrel);  int (*score)(bow_barrel *barrel, bow_wv *query_wv, 	       bow_score *scores, int num_scores, int loo_class);  void (*wv_set_weights)(bow_wv *wv);  void (*wv_normalize_weights)(bow_wv *wv);  /* Parameters of the method. */  void *params;} bow_method;/* Macros that make it easier to call the BOW_METHOD functions */#define bow_barrel_set_weights(BARREL)		\((*(BARREL)->method->set_weights)(BARREL))#define bow_barrel_scale_weights(BARREL, DOC_BARREL)		\if ((*(BARREL)->method->scale_weights))				\  ((*(BARREL)->method->scale_weights)(BARREL, DOC_BARREL))#define bow_barrel_normalize_weights(BARREL)		\if ((*(BARREL)->method->normalize_weights))		\  ((*(BARREL)->method->normalize_weights)(BARREL))#define bow_barrel_new_vpc_with_weights(BARREL, CLASSNAMES, NUMCLASSES) \((*(BARREL)->method->vpc_with_weights)(BARREL, CLASSNAMES, NUMCLASSES))#define bow_barrel_score(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS) \((*(BARREL)->method->score)(BARREL, QUERY_WV, SCORES, NUM_SCORES, LOO_CLASS))#define bow_wv_set_weights(WV,BARREL)		\if ((*(BARREL)->method->wv_set_weights))	\  ((*(BARREL)->method->wv_set_weights)(WV))#define bow_wv_normalize_weights(WV,BARREL)		\if (((*(BARREL)->method->wv_normalize_weights)))	\  ((*(BARREL)->method->wv_normalize_weights)(WV))#include <bow/tfidf.h>#include <bow/naivebayes.h>#include <bow/prind.h>#include <bow/kl.h>/* Associate method M with the string NAME, so the method structure can   be retrieved later with BOW_METHOD_AT_NAME(). */int bow_method_register_with_name (bow_method *m, const char *name);/* Return a pointer to a method structure that was previous registered    with string NAME using BOW_METHOD_REGISTER_WITH_NAME(). */bow_method *bow_method_at_name (const char *name);/* The mapping from names to BOW_METHOD's. */extern bow_sarray *bow_methods;#define bow_default_method_name "naivebayes"/* Create a new, empty `bow_barrel', with cdoc's of size ENTRY_SIZE   and cdoc free function FREE_FUNC.  WORD_CAPACITY and CLASS_CAPACITY   are just hints. */bow_barrel *bow_barrel_new (int word_capacity, 			    int class_capacity,			    int entry_size, void (*free_func)());/* Create a BARREL by indexing all the documents found when   recursively decending directory DIRNAME, but skip files matching   EXCEPTION_NAME. */int bow_barrel_add_from_text_dir (bow_barrel *barrel,				  const char *dirname, 				  const char *except_name, 				  int class);/* Add statistics about the document described by CDOC and WV to the   BARREL. */int bow_barrel_add_document (bow_barrel *barrel, 			     bow_cdoc *cdoc, bow_wv *wv);/* Call this on a vector-per-document barrel to set the CDOC->PRIOR's   so that the CDOC->PRIOR's for all documents of the same class sum   to 1. */void bow_barrel_set_cdoc_priors_to_class_uniform (bow_barrel *barrel);/* Given a barrel of documents, create and return another barrel with   only one vector per class. The classes will be represented as   "documents" in this new barrel.  CLASSNAMES is an array of strings   that maps class indices to class names. */bow_barrel *bow_barrel_new_vpc (bow_barrel *barrel, 				const char **classnames, int num_classes);/* Like bow_barrel_new_vpc(), but it also sets and normalizes the   weights appropriately by calling SET_WEIGHTS from the METHOD of   DOC_BARREL on the `vector-per-class' barrel that will be returned. */bow_barrel *bow_barrel_new_vpc_merge_then_weight (bow_barrel *doc_barrel, 				      const char **classnames, 				      int num_classes);/* Same as above, but set the weights in the DOC_BARREL, create the   `Vector-Per-Class' barrel, and set the weights in the VPC barrel by   summing weights from the DOC_BARREL. */bow_barrel *bow_barrel_new_vpc_weight_then_merge (bow_barrel *doc_barrel, 				      const char **classnames,				      int num_classes);/* Set the class prior probabilities by counting the number of   documents of each class. */void bow_barrel_set_vpc_priors_by_counting (bow_barrel *vpc_barrel,					    bow_barrel *doc_barrel);/* Assign the values of the "word vector entry's" WEIGHT field   equal to the COUNT times the word's IDF, taken from the BARREL. */void bow_wv_set_weights_to_count_times_idf (bow_wv *wv, bow_barrel *barrel);/* Multiply each weight by the Quinlan `Foilgain' of that word. */void bow_barrel_scale_weights_by_foilgain (bow_barrel *barrel,					   bow_barrel *doc_barrel);/* Multiply each weight by the information gain of that word. */void bow_barrel_scale_weights_by_infogain (bow_barrel *barrel,					   bow_barrel *doc_barrel);/* Modify the BARREL by removing those entries for words that are not   in the int/str mapping MAP. */void bow_barrel_prune_words_not_in_map (bow_barrel *barrel,					bow_int4str *map);/* Modify the BARREL by removing those entries for words that are not   among the NUM_WORDS_TO_KEEP top words, by information gain.  This   function is similar to BOW_WORDS_KEEP_TOP_BY_INFOGAIN(), but this   one doesn't change the word-int mapping. */void bow_barrel_keep_top_words_by_infogain (int num_words_to_keep, 					    bow_barrel *barrel,					    int num_classes);/* Write BARREL to the file-pointer FP in a machine independent format. */void bow_barrel_write (bow_barrel *barrel, FILE *fp);/* Create and return a `barrel' by reading data from the file-pointer FP. */bow_barrel *bow_barrel_new_from_data_fp (FILE *fp);/* Print barrel to FP in human-readable and awk-accessible format. */void bow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format);/* Print on stdout the number of times WORD occurs in the various   docs/classes of BARREL. */void bow_barrel_print_word_count (bow_barrel *barrel, const char *word);/* Free the memory held by BARREL. */void bow_barrel_free (bow_barrel *barrel);/* Parsing headers from email messages. *//* xxx Eventually all these will be replaced by use of a regular   expression library. *//* Read in BUF the characters inside the `<>' of the `Message-Id:'   field of the email message contain in the file pointer FP.  Return   the number of characters placed in BUF.  Signal an error if more   than BUFLEN characters are necessary.  Return -1 if no matching   field is found. */int bow_email_get_msgid (FILE *fp, char *buf, int buflen);/* Read in BUF the characters between the `Received: from ' and the   following space, and the characters between the ` id ' and the   following `;' in the file pointer FP.  Return the number of   characters placed in BUF.  Signal an error if more than BUFLEN   characters are necessary.  Return -1 if no matching field is   found. */int bow_email_get_receivedid (FILE *fp, char *buf, int buflen);/* Read in BUF the characters inside the `<>' of the `In-Reply-To:'   field of the email message contain in the file pointer FP.  Return   the number of characters placed in BUF.  Signal an error if more than   than BUFLEN characters are necessary.  Return -1 if no matching   field is found. */int bow_email_get_replyid (FILE *fp, char *buf, int buflen);/* Read in BUF the characters inside the `<>' of the `References:'   field of the news message contain in the file pointer FP.  Return   the number of characters placed in BUF.  Signal an error if more   than BUFLEN characters are necessary.  Return -1 if no matching   field is found. */int bow_email_get_references (FILE *fp, char *buf, int buflen);/* Read in BUF the characters inside the `<>' of the   `Resent-Message-Id:' field of the email message contain in the file   pointer FP.  Return the number of characters placed in BUF.  Signal   an error if more than BUFLEN characters are necessary.  Return -1   if no matching field is found. */int bow_email_get_resent_msgid (FILE *fp, char *buf, int buflen);/* Read into BUF the characters inside the `<>' of the `From:' field   of the email message contain in the file pointer FP.  Return the   number of characters placed in BUF.  Signal an error if more than   BUFLEN characters are necessary.  Return -1 if no matching field is   found. */int bow_email_get_sender (FILE *fp, char *buf, int buflen);/* Read into BUF the characters inside the `<>' of the `To:' field   of the email message contain in the file pointer FP.  Return the   number of characters placed in BUF.  Signal an error if more than   BUFLEN characters are necessary.  Return -1 if no matching field is   found. */int bow_email_get_recipient (FILE *fp, char *buf, int buflen);/* Read into BUF the day, month and year of the `Date:' field of the   email message contain in the file pointer FP.  The format is   something like `21 Jul 1996'.  Return the number of characters   placed in BUF.  Signal an error if more than BUFLEN characters are   necessary.  Return -1 if no matching field is found. */int bow_email_get_date (FILE *fp, char *buf, int buflen);/* Progress and error reporting. */enum bow_verbosity_levels {  bow_silent = 0,		/* never print anything */  bow_quiet,			/* only warnings and errors */  bow_progress,			/* minimal # lines to show progress, use \b */  bow_verbose,			/* give more status info */  bow_chatty,			/* stuff most users wouldn't care about */  bow_screaming			/* every little nit */};/* Examined by bow_verbosify() to determine whether to print the message.   Default is bow_progress. */extern int bow_verbosity_level;/* If this is 0, and the message passed to bow_verbosify() contains   backspaces, then the message will not be printed.  It is useful to   turn this off when debugging inside an emacs window.  The default   value is on. */extern int bow_verbosity_use_backspace;/* Print the printf-style FORMAT string and ARGS on STDERR, only if   the BOW_VERBOSITY_LEVEL is equal or greater than the argument    VERBOSITY_LEVEL. */int bow_verbosify (int verbosity_level, const char *format, ...);/* Print the printf-style FORMAT string and ARGS on STDERR, and abort.   This function appends a newline to the printed message. */#define bow_error(FORMAT, ARGS...)			\({if (bow_verbosity_level > bow_silent)			\  {							\    fprintf (stderr, "%s: ", __PRETTY_FUNCTION__);	\    _bow_error (FORMAT , ## ARGS);			\  }							\ else							\  {							\    abort ();						\  }}) volatile void _bow_error (const char *format, ...);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -