📄 libbow.h

📁 机器学习作者tom mitchell的书上代码
💻 H
📖 第 1 页 / 共 5 页
字号:
void bow_words_remove_occurrences_less_than (int occur);

/* Return the total number of unique words in the int/word map. */
int bow_num_words (void);

/* Save the int/word map to file-pointer FP. */
void bow_words_write (FILE *fp);

/* Same as above, but with a filename instead of a FILE* */
void bow_words_write_to_file (const char *filename);

/* Read the int/word map from file-pointer FP. */
void bow_words_read_from_fp (FILE *fp);

/* Same as above, but with a filename instead of a FILE* */
void bow_words_read_from_file (const char *filename);

/* Same as above, but don't bother rereading unless filename is different
   from the last one, or FORCE_UPDATE is non-zero. */
void bow_words_reread_from_file (const char *filename, int force_update);

/* Read the int/word map (in incremental format) from file-pointer FP. */
void bow_words_read_from_fp_inc (FILE *fp);


/* Lists of words sorted by some score, for example, infogain */

/* An entry for a word and its score */
typedef struct _bow_ws {
  int wi;			/* the word's index */
  float weight;			/* the word's score */
} bow_ws;

/* An array of words and their scores. */
typedef struct _bow_wa {
  int size;			/* NOTE(review): presumably the allocated
				   capacity of ENTRY -- confirm */
  int length;			/* NOTE(review): presumably the number of
				   entries in use -- confirm */
  bow_ws *entry;
} bow_wa;

/* Create a new, empty array of word/score entries, with CAPACITY entries. */
bow_wa *bow_wa_new (int capacity);

/* Add a new word and score to the array */
int bow_wa_append (bow_wa *wa, int wi, float score);

/* Add a score to the array.  If there is already an entry for WI, the
   SCORE gets added to WI's current score.  If WI is not already in
   the array, then this function behaves like bow_wa_append(). */
int bow_wa_add (bow_wa *wa, int wi, float score);

/* Add a score to the array.  If there is already an entry for WI at
   the end, the SCORE gets added to WI's current score.  If WI is
   greater than the WI at the end, then this function behaves like
   bow_wa_append(), otherwise an error is raised. */
int bow_wa_add_to_end (bow_wa *wa, int wi, float score);

/* Remove the entry corresponding to word WI.  Return the new length
   of the word array. */
int bow_wa_remove (bow_wa *wa, int wi);

/* Add to WA all the WI/WEIGHT entries from WA2.  Uses bow_wa_add(). */
int bow_wa_union (bow_wa *wa, bow_wa *wa2);

/* Return a new array containing only WI entries that are in both
   WA1 and WA2. */
bow_wa *bow_wa_intersection (bow_wa *wa1, bow_wa *wa2);

/* Add weights to WA1 for those entries appearing in WA2 */
int bow_wa_overlay (bow_wa *wa1, bow_wa *wa2);

/* Return a new array containing only WI entries that are in WA1 but
   not in WA2. */
bow_wa *bow_wa_diff (bow_wa *wa1, bow_wa *wa2);

/* Sort the word array with high values first. */
void bow_wa_sort (bow_wa *wa);

/* Sort the word array with high values last. */
void bow_wa_sort_reverse (bow_wa *wa);

/* Print the first N entries of the word array WA to stream FP. */
void bow_wa_fprintf (bow_wa *wa, FILE *fp, int n);

/* Remove all entries from the word array */
void bow_wa_empty (bow_wa *wa);

/* Free the word array */
void bow_wa_free (bow_wa *wa);


/* Word vectors.  A "word vector" is a sorted array of words, with count
   information attached to each word.  Typically, there would be one
   "word vector" associated with a document, or with a concept. */

/* A "word entry"; these are the elements of a "word vector" */
typedef struct _bow_we {
  int wi;
  int count;
  float weight;
} bow_we;

/* A "word vector", containing an array of words with their statistics */
typedef struct _bow_wv {
  int num_entries;		/* the number of unique words in the vector */
  float normalizer;		/* multiply weights by this for normalizing */
  bow_we entry[0];
} bow_wv;

/* Create and return a new "word vector" from a file. */
bow_wv *bow_wv_new_from_text_fp (FILE *fp, const char *filename);

/* Create and return a new "word vector" from a string. */
bow_wv *bow_wv_new_from_text_string (char *the_string);

/* Create and return a new "word vector" from a document buffer LEX. */
bow_wv *bow_wv_new_from_lex (bow_lex *lex);

/* Create and return a new "word vector" that is the sum of all the
   "word vectors" in WV_ARRAY.  The second parameter, WV_ARRAY_LENGTH,
   is the number of "word vectors" in WV_ARRAY. */
bow_wv *bow_wv_add (bow_wv **wv_array, int wv_array_length);

/* Create and return a new "word vector" with uninitialized contents. */
bow_wv *bow_wv_new (int capacity);

/* Allocate and return a copy of WV */
bow_wv * bow_wv_copy (bow_wv *wv);

/* Return the number of word occurrences in the WV */
int bow_wv_word_count (bow_wv *wv);

/* Return a pointer to the "word entry" with index WI in "word vector" WV */
bow_we *bow_wv_entry_for_wi (bow_wv *wv, int wi);

/* Return the count entry of "word" with index WI in "word vector" WV */
int bow_wv_count_for_wi (bow_wv *wv, int wi);

/* Print "word vector" WV on stream FP in a human-readable format. */
void bow_wv_fprintf (FILE *fp, bow_wv *wv);

/* Print "word vector" WV to a string in a human-readable format. */
char *bow_wv_sprintf (bow_wv *wv, unsigned int max_size_for_string);

/* Print "word vector"'s actual words to a string. */
char *bow_wv_sprintf_words (bow_wv *wv, unsigned int max_size_for_string);

/* Assign a value to the "word vector's" NORMALIZER field, such that
   when all the weights in the vector are multiplied by the
   NORMALIZER, the Euclidean length of the vector will be one. */
void bow_wv_normalize_weights_by_vector_length (bow_wv *wv);

/* Assign a value to the "word vector's" NORMALIZER field, such that
   when all the weights in the vector are multiplied by the
   NORMALIZER, all the vector entries will sum to one. */
void bow_wv_normalize_weights_by_summing (bow_wv *wv);

/* Return the sum of the weight entries. */
float bow_wv_weight_sum (bow_wv *wv);

/* Return the number of bytes required for writing the "word vector" WV. */
int bow_wv_write_size (bow_wv *wv);

/* Write "word vector" WV to the stream FP. */
void bow_wv_write (bow_wv *wv, FILE *fp);

/* Return a new "word vector" read from a pointer into a data file, FP. */
bow_wv *bow_wv_new_from_data_fp (FILE *fp);

/* Free the memory held by the "word vector" WV. */
void bow_wv_free (bow_wv *wv);


/* Collections of "word vectors". */

/* An array that maps "document indices" to "word vectors" */
typedef struct _bow_di2wv {
  int length;
  int size;
  bow_wv *entry[0];
} bow_di2wv;


/* Documents */

/* We want a nice way of saying this is a training or test document, or do
   we ignore it for now. */
typedef enum {
  bow_doc_train,	/* Use this to calculate P(w|C) */
  bow_doc_test,		/* Classify these for test results */
  bow_doc_unlabeled,    /* the "unlabeled" docs in EM and active learning */
  bow_doc_untagged,	/* Not yet assigned a tag */
  bow_doc_validation,   /* docs used for a validation set */
  bow_doc_ignore,	/* docs left unused */
  bow_doc_pool,         /* the cotraining candidate pool */
  bow_doc_waiting       /* the "unlabeled" docs not used by cotraining yet */
} bow_doc_type;

/* Map the string STR to the matching bow_doc_type value; the expression
   evaluates to -1 when STR equals none of the names tested below.
   STR is evaluated more than once, so it must not have side effects.
   Code expanding this macro needs a declaration of strcmp() in scope.
   NOTE(review): "untagged" is not recognized here although
   bow_type2str() produces it -- confirm this is intentional. */
#define bow_str2type(STR)				\
((strcmp ((STR), "train") == 0)				\
 ? bow_doc_train					\
 : ((strcmp ((STR), "test") == 0)			\
    ? bow_doc_test					\
    : ((strcmp ((STR), "unlabeled") == 0)		\
       ? bow_doc_unlabeled				\
       : ((strcmp ((STR), "validation") == 0)		\
	  ? bow_doc_validation				\
	  : ((strcmp ((STR), "ignore") == 0)		\
	     ? bow_doc_ignore				\
	     : ((strcmp ((STR), "pool") == 0)		\
		? bow_doc_pool				\
		: ((strcmp ((STR), "waiting") == 0)	\
		   ? bow_doc_waiting			\
		   : -1)))))))

/* Map the bow_doc_type value T to its name as a string literal; any
   value not listed below yields "UNKNOWN DOC TYPE".  T is evaluated
   more than once, so it must not have side effects.  T is parenthesized
   at each use so that an expression argument parses as intended. */
#define bow_type2str(T)					\
(((T) == bow_doc_train)					\
 ? "train"						\
 : (((T) == bow_doc_test)				\
    ? "test"						\
    : (((T) == bow_doc_unlabeled)			\
       ? "unlabeled"					\
       : (((T) == bow_doc_untagged)			\
	  ? "untagged"					\
	  : (((T) == bow_doc_validation)		\
	     ? "validation"				\
	     : (((T) == bow_doc_ignore)			\
		? "ignore"				\
		: (((T) == bow_doc_pool)		\
		   ? "pool"				\
		   : (((T) == bow_doc_waiting)		\
		      ? "waiting"			\
		      : "UNKNOWN DOC TYPE"))))))))

/* A generic "document" entry, useful for setting document types.
   All other "document" entries should begin the same as this one. */
typedef struct _bow_doc {
  bow_doc_type type;
  int class;
  const char *filename;
} bow_doc;

/* These are defined in split.c */
int bow_doc_is_train (bow_doc *doc);
int bow_doc_is_test (bow_doc *doc);
int bow_doc_is_unlabeled (bow_doc *doc);
int bow_doc_is_untagged (bow_doc *doc);
int bow_doc_is_validation (bow_doc *doc);
int bow_doc_is_ignore (bow_doc *doc);
int bow_doc_is_pool (bow_doc *doc);
int bow_doc_is_waiting (bow_doc *doc);

/* A "document" entry useful for standard classification tasks.  Its
   first three fields mirror those of bow_doc. */
typedef struct _bow_cdoc {
  bow_doc_type type;		/* Is this document part of the model to be
				   built, a test document, or to be ignored */
  int class;			/* A classification label. */
  const char *filename;		/* Where to find the original document */
  int word_count;		/* Total number of words in this document */
  float normalizer;		/* Multiply weights by this for normalizing */
  float prior;			/* Prior probability of this class/doc */
  float *class_probs;           /* Probabilistic classification labels */
} bow_cdoc;

/* A convenient interface to bow_array that is specific to bow_cdoc. */
#define bow_cdocs bow_array
#define bow_cdocs_new(CAPACITY) bow_array_new (CAPACITY, sizeof (bow_cdoc), 0)
#define bow_cdocs_register_doc(CDOCS,CDOC) bow_array_append (CDOCS, CDOC)
#define bow_cdocs_di2doc(CDOCS, DI) bow_array_entry_at_index (CDOCS, DI)


/* Traversing directories to get filenames. */

/* A list of document names. */
/* xxx We might change this someday to allow for multiple documents
   per file, e.g. for "mbox" files containing many email messages. */
typedef struct _bow_doc_list {
  struct _bow_doc_list *next;
  char filename[0];
} bow_doc_list;

/* Return a non-zero value if the file FP contains mostly text. */
int bow_fp_is_text (FILE *fp);

/* Return a non-zero value if the char array BUF contains mostly text. */
int bow_str_is_text (char *buf);

/* bow_*_is_text() always returns `yes'.  This is useful for Japanese
   byte codes. */
extern int bow_is_text_always_yes;

/* Calls the function CALLBACK for each of the filenames encountered
   when recursively descending the directory named DIRNAME.  CALLBACK
   should be a pointer to function that takes a filename char-pointer,
   and a void-pointer as arguments and returns an integer.  Currently
   the return value is ignored, but it may be used in the future to
   cut short, causing bow_map_filenames_from_dir to return
   immediately.  The value CONTEXT will be passed as the second
   argument to the CALLBACK function; it provides you with a way to
   transfer context you may need inside the implementation of the
   callback function.  EXCLUDE_PATTERNS is currently ignored. */
int
bow_map_filenames_from_dir (int (*callback)(const char *filename,
					    void *context),
			    void *context,
			    const char *dirname,
			    const char *exclude_patterns);

/* Calls the function CALLBACK for each of the files found in the
   database DIRNAME.  See bow_map_filenames_from_dir for more info */
int
bow_map_filenames_from_hdb (int (*callback)(const char *filename, char *data,
					    void *context),
			    void *context,
			    const char *dirname,
			    const char *exclude_patterns);

/* Create a linked list of filenames, and append the file list pointed
   to by LIST to it; return the new concatenated lists in *LIST.  The
   function returns the total number of filenames.  When creating the
   list, look for files (and recursively descend directories) among
   those matching INCLUDE_PATTERNS, but don't include those matching
   EXCLUDE_PATTERNS; don't include files that aren't text files. */
/* xxx For now, this only works with a single directory name in
   INCLUDE_PATTERNS, and it ignores EXCLUDE_PATTERNS. */
int bow_doc_list_append (bow_doc_list **list,
			 const char *include_patterns,
			 const char *exclude_patterns);

/* Print the file list FL to the output stream FP. */
void bow_doc_list_fprintf (FILE *fp, bow_doc_list *fl);

/* Return the number of entries in the "docname list" DL. */
int bow_doc_list_length (bow_doc_list *dl);

/* Free the memory held by the file list FL. */
void bow_doc_list_free (bow_doc_list *fl);


/* A convenient interface to a specific instance of the above int/string
   mapping; this one is intended for all the documents encountered. */
💿 文件大小 522 K
👤 上传用户 yuanata
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#mitchell #tom #机器学习 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -