📄 libbow.h
字号:
/* Create a new barrel and fill it from contents in
   --print-barrel=FORMAT read in from FILENAME. */
bow_barrel *bow_barrel_new_from_printed_barrel_file (const char *filename,
                                                     const char *format);

/* Add statistics to the barrel BARREL by indexing all the documents
   found when recursively descending directory DIRNAME.  Return the
   number of additional documents indexed.
   NOTE(review): EXCEPT_NAME and CLASSNAMES are not described here;
   confirm their meaning against the implementation. */
int bow_barrel_add_from_text_dir (bow_barrel *barrel,
                                  const char *dirname,
                                  const char *except_name,
                                  const char *classnames);

/* Add statistics to the barrel BARREL by indexing all the documents
   in HDB database DIRNAME.  Return the number of additional
   documents indexed.
   NOTE(review): EXCEPT_NAME and CLASSNAMES are not described here;
   confirm their meaning against the implementation. */
int bow_barrel_add_from_hdb (bow_barrel *barrel,
                             const char *dirname,
                             const char *except_name,
                             const char *classnames);

/* Add statistics about the document described by CDOC and WV to the
   BARREL. */
int bow_barrel_add_document (bow_barrel *barrel,
                             bow_cdoc *cdoc,
                             bow_wv *wv);

/* Call this on a vector-per-document barrel to set the CDOC->PRIOR's
   so that the CDOC->PRIOR's for all documents of the same class sum
   to 1. */
void bow_barrel_set_cdoc_priors_to_class_uniform (bow_barrel *barrel);

/* Given a barrel of documents, create and return another barrel with
   only one vector per class.  The classes will be represented as
   "documents" in this new barrel.  CLASSNAMES is an array of strings
   that maps class indices to class names.
   NOTE(review): the prototype below takes no CLASSNAMES argument, so
   the previous sentence looks stale -- confirm and remove. */
bow_barrel *bow_barrel_new_vpc (bow_barrel *barrel);

/* Like bow_barrel_new_vpc(), but it also sets and normalizes the
   weights appropriately by calling SET_WEIGHTS from the METHOD of
   DOC_BARREL on the `vector-per-class' barrel that will be returned. */
bow_barrel *bow_barrel_new_vpc_merge_then_weight (bow_barrel *doc_barrel);

/* Same as above, but set the weights in the DOC_BARREL, create the
   `Vector-Per-Class' barrel, and set the weights in the VPC barrel by
   summing weights from the DOC_BARREL.
*/
bow_barrel *bow_barrel_new_vpc_weight_then_merge (bow_barrel *doc_barrel);

/* Set the class prior probabilities by counting the number of
   documents of each class. */
void bow_barrel_set_vpc_priors_by_counting (bow_barrel *vpc_barrel,
                                            bow_barrel *doc_barrel);

/* Like bow_barrel_new_vpc, but uses both labeled and unlabeled data.
   It uses the class_probs of each doc to determine its class
   membership.  The counts in the wi2dvf are set to bogus numbers.
   The weights of the wi2dvf contain the real information.  The
   normalizer of each vpc cdoc is set to the fractional number of
   documents per class.  The word_count of each vpc cdoc is the
   rounded integer for the number of documents per class.  The
   word_count of each document cdoc is set to the sum of the counts of
   its corresponding word vector.  This is to get correct numbers for
   the doc-then-word event model. */
bow_barrel * bow_barrel_new_vpc_using_class_probs (bow_barrel *doc_barrel);

/* Set the class prior probabilities by doing a weighted (by class
   membership) count of the number of labeled and unlabeled documents
   in each class.  This uses class_probs to determine class
   memberships of the documents. */
void bow_barrel_set_vpc_priors_using_class_probs (bow_barrel *vpc_barrel,
                                                  bow_barrel *doc_barrel);

/* Multiply each weight by the Quinlan `Foilgain' of that word. */
void bow_barrel_scale_weights_by_foilgain (bow_barrel *barrel,
                                           bow_barrel *doc_barrel);

/* Multiply each weight by the information gain of that word. */
void bow_barrel_scale_weights_by_infogain (bow_barrel *barrel,
                                           bow_barrel *doc_barrel);

/* Modify the BARREL by removing those entries for words that are not
   in the int/str mapping MAP. */
void bow_barrel_prune_words_not_in_map (bow_barrel *barrel,
                                        bow_int4str *map);

/* Modify the BARREL by removing those entries for words that are in
   the int/str mapping MAP.
*/
void bow_barrel_prune_words_in_map (bow_barrel *barrel,
                                    bow_int4str *map);

/* Modify the BARREL by removing those entries for words that are not
   among the NUM_WORDS_TO_KEEP top words, by information gain.  This
   function is similar to BOW_WORDS_KEEP_TOP_BY_INFOGAIN(), but this
   one doesn't change the word-int mapping. */
void bow_barrel_keep_top_words_by_infogain (int num_words_to_keep,
                                            bow_barrel *barrel,
                                            int num_classes);

/* Set the BARREL->WI2DVF->ENTRY[WI].IDF to the sum of the COUNTS for
   the given WI among those documents in the training set. */
void bow_barrel_set_idf_to_count_in_train (bow_barrel *barrel);

/* Return the number of unique words among those documents with TYPE
   tag (train, test, unlabeled, etc) equal to TYPE. */
int bow_barrel_num_unique_words_of_type (bow_barrel *doc_barrel, int type);

/* Write BARREL to the file-pointer FP in a machine independent
   format. */
void bow_barrel_write (bow_barrel *barrel, FILE *fp);

/* Create and return a `barrel' by reading data from the file-pointer
   FP. */
bow_barrel *bow_barrel_new_from_data_fp (FILE *fp);

/* Print barrel to FP in human-readable and awk-accessible format. */
void bow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format);

/* Print as above, but print only those documents for which the
   function PRINT_IF_TRUE returns non-zero. */
void bow_barrel_printf_selected (bow_barrel *barrel, FILE *fp,
                                 const char *format,
                                 int (*print_if_true)(bow_cdoc*));

/* Print on stdout the number of times WORD occurs in the various
   docs/classes of BARREL. */
void bow_barrel_print_word_count (bow_barrel *barrel, const char *word);

/* For copying a class barrel.  Doesn't deal with class_probs at all. */
bow_barrel *bow_barrel_copy (bow_barrel *barrel);

/* Return an iterator for the columns of BARREL in class CI. */
bow_iterator_double *bow_barrel_iterator_for_ci_new(bow_barrel *barrel,int ci);

/* Free the memory held by BARREL.
*/
void bow_barrel_free (bow_barrel *barrel);


/* Assign the values of the "word vector entry's" WEIGHT field
   equal to the COUNT. */
void bow_wv_set_weights_to_count (bow_wv *wv, bow_barrel *barrel);

/* Assign weight values appropriate for the different event models.
   For document, weights are 0/1.  For word, weights are same as
   counts.  For doc-then-word, weights are normalized counts.  Sets
   normalizer to be total number of words as appropriate for the
   event model. */
void bow_wv_set_weights_by_event_model (bow_wv *wv, bow_barrel *barrel);

/* Assign the values of the "word vector entry's" WEIGHT field
   equal to the COUNT times the word's IDF, taken from the BARREL. */
void bow_wv_set_weights_to_count_times_idf (bow_wv *wv, bow_barrel *barrel);

/* Assign the values of the "word vector entry's" WEIGHT field
   equal to the log(COUNT) times the word's IDF, taken from the
   BARREL. */
void bow_wv_set_weights_to_log_count_times_idf(bow_wv *wv, bow_barrel *barrel);


/* Parsing headers from email messages. */
/* xxx Eventually all these will be replaced by use of a regular
   expression library. */

/* Read into BUF the characters inside the `<>' of the `Message-Id:'
   field of the email message contained in the file pointer FP.
   Return the number of characters placed in BUF.  Signal an error if
   more than BUFLEN characters are necessary.  Return -1 if no
   matching field is found. */
int bow_email_get_msgid (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters between the `Received: from ' and the
   following space, and the characters between the ` id ' and the
   following `;' in the file pointer FP.  Return the number of
   characters placed in BUF.  Signal an error if more than BUFLEN
   characters are necessary.  Return -1 if no matching field is
   found. */
int bow_email_get_receivedid (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters inside the `<>' of the `In-Reply-To:'
   field of the email message contained in the file pointer FP.
   Return the number of characters placed in BUF.
Signal an error if more than BUFLEN characters are necessary.
   Return -1 if no matching field is found. */
int bow_email_get_replyid (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters inside the `<>' of the `References:'
   field of the news message contained in the file pointer FP.
   Return the number of characters placed in BUF.  Signal an error if
   more than BUFLEN characters are necessary.  Return -1 if no
   matching field is found. */
int bow_email_get_references (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters inside the `<>' of the
   `Resent-Message-Id:' field of the email message contained in the
   file pointer FP.  Return the number of characters placed in BUF.
   Signal an error if more than BUFLEN characters are necessary.
   Return -1 if no matching field is found. */
int bow_email_get_resent_msgid (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters inside the `<>' of the `From:' field
   of the email message contained in the file pointer FP.  Return the
   number of characters placed in BUF.  Signal an error if more than
   BUFLEN characters are necessary.  Return -1 if no matching field is
   found. */
int bow_email_get_sender (FILE *fp, char *buf, int buflen);

/* Read into BUF the characters inside the `<>' of the `To:' field
   of the email message contained in the file pointer FP.  Return the
   number of characters placed in BUF.  Signal an error if more than
   BUFLEN characters are necessary.  Return -1 if no matching field is
   found. */
int bow_email_get_recipient (FILE *fp, char *buf, int buflen);

/* Read into BUF the day, month and year of the `Date:' field of the
   email message contained in the file pointer FP.  The format is
   something like `21 Jul 1996'.  Return the number of characters
   placed in BUF.  Signal an error if more than BUFLEN characters are
   necessary.  Return -1 if no matching field is found. */
int bow_email_get_date (FILE *fp, char *buf, int buflen);


/* Progress and error reporting.
*/enum bow_verbosity_levels { bow_silent = 0, /* never print anything */ bow_quiet, /* only warnings and errors */ bow_progress, /* minimal # lines to show progress, use \b */ bow_verbose, /* give more status info */ bow_chatty, /* stuff most users wouldn't care about */ bow_screaming /* every little nit */};/* Examined by bow_verbosify() to determine whether to print the message. Default is bow_progress. */extern int bow_verbosity_level;/* If this is 0, and the message passed to bow_verbosify() contains backspaces, then the message will not be printed. It is useful to turn this off when debugging inside an emacs window. The default value is on. */extern int bow_verbosity_use_backspace;/* Print the printf-style FORMAT string and ARGS on STDERR, only if the BOW_VERBOSITY_LEVEL is equal or greater than the argument VERBOSITY_LEVEL. */int bow_verbosify (int verbosity_level, const char *format, ...);/* Print the printf-style FORMAT string and ARGS on STDERR, and abort. This function appends a newline to the printed message. */#define bow_error(FORMAT, ARGS...) \({if (bow_verbosity_level > bow_silent) \ { \ fprintf (stderr, "%s: ", __PRETTY_FUNCTION__); \ _bow_error (FORMAT , ## ARGS); \ } \ else \ { \ abort (); \ }}) volatile void _bow_error (const char *format, ...);/* Memory allocation with error checking. *//* These "extern inline" functions in this .h file will only be taken from here if gcc is optimizing, otherwise, they will be taken from identical copies defined in io.c */void (*bow_malloc_hook) (void *ptr);void (*bow_realloc_hook) (void *old, void *new);void (*bow_free_hook) (void *ptr);#if ! 
defined (_BOW_MALLOC_INLINE_EXTERN)#define _BOW_MALLOC_INLINE_EXTERN inline extern#endif_BOW_MALLOC_INLINE_EXTERN void *bow_malloc (size_t s){ void *ret;#if BOW_MCHECK static int mcheck_called = 0; if (!mcheck_called) { int r; r = mcheck (NULL); assert (r == 0); mcheck_called = 1; }#endif /* BOW_MCHECK */ ret = malloc (s); if (!ret) bow_error ("Memory exhausted."); if (bow_malloc_hook) (*bow_malloc_hook) (ret); return ret;}_BOW_MALLOC_INLINE_EXTERN void * bow_realloc (void *ptr, size_t s){ void *ret; ret = realloc (ptr, s); if (!ret) bow_error ("Memory exhausted."); if (bow_realloc_hook) (*bow_realloc_hook) (ptr, ret); return ret;}_BOW_MALLOC_INLINE_EXTERN voidbow_free (void *ptr){ if (bow_free_hook) bow_free_hook (ptr); free (ptr);}/* Conveniences for writing and reading. *//* Version number of file format used to write binary data. */extern int bow_file_format_version;/* The default, initial value of above variable. The above variable will take on a different value when reading from binary data archived with a different format version. */#define BOW_DEFAULT_FILE_FORMAT_VERSION 7/* Functions for conveniently recording and finding out the format version used to write binary data to disk. */void bow_write_format_version_to_file (const char *filename);void bow_read_format_version_from_file (const char *filename);/* Open a file using fopen(), with the same parameters. Check the return value, and raise an error if the open failed. The caller should close the returned file-pointer with fclose(). */#define bow_fopen(FILENAME, MODE) \({ \ FILE *ret;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -