📄 libbow.h
字号:
/* Memory allocation with error checking. *//* These "extern inline" functions in this .h file will only be taken from here if gcc is optimizing, otherwise, they will be taken from identical copies defined in io.c */#if ! defined (_BOW_MALLOC_INLINE_EXTERN)#define _BOW_MALLOC_INLINE_EXTERN inline extern#endif_BOW_MALLOC_INLINE_EXTERN void *bow_malloc (size_t s){ void *ret; ret = malloc (s); if (!ret) bow_error ("Memory exhausted."); return ret;}_BOW_MALLOC_INLINE_EXTERN void * bow_realloc (void *ptr, size_t s){ void *ret; ret = realloc (ptr, s); if (!ret) bow_error ("Memory exhausted."); return ret;}_BOW_MALLOC_INLINE_EXTERN voidbow_free (void *ptr){ free (ptr);}/* Conveniences for writing and reading. *//* Version number of file format used to write binary data. */extern int bow_file_format_version;/* The default, initial value of above variable. The above variable will take on a different value when reading from binary data archived with a different format version. */#define BOW_DEFAULT_FILE_FORMAT_VERSION 5/* Functions for conveniently recording and finding out the format version used to write binary data to disk. */void bow_write_format_version_to_file (const char *filename);void bow_read_format_version_from_file (const char *filename);/* Open a file using fopen(), with the same parameters. Check the return value, and raise an error if the open failed. The caller should close the returned file-pointer with fclose(). */#define bow_fopen(FILENAME, MODE) \({ \ FILE *ret; \ ret = fopen (FILENAME, MODE); \ if (ret == NULL) \ { \ if (*MODE == 'r') \ { \ perror ("bow_fopen"); \ bow_error ("Couldn't open file `%s' for reading", FILENAME); \ } \ else \ { \ perror ("bow_fopen"); \ bow_error ("Couldn't open file `%s' for writing", FILENAME); \ } \ } \ ret; \})/* These "extern inline" functions in this .h file will only be taken from here if gcc is optimizing, otherwise, they will be taken from identical copies defined in io.c */#if ! defined (_BOW_IO_INLINE_EXTERN)#define _BOW_IO_INLINE_EXTERN inline extern#endif/* Write a (int) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_int (int n, FILE *fp){ int num_written; n = htonl (n); num_written = fwrite (&n, sizeof (int), 1, fp); assert (num_written == 1); return num_written * sizeof (int);}/* Read a (long) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_int (int *np, FILE *fp){ int num_read; num_read = fread (np, sizeof (int), 1, fp); assert (num_read == 1); *np = ntohl (*np); return num_read * sizeof (int);}/* Write a (short) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_short (short n, FILE *fp){ int num_written; n = htons (n); num_written = fwrite (&n, sizeof (short), 1, fp); assert (num_written == 1); return num_written * sizeof (short);}/* Read a (long) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_short (short *np, FILE *fp){ int num_read; num_read = fread (np, sizeof (short), 1, fp); assert (num_read == 1); *np = ntohs (*np); return num_read * sizeof (short);}/* Write a "char*"-string value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_string (const char *s, FILE *fp){ short len; int ret; if (s) len = strlen (s); else len = 0; ret = bow_fwrite_short (len, fp); if (len) ret += fwrite (s, sizeof (char), len, fp); assert (ret == (int)sizeof (short) + len); return ret;}/* Read a "char*"-string value from the stream FP. The memory for the string will be allocated using bow_malloc(). */_BOW_IO_INLINE_EXTERN intbow_fread_string (char **s, FILE *fp){ short len; int ret; ret = bow_fread_short (&len, fp); assert (ret >= 0); *s = bow_malloc (len+1); if (len) ret += fread (*s, sizeof (char), len, fp); assert (ret = sizeof (short) + len); (*s)[len] = '\0'; return ret;}/* Write a (float) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_float (float n, FILE *fp){ /* xxx This is not machine-independent! */ int num_written; num_written = fwrite (&n, sizeof (float), 1, fp); assert (num_written == 1); return num_written * sizeof (float);}/* Read a (float) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_float (float *np, FILE *fp){ /* xxx This is not machine-independent! */ int num_written; num_written = fread (np, sizeof (float), 1, fp); assert (num_written == 1); return num_written * sizeof (float);}/* Manipulating a heap of documents *//* Elements of the heap. */typedef struct _bow_dv_heap_element { bow_dv *dv; /* The document vector */ int wi; /* The id of this word */ int index; /* Where we are in the vector at the mo. */ int current_di; /* Might as well keep the key here. */} bow_dv_heap_element;/* The heap itself */typedef struct _bow_dv_heap { int length; /* How many items in the heap */ bow_dv_heap_element entry[0]; /* The heap */} bow_dv_heap;/* Turn an array of bow_dv_heap_elements into a proper heap. The heapify function starts working at position i and works down the heap. The heap is indexed from position 1. */void bow_heapify (bow_dv_heap *wi2dvf, int i);/* Function to take the top element of the heap - move it's index along and place it back in the heap. */void bow_dv_heap_update (bow_dv_heap *heap);/* Function to make a heap from all the vectors of documents in the big data structure we've built - I hope it all fits.... */bow_dv_heap *bow_make_dv_heap_from_wi2dvf (bow_wi2dvf *wi2dvf);/* Function to create a heap of the vectors of documents associated with each word in the word vector. */bow_dv_heap *bow_make_dv_heap_from_wv (bow_wi2dvf *wi2dvf, bow_wv *wv);/* Classes for classification. In some cases each document will be in its own class. */typedef struct _bow_class { short class; float length;} bow_class;/* If non-zero, print to stdout the contribution of each word to each class. Currently implemented only for PrInd. */extern int bow_print_word_scores;/* Assigning weights to documents and calculating vector lengths *//* Normalize the weight-vector for each class (or document) such that all vectors have Euclidean length 1. */void bow_barrel_normalize_weights_by_vector_length (bow_barrel *barrel);/* Normalize the weight-vector for each class (or document) such that in all vectors, the elements of the vector sum to 1. */void bow_barrel_normalize_weights_by_summing (bow_barrel *barrel);/* Creating and working with test sets. *//* This takes a bow_array of bow_cdoc's and first sets them all to be in the model. It then randomly choses 'no_test' bow_cdoc's to be in the test set and sets their type to be test. */void bow_test_split (bow_barrel *barrel, int num_test);/* This function sets up the data structure so we can step through the word vectors for each test document easily. */bow_dv_heap *bow_test_new_heap (bow_barrel *barrel);typedef struct _bow_test_wv { int di; /* The di of this test document. */ bow_wv wv; /* It's associated wv */} bow_test_wv;/* This function takes the heap returned by bow_initialise_test_set and creates a word vector corresponding to the next document in the test set. The index of the test document is returned. If the test set is empty, 0 is returned and *wv == NULL. This can't really deal with vectors which are all zero, since they are not represented explicitly in our data structure. Not sure what we should/can do. */int bow_test_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Like BOW_TEST_NEXT_WV, but for type!=test instead of type==test */int bow_nontest_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Like bow_test_next_wv, but for type==model instead of type==test */int bow_model_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Functions for information gain and Foilgain *//* Return a malloc()'ed array containing an infomation-gain score for each word index; it is the caller's responsibility to free this array. NUM_CLASSES should be the total number of classes in the BARREL. The number of entries in the returned array will be found in SIZE. */float *bow_infogain_per_wi_new (bow_barrel *barrel, int num_classes, int *size);/* Return a malloc()'ed array containing an infomation-gain score for each word index, but the infogain scores are computing from co-occurance of word pairs. */float *bow_infogain_per_wi_new_using_pairs (bow_barrel *barrel, int num_classes, int *size);/* Return a malloc()'ed array containing an Foil-gain score for each ``word-index / class pair''. BARREL must be a `doc_barrel' */float **bow_foilgain_per_wi_ci_new (bow_barrel *barrel, int num_classes, int *size);/* Free the memory allocated in the return value of the function bow_foilgain_per_wi_ci_new() */void bow_foilgain_free (float **fig_per_wi_ci, int num_wi);/* Print to stdout the sorted results of bow_infogain_per_wi_new(). It will print the NUM_TO_PRINT words with the highest infogain. */void bow_infogain_per_wi_print (FILE *fp, bow_barrel *barrel, int num_classes, int num_to_print);/* Modify the int/word mapping by removing all words except the NUM_WORDS_TO_KEEP number of words that have the top information gain. */void bow_words_keep_top_by_infogain (int num_words_to_keep, bow_barrel *barrel, int num_classes);/* Parsing news article headers *//* Function which takes a freshly opened file and reads in the lines up to the first blank line, parsing them into header/contents. An sarray is returned with the header lines (e.g.Subject) as keys and the entries are strings containing the contents. This function _will_ do bad things if not used on a news article. */bow_sarray *bow_parse_news_headers (FILE *fp);/* Function to take the headers bow_sarray and return a bow_array of strings corresponding to the newsgroups. */bow_array *bow_headers2newsgroups(bow_sarray *headers);/* argp command-line processing for libbow */extern struct argp_child bow_argp_children[];/* Global variables whose value is set by bow_argp functions, but which must be examined by some other function (called later) in order to have any effect. *//* N for removing all but the top N words by selecting words with highest information gain */extern int bow_prune_vocab_by_infogain_n;/* N for removing words that occur less than N times */extern int bow_prune_vocab_by_occur_count_n;/* The weight-setting and scoring method */extern bow_method *bow_argp_method;/* The directory in which we'll store word-vector data. */extern const char *bow_data_dirname;/* If non-zero, use equal prior probabilities on classes when setting weights, calculating infogain and scoring */extern int bow_uniform_class_priors;/* If non-zero, use binary occurrence counts for words. */extern int bow_binary_word_counts;/* Don't lex any files with names matching this. */extern const char *bow_exclude_filename;/* Pipe the files through this shell command before lexing. */extern const char *bow_lex_pipe_command;/* If non-zero, check for eencoding blocks before istext() says that the file is text. */extern int bow_istext_avoid_uuencode;#endif /* __libbow_h_INCLUDE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -