📄 libbow.h

📁 良好的代码实现
💻 H
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
/* Memory allocation with error checking. */

/* These "extern inline" functions in this .h file will only be taken
   from here if gcc is optimizing, otherwise, they will be taken from
   identical copies defined in io.c.
   NOTE(review): if io.c keeps hand-written copies instead of
   re-including this header, mirror any change made here. */
#if ! defined (_BOW_MALLOC_INLINE_EXTERN)
#define _BOW_MALLOC_INLINE_EXTERN inline extern
#endif

/* Allocate S bytes with malloc() and return the result.  If malloc()
   returns NULL for a non-zero S, bow_error() is called with
   "Memory exhausted.".  A NULL result for a zero-byte request is not
   reported, because malloc(0) is permitted to return NULL. */
_BOW_MALLOC_INLINE_EXTERN void *
bow_malloc (size_t s)
{
  void *ret = malloc (s);

  if (ret == NULL && s != 0)
    bow_error ("Memory exhausted.");
  return ret;
}

/* Resize the block PTR to S bytes with realloc() and return the
   result.  If realloc() returns NULL for a non-zero S, bow_error() is
   called with "Memory exhausted.".  A NULL result for a zero-byte
   request is not reported, because realloc(ptr, 0) is permitted to
   return NULL. */
_BOW_MALLOC_INLINE_EXTERN void *
bow_realloc (void *ptr, size_t s)
{
  void *ret = realloc (ptr, s);

  if (ret == NULL && s != 0)
    bow_error ("Memory exhausted.");
  return ret;
}

/* Release PTR with free(). */
_BOW_MALLOC_INLINE_EXTERN void
bow_free (void *ptr)
{
  free (ptr);
}


/* Conveniences for writing and reading. */

/* Version number of file format used to write binary data. */
extern int bow_file_format_version;

/* The default, initial value of above variable.  The above variable will
   take on a different value when reading from binary data archived with
   a different format version. */
#define BOW_DEFAULT_FILE_FORMAT_VERSION 5

/* Functions for conveniently recording and finding out the format
   version used to write binary data to disk. */
void bow_write_format_version_to_file (const char *filename);
void bow_read_format_version_from_file (const char *filename);

/* Open a file using fopen(), with the same parameters.  Check the
   return value, and raise an error if the open failed.  The caller
   should close the returned file-pointer with fclose().
*/#define bow_fopen(FILENAME, MODE)					\({									\  FILE *ret;								\  ret = fopen (FILENAME, MODE);						\  if (ret == NULL)							\    {									\      if (*MODE == 'r')							\        {								\	  perror ("bow_fopen");						\	  bow_error ("Couldn't open file `%s' for reading", FILENAME);	\        }								\      else								\        {								\          perror ("bow_fopen");						\	  bow_error ("Couldn't open file `%s' for writing", FILENAME);	\        }								\    }									\  ret;									\})/* These "extern inline" functions in this .h file will only be taken   from here if gcc is optimizing, otherwise, they will be taken from   identical copies defined in io.c */#if ! defined (_BOW_IO_INLINE_EXTERN)#define _BOW_IO_INLINE_EXTERN inline extern#endif/* Write a (int) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_int (int n, FILE *fp){  int num_written;  n = htonl (n);  num_written = fwrite (&n, sizeof (int), 1, fp);  assert (num_written == 1);  return num_written * sizeof (int);}/* Read a (long) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_int (int *np, FILE *fp){  int num_read;  num_read = fread (np, sizeof (int), 1, fp);  assert (num_read == 1);  *np = ntohl (*np);  return num_read * sizeof (int);}/* Write a (short) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_short (short n, FILE *fp){  int num_written;  n = htons (n);  num_written = fwrite (&n, sizeof (short), 1, fp);  assert (num_written == 1);  return num_written * sizeof (short);}/* Read a (long) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_short (short *np, FILE *fp){  int num_read;  num_read = fread (np, sizeof (short), 1, fp);  assert (num_read == 1);  *np = ntohs (*np);  return num_read * sizeof (short);}/* Write a "char*"-string value to the stream FP. 
*/_BOW_IO_INLINE_EXTERN intbow_fwrite_string (const char *s, FILE *fp){  short len;  int ret;  if (s)    len = strlen (s);  else    len = 0;  ret = bow_fwrite_short (len, fp);  if (len)    ret += fwrite (s, sizeof (char), len, fp);  assert (ret == (int)sizeof (short) + len);  return ret;}/* Read a "char*"-string value from the stream FP.  The memory for the   string will be allocated using bow_malloc(). */_BOW_IO_INLINE_EXTERN intbow_fread_string (char **s, FILE *fp){  short len;  int ret;  ret = bow_fread_short (&len, fp);  assert (ret >= 0);  *s = bow_malloc (len+1);  if (len)    ret += fread (*s, sizeof (char), len, fp);  assert (ret = sizeof (short) + len);  (*s)[len] = '\0';  return ret;}/* Write a (float) value to the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fwrite_float (float n, FILE *fp){  /* xxx This is not machine-independent! */  int num_written;  num_written = fwrite (&n, sizeof (float), 1, fp);  assert (num_written == 1);  return num_written * sizeof (float);}/* Read a (float) value from the stream FP. */_BOW_IO_INLINE_EXTERN intbow_fread_float (float *np, FILE *fp){  /* xxx This is not machine-independent! */  int num_written;  num_written = fread (np, sizeof (float), 1, fp);  assert (num_written == 1);  return num_written * sizeof (float);}/* Manipulating a heap of documents *//* Elements of the heap. */typedef struct _bow_dv_heap_element {  bow_dv *dv;                   /* The document vector */  int wi;                       /* The id of this word */  int index;                    /* Where we are in the vector at the mo. */  int current_di;               /* Might as well keep the key here. */} bow_dv_heap_element;/* The heap itself */typedef struct _bow_dv_heap {  int length;                   /* How many items in the heap */  bow_dv_heap_element entry[0];	/* The heap */} bow_dv_heap;/* Turn an array of bow_dv_heap_elements into a proper heap. The   heapify function starts working at position i and works down the   heap.  
The heap is indexed from position 1. */void bow_heapify (bow_dv_heap *wi2dvf, int i);/* Function to take the top element of the heap - move it's index   along and place it back in the heap. */void bow_dv_heap_update (bow_dv_heap *heap);/* Function to make a heap from all the vectors of documents in the big   data structure we've built - I hope it all fits.... */bow_dv_heap *bow_make_dv_heap_from_wi2dvf (bow_wi2dvf *wi2dvf);/* Function to create a heap of the vectors of documents associated   with each word in the word vector. */bow_dv_heap *bow_make_dv_heap_from_wv (bow_wi2dvf *wi2dvf, bow_wv *wv);/* Classes for classification.  In some cases each document will   be in its own class. */typedef struct _bow_class {  short class;  float length;} bow_class;/* If non-zero, print to stdout the contribution of each word to   each class.  Currently implemented only for PrInd. */extern int bow_print_word_scores;/* Assigning weights to documents and calculating vector lengths *//* Normalize the weight-vector for each class (or document) such that   all vectors have Euclidean length 1. */void bow_barrel_normalize_weights_by_vector_length (bow_barrel *barrel);/* Normalize the weight-vector for each class (or document) such that   in all vectors, the elements of the vector sum to 1. */void bow_barrel_normalize_weights_by_summing (bow_barrel *barrel);/* Creating and working with test sets. *//* This takes a bow_array of bow_cdoc's and first sets them all to be in the   model. It then randomly choses 'no_test' bow_cdoc's to be in the test set   and sets their type to be test. */void bow_test_split (bow_barrel *barrel, int num_test);/* This function sets up the data structure so we can step through the word   vectors for each test document easily. */bow_dv_heap *bow_test_new_heap (bow_barrel *barrel);typedef struct _bow_test_wv {  int di;                          /* The di of this test document. 
*/  bow_wv wv;                       /* It's associated wv */} bow_test_wv;/* This function takes the heap returned by bow_initialise_test_set and   creates a word vector corresponding to the next document in the test set.   The index of the test document is returned. If the test set is empty, 0   is returned and *wv == NULL. This can't really deal with   vectors which are all zero, since they are not represented explicitly   in our data structure. Not sure what we should/can do. */int bow_test_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Like BOW_TEST_NEXT_WV, but for type!=test instead of type==test */int bow_nontest_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Like bow_test_next_wv, but for type==model instead of type==test */int bow_model_next_wv (bow_dv_heap *heap, bow_barrel *barrel, bow_wv **wv);/* Functions for information gain and Foilgain *//* Return a malloc()'ed array containing an infomation-gain score for   each word index; it is the caller's responsibility to free this   array.  NUM_CLASSES should be the total number of classes in the   BARREL.  The number of entries in the returned array will be found   in SIZE. */float *bow_infogain_per_wi_new (bow_barrel *barrel, int num_classes, 				int *size);/* Return a malloc()'ed array containing an infomation-gain score for   each word index, but the infogain scores are computing from   co-occurance of word pairs. */float *bow_infogain_per_wi_new_using_pairs (bow_barrel *barrel, 					    int num_classes, int *size);/* Return a malloc()'ed array containing an Foil-gain score for   each ``word-index / class pair''.  BARREL must be a `doc_barrel' */float **bow_foilgain_per_wi_ci_new (bow_barrel *barrel, 				    int num_classes, int *size);/* Free the memory allocated in the return value of the function   bow_foilgain_per_wi_ci_new() */void bow_foilgain_free (float **fig_per_wi_ci, int num_wi);/* Print to stdout the sorted results of bow_infogain_per_wi_new().   
It will print the NUM_TO_PRINT words with the highest infogain. */void bow_infogain_per_wi_print (FILE *fp, bow_barrel *barrel, int num_classes, 				int num_to_print);/* Modify the int/word mapping by removing all words except the   NUM_WORDS_TO_KEEP number of words that have the top information   gain. */void bow_words_keep_top_by_infogain (int num_words_to_keep, 				     bow_barrel *barrel, int num_classes);/* Parsing news article headers *//* Function which takes a freshly opened file and reads in the lines up to   the first blank line, parsing them into header/contents. An sarray is   returned with the header lines (e.g.Subject) as keys and the entries are   strings containing the contents. This function _will_ do bad things if not   used on a news article. */bow_sarray *bow_parse_news_headers (FILE *fp);/* Function to take the headers bow_sarray and return a bow_array of strings   corresponding to the newsgroups. */bow_array *bow_headers2newsgroups(bow_sarray *headers);/* argp command-line processing for libbow */extern struct argp_child bow_argp_children[];/* Global variables whose value is set by bow_argp functions, but   which must be examined by some other function (called later) in   order to have any effect. *//* N for removing all but the top N words by selecting words with   highest information gain */extern int bow_prune_vocab_by_infogain_n;/* N for removing words that occur less than N times */extern int bow_prune_vocab_by_occur_count_n;/* The weight-setting and scoring method */extern bow_method *bow_argp_method;/* The directory in which we'll store word-vector data. */extern const char *bow_data_dirname;/* If non-zero, use equal prior probabilities on classes when setting   weights, calculating infogain and scoring */extern int bow_uniform_class_priors;/* If non-zero, use binary occurrence counts for words. */extern int bow_binary_word_counts;/* Don't lex any files with names matching this. 
*/extern const char *bow_exclude_filename;/* Pipe the files through this shell command before lexing. */extern const char *bow_lex_pipe_command;/* If non-zero, check for eencoding blocks before istext() says that   the file is text. */extern int bow_istext_avoid_uuencode;#endif /* __libbow_h_INCLUDE */
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -