📄 libbow.h

📁 良好的代码实现
💻 H
📖 第 1 页 / 共 4 页
字号:
/* Return a pointer to the entry at index INDEX. */void *bow_sarray_entry_at_index (bow_sarray *sa, int index);/* Return a pointer to the entry associated with string KEYSTR. */void *bow_sarray_entry_at_keystr (bow_sarray *sa, const char *keystr);/* Return the string KEYSTR associated with the entry at index INDEX. */const char *bow_sarray_keystr_at_index (bow_sarray *sa, int index);/* Return the index of the entry associated with the string KEYSTR. */int bow_sarray_index_at_keystr (bow_sarray *sa, const char *keystr);/* Write the sarray SARRAY to the file-pointer FP, using the function   WRITE_FUNC to write each of the entries in SARRAY. */void bow_sarray_write (bow_sarray *sarray, int (*write_func)(void*,FILE*), 		       FILE *fp);/* Return a new sarray, created by reading file-pointer FP, and using   the function READ_FUNC to read each of the sarray entries.  The   returned sarray will have entry-freeing-function FREE_FUNC. */bow_sarray *bow_sarray_new_from_data_fp (int (*read_func)(void*,FILE*), 					 void (*free_func)(),					 FILE *fp);/* Free the memory held by the bow_sarray SA. */void bow_sarray_free (bow_sarray *sa);/* Bit vectors, indexed by multiple dimensions.  They can grow   automatically in the last dimension. */typedef struct _bow_bitvec {  int num_dimensions;		/* the number of dimensions by which indexed */  int *dimension_sizes;		/* the sizes of each index dimension */  int vector_size;		/* size of VECTOR in bytes */  unsigned char *vector;	/* the memory for the bit vector */} bow_bitvec;/* Allocate, initialize and return a new "bit vector" that is indexed   by NUM_DIMENSIONS different dimensions.  The size of each dimension   is given in DIMENSION_SIZES.  The size of the last dimension is   used as hint for allocating initial memory for the vector, but in   practice, higher indices for the last dimension can be used later,   and the bit vector will grow automatically.  Initially, the bit   vector contains all zeros. */bow_bitvec *bow_bitvec_new (int num_dimensions, int *dimension_sizes);/* Set all the bits in the bit vector BV to 0 if value is zero, to 1   otherwise. */void bow_bitvec_set_all_to_value (bow_bitvec *bv, int value);/* If VALUE is non-zero, set the bit at indices INDICES to 1,   otherwise set it to zero.  Returns the previous value of that   bit. */int bow_bitvec_set (bow_bitvec *bv, int *indices, int value);/* Return the value of the bit at indicies INDICIES. */int bow_bitvec_value (bow_bitvec *bv, int *indices);/* Free the memory held by the "bit vector" BV. */void bow_bitvec_free (bow_bitvec *bv);/* A convient interface to a specific instance of the above int/string   mapping; this one is intended for all the words encountered in all   documents. *//* Given a "word index" WI, return its WORD, according to the global   word-int mapping. */const char *bow_int2word (int wi);/* Given a WORD, return its "word index", WI, according to the global   word-int mapping; if it's not yet in the mapping, add it. */int bow_word2int (const char *word);/* Like bow_word2int(), except it also increments the occurrence count    associated with WORD. */int bow_word2int_add_occurrence (const char *word);/* If this is non-zero, then bow_word2int() will return -1 when asked   for the index of a word that is not already in the mapping.   Essentially, setting this to non-zero makes bow_word2int() and   bow_word2int_add_occurrence() behave like bow_str2int_no_add(). */extern int bow_word2int_do_not_add;/* Add to the word occurrence counts by recursively decending directory    DIRNAME and lexing all the text files; skip any files matching   EXCEPTION_NAME. */int bow_words_add_occurrences_from_text_dir (const char *dirname,					     const char *exception_name);/* Return the number of times bow_word2int_add_occurrence() was   called with the word whose index is WI. */int bow_words_occurrences_for_wi (int wi);/* Replace the current word/int mapping with MAP. */void bow_words_set_map (bow_int4str *map, int free_old_map);/* Modify the int/word mapping by removing all words that occurred    less than OCCUR number of times.  WARNING: This totally changes   the word/int mapping; any WV's, WI2DVF's or BARREL's you build   with the old mapping will have bogus WI's afterward. */void bow_words_remove_occurrences_less_than (int occur);/* Return the total number of unique words in the int/word map. */int bow_num_words ();/* Save the int/word map to file-pointer FP. */void bow_words_write (FILE *fp);/* Same as above, but with a filename instead of a FILE* */void bow_words_write_to_file (const char *filename);/* Read the int/word map from file-pointer FP. */void bow_words_read_from_fp (FILE *fp);/* Same as above, but with a filename instead of a FILE* */void bow_words_read_from_file (const char *filename);/* Same as above, but don't bother rereading unless filename is different   from the last one, or FORCE_UPDATE is non-zero. */void bow_words_reread_from_file (const char *filename, int force_update);/* Word vectors.  A "word vector" is sorted array of words, with count   information attached to each word.  Typically, there would be one   "word vector" associated with a document, or with a concept. *//* A "word entry"; these are the elements of a "word vector" */typedef struct _bow_we {  int wi;  int count;  float weight;} bow_we;/* A "word vector", containing an array of words with their statistics */typedef struct _bow_wv {  int num_entries;		/* the number of unique words in the vector */  float normalizer;		/* multiply weights by this for normalizing */  bow_we entry[0];} bow_wv;/* Create and return a new "word vector" from a file. */bow_wv *bow_wv_new_from_text_fp (FILE *fp);/* Create and return a new "word vector" from a string. */bow_wv *bow_wv_new_from_text_string (char *the_string);/* Create and return a new "word vector" from a document buffer LEX. */bow_wv *bow_wv_new_from_lex (bow_lex *lex);/* Create and return a new "word vector" that is the sum of all the   "word vectors" in WV_ARRAY.  The second parameter, WV_ARRAY_LENGTH,   is the number of "word vectors" in WV_ARRAY. */bow_wv *bow_wv_add (bow_wv **wv_array, int wv_array_length);/* Create and return a new "word vector" with uninitialized contents. */bow_wv *bow_wv_new (int capacity);/* Return a pointer to the "word entry" with index WI in "word vector WV */bow_we *bow_wv_entry_for_wi (bow_wv *wv, int wi);/* Return the count entry of "word" with index WI in "word vector" WV */int bow_wv_count_for_wi (bow_wv *wv, int wi);/* Print "word vector" WV on stream FP in a human-readable format. */void bow_wv_fprintf (FILE *fp, bow_wv *wv);/* Print "word vector" WV to a string in a human-readable format. */char *bow_wv_sprintf (bow_wv *wv, unsigned int max_size_for_string);/* Print "word vector"'s actual words to a string. */char *bow_wv_sprintf_words (bow_wv *wv, unsigned int max_size_for_string);/* Assign the values of the "word vector entry's" WEIGHT field   equal to the COUNT. */void bow_wv_set_weights_to_count (bow_wv *wv);/* Assign a value to the "word vector's" NORMALIZER field, such that   when all the weights in the vector are multiplied by the   NORMALIZER, the Euclidian length of the vector will be one. */void bow_wv_normalize_weights_by_vector_length (bow_wv *wv);/* Assign a value to the "word vector's" NORMALIZER field, such that   when all the weights in the vector are multiplied by the   NORMALIZER, all the vector entries will to one. */void bow_wv_normalize_weights_by_summing (bow_wv *wv);/* Return the number of bytes required for writing the "word vector" WV. */int bow_wv_write_size (bow_wv *wv);/* Write "word vector" DV to the stream FP. */void bow_wv_write (bow_wv *wv, FILE *fp);/* Return a new "word vector" read from a pointer into a data file, FP. */bow_wv *bow_wv_new_from_data_fp (FILE *fp);/* Free the memory held by the "word vector" WV. */void bow_wv_free (bow_wv *wv);/* Collections of "word vectors. *//* An array that maps "document indices" to "word vectors" */typedef struct _bow_di2wv {  int length;  int size;  bow_wv *entry[0];} bow_di2wv;/* Documents */  /* We want a nice way of saying this is a training or test document, or do   we ignore it for now. */typedef enum {model, test, ignore, ignored_model} bow_doc_type;/* A "document" entry useful for standard classification tasks. */typedef struct _bow_cdoc {  bow_doc_type type;		/* Is this document part of the model to be				   built, a test document, or to be ignored */  float normalizer;		/* Multiply weights by this for normalizing */  int word_count;		/* Total number of words in this document */  float prior;			/* Prior probability of this class/doc */  const char *filename;		/* Where to find the original document */  int class;			/* A classification label. */} bow_cdoc;/* A convenient interface to bow_array that is specific to bow_cdoc. */#define bow_cdocs bow_array#define bow_cdocs_new(CAPACITY) bow_array_new (CAPACITY, sizeof (bow_cdoc), 0)#define bow_cdocs_register_doc(CDOCS,CDOC) bow_array_append (CDOCS, CDOC)#define bow_cdocs_di2doc(CDOCS, DI) bow_array_entry_at_index (CDOCS, DI)/* Traversing directories to get filenames. *//* A list of document names. *//* xxx We might change this someday to allow for multiple documents   per file, e.g. for "mbox" files containing many email messages. */typedef struct _bow_doc_list {  struct _bow_doc_list *next;  char filename[0];} bow_doc_list;/* Return a non-zero value if the file FP contains mostly text. */int bow_fp_is_text (FILE *fp);/* Calls the function CALLBACK for each of the filenames encountered   when recursively descending the directory named DIRNAME.  CALLBACK   should be a pointer to function that takes a filename char-pointer,   and a void-pointer as arguments and returns an integer.  Currently   the return value is ignored, but it may be used in the future to   cut short, causing bow_map_filesnames_from_dir to return   immediately.  The value CONTEXT will be passed as the second   argument to the CALLBACK function; it provides you with a way to   transfer context you may need inside the implementation of the   callback function.  EXCLUDE_PATTERNS is currently ignored. */intbow_map_filenames_from_dir (int (*callback)(const char *filename, 					    void *context),			    void *context,			    const char *dirname,			    const char *exclude_patterns);/* Create a linked list of filenames, and append the file list pointed   to by FL to it; return the new concatenated lists in *FL.  The   function returns the total number of filenames.  When creating the   list, look for files (and recursively descend directories) among   those matching INCLUDE_PATTERNS, but don't include those matching   EXCLUDE_PATTERNS; don't include files that aren't text files. *//* xxx For now, this only works with a single directory name in   INCLUDE_PATTERNS, and it ignores EXCLUDE_PATTERNS. */int bow_doc_list_append (bow_doc_list **list, 			 const char *include_patterns,			 const char *exclude_patterns);/* Print the file list FL to the output stream FP. */void bow_doc_list_fprintf (FILE *fp, bow_doc_list *fl);/* Return the number of entries in the "docname list" DL. */int bow_doc_list_length (bow_doc_list *dl);/* Free the memory held by the file list FL. */void bow_doc_list_free (bow_doc_list *fl);/* A convient interface to a specific instance of the above int/string   mapping; this one is intended for all the documents encountered. *//* Given a "word index" WI, return its WORD, according to the global   word-int mapping. */const char *bow_int2docname (int wi);/* Given a WORD, return its "word index", WI, according to the global   word-int mapping; if it's not yet in the mapping, add it. */int bow_docname2int (const char *word);/* Return the total number of unique words in the int/word map. */int bow_num_docnames ();/* Save the docname map to file-pointer FP. */void bow_docnames_write (FILE *fp);/* Read the docname from file-pointer FP. */void bow_docnames_read_from_fp (FILE *fp);/* xxx Perhaps the name should be changed from "dv" to "cv", for   "class vector", or "concept vector", or "codebook vector". *//* Document vectors.  A "document vector" is a sorted array of   documents, with count information attached to each document.   Typically, there would be one "document vector" associated with a   word.  If "word vectors" are the rows of a large matrix, "document   vectors" are the columns.  It can be more efficient to search just   the docment vectors of the words in the query document, than it is   to search the word vectors of all documents. *//* A "document entry"; these are the elements of a "document vector". */typedef struct _bow_de {  int di;			/* a "document index" */  int count;			/* number of times X appears in document DI */  float weight;} bow_de;/* A "document vector" */ typedef struct _bow_dv {  int length;			/* xxx Rename this to num_entries */  int size;  float idf;                    /* The idf factor for this word. */  bow_de entry[0];} bow_dv;/* Create a new, empty "document vector". */bow_dv *bow_dv_new (int capacity);/* The default capacity used when 0 is passed for CAPACITY above. */extern unsigned int bow_dv_default_capacity; /* Add a new entry to the "document vector" *DV. */void bow_dv_add_di_count_weight (bow_dv **dv, int di, int count, float weight);/* Sum the WEIGHT into the document vector DV at document index DI,   creating a new entry in the document vector if necessary. */void bow_dv_add_di_weight (bow_dv **dv, int di, float weight);/* Return a pointer to the BOW_DE for a particular document, or return   NULL if there is no entry for that document. */bow_de *bow_dv_entry_at_di (bow_dv *dv, int di);/* Write "document vector" DV to the stream FP. */void bow_dv_write (bow_dv *dv, FILE *fp);/* Return the number of bytes required for writing the "document vector" DV. */int bow_dv_write_size (bow_dv *dv);/* Return a new "document vector" read from a pointer into a data file, FP. */bow_dv *bow_dv_new_from_data_fp (FILE *fp);/* Free the memory held by the "document vector" DV. */void bow_dv_free (bow_dv *dv);/* A "document vector with file info (file storage information)" */typedef struct _bow_dvf {  int seek_start;  bow_dv *dv;} bow_dvf;/* xxx Perhaps these should be generalized and renamed to `bow_i2v'? *//* An array that maps "word indices" to "document vectors with file info" */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -