📄 libbow.h
字号:
/* libbow.h - public declarations for the Bag-Of-Words Library, libbow.
   Copyright (C) 1997 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

/* Pronunciation guide: "libbow" rhymes with "lib-low", not "lib-cow". */

#ifndef __libbow_h_INCLUDE
#define __libbow_h_INCLUDE

/* These next two macros are automatically maintained by the Makefile,
   in conjunction with the file ./Version. */
#define BOW_MAJOR_VERSION 0
#define BOW_MINOR_VERSION 8
#define BOW_VERSION BOW_MAJOR_VERSION.BOW_MINOR_VERSION

#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <sys/types.h>          /* for netinet/in.h on SunOS */
#include <sys/stat.h>
#include <netinet/in.h>         /* for machine-independent byte-order */
#include <malloc.h>             /* for malloc() and friends. */
#include <stdlib.h>             /* For malloc() etc. on DEC Alpha */
#include <string.h>             /* for strlen() on DEC Alpha */
#include <limits.h>             /* for PATH_MAX and SHRT_MAX and friends */
#include <float.h>              /* for FLT_MAX and friends */
#include <unistd.h>             /* for SEEK_SET and friends on SunOS */

#if defined (Windows_NT) && OS == Windows_NT
#define htonl(a) (a)
#define htons(a) (a)
#define ntohl(a) (a)
#define ntohs(a) (a)
#endif

#ifndef PATH_MAX                /* for SunOS */
#define PATH_MAX 255
#endif

/* Fallbacks for SunOS.  Each macro is guarded on its own so that a
   definition already supplied by the system is never redefined. */
#ifndef SEEK_SET
#define SEEK_SET 0
#endif
#ifndef SEEK_CUR
#define SEEK_CUR 1
#endif
#ifndef SEEK_END
#define SEEK_END 2
#endif

/* Note that MIN and MAX evaluate one of their arguments twice; do not
   pass expressions with side effects. */
#if !defined(MIN)
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif

#if !defined(MAX)
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif

/* STRINGIFY macro-expands its argument before turning it into a string
   literal; XSTRINGIFY stringifies its argument as written. */
#ifndef STRINGIFY
#define STRINGIFY(s) XSTRINGIFY(s)
#define XSTRINGIFY(s) #s
#endif

typedef enum { bow_no, bow_yes } bow_boolean;


/* Lexing words from a file. */

#define BOW_MAX_WORD_LENGTH 1024

/* A structure for maintaining the context of a lexer.  (If you need
   to create a lexer that uses more context than this, define a new
   structure that includes this structure as its first element;
   BOW_LEX_GRAM, defined below is an example of this.)  */
typedef struct _bow_lex {
  char *document;
  int document_length;
  int document_position;
} bow_lex;

/* A lexer is represented by a pointer to a structure of this type. */
typedef struct _bow_lexer {
  int sizeof_lex;               /* The size of this structure */
  /* Pointers to functions for opening, closing and getting words. */
  bow_lex* (*open_text_fp) (struct _bow_lexer *self, FILE *fp);
  int (*get_word) (struct _bow_lexer *self, bow_lex *lex,
                   char *buf, int buflen);
  void (*close) (struct _bow_lexer *self, bow_lex *lex);
  /* How to recognize the beginning and end of a document. */
  const char *document_start_pattern;
  const char *document_end_pattern;
  /* NULL pattern means don't scan forward at all.
     "" pattern means scan forward to EOF. */
} bow_lexer;

/* This is an augmented version of BOW_LEXER that works for simple,
   context-free lexers. */
typedef struct _bow_lexer_simple {
  /* The basic lexer. */
  bow_lexer lexer;
  /* Parameters of the simple, context-free lexing. */
  int (*true_to_start)(int character);          /* non-zero on char to start */
  int (*false_to_end)(int character);           /* zero on char to end */
  int (*stoplist_func)(const char *);           /* one on token in stoplist */
  int (*stem_func)(char *);                     /* modify arg by stemming */
  int case_sensitive;                           /* boolean */
  int strip_non_alphas_from_end;                /* boolean */
  int toss_words_containing_non_alphas;         /* boolean */
  int toss_words_containing_this_many_digits;
  int toss_words_longer_than;
} bow_lexer_simple;

/* Get the raw token from the document buffer by scanning forward
   until we get a start character, and filling the buffer until we get
   an ending character.  The resulting token in the buffer is
   NULL-terminated.  Return the length of the token. */
int bow_lexer_simple_get_raw_word (bow_lexer_simple *self, bow_lex *lex,
                                   char *buf, int buflen);

/* Perform all the necessary postprocessing after the initial token
   boundaries have been found: strip non-alphas from end, toss words
   containing non-alphas, toss words containing a certain number of
   digits, toss words appearing in the stop list, stem the word, check
   the stoplist again, toss words of length one.  If the word is
   tossed, return zero, otherwise return the length of the word. */
int bow_lexer_simple_postprocess_word (bow_lexer_simple *self, bow_lex *lex,
                                       char *buf, int buflen);

/* Create and return a BOW_LEX, filling the document buffer from
   characters in FP, starting after the START_PATTERN, and ending with
   the END_PATTERN. */
bow_lex *bow_lexer_simple_open_text_fp (bow_lexer *self, FILE *fp);

/* Close the LEX buffer, freeing the memory held by it. */
void bow_lexer_simple_close (bow_lexer *self, bow_lex *lex);

/* Scan a single token from the LEX buffer, placing it in BUF, and
   returning the length of the token.  BUFLEN is the maximum number of
   characters that will fit in BUF.  If the token won't fit in BUF,
   an error is raised. */
int bow_lexer_simple_get_word (bow_lexer *self, bow_lex *lex,
                               char *buf, int buflen);


/* Here are some simple, ready-to-use lexers that are implemented in
   lex-simple.c */

/* A lexer that throws out all space-delimited strings that have any
   non-alphabetical characters.  For example, the string `obtained
   from http://www.cs.cmu.edu' will result in the tokens `obtained'
   and `from', but the URL will be skipped. */
extern const bow_lexer_simple *bow_alpha_only_lexer;

/* A lexer that keeps all alphabetic strings, delimited by
   non-alphabetic characters.  For example, the string
   `http://www.cs.cmu.edu' will result in the tokens `http', `www',
   `cs', `cmu', `edu'. */
extern const bow_lexer_simple *bow_alpha_lexer;

/* A lexer that keeps all strings that begin and end with alphabetic
   characters, delimited by white-space.  For example,
   the string `http://www.cs.cmu.edu' will be a single token. */
extern const bow_lexer_simple *bow_white_lexer;


/* Some declarations for a generic indirect lexer.  See lex-indirect.c */

typedef struct _bow_lexer_indirect {
  bow_lexer lexer;
  bow_lexer *underlying_lexer;
} bow_lexer_indirect;

/* Open the underlying lexer. */
bow_lex *bow_lexer_indirect_open_text_fp (bow_lexer *self, FILE *fp);

/* Close the underlying lexer. */
void bow_lexer_indirect_close (bow_lexer *self, bow_lex *lex);


/* Some declarations for a simple N-gram lexer.  See lex-gram.c */

/* An augmented version of BOW_LEXER that provides N-grams */
typedef struct _bow_lexer_gram {
  bow_lexer_indirect indirect_lexer;
  int gram_size;
} bow_lexer_gram;

/* An augmented version of BOW_LEX that works for N-grams */
typedef struct _bow_lex_gram {
  bow_lex lex;
  int gram_size_this_time;
} bow_lex_gram;

/* A lexer that returns N-gram tokens using BOW_ALPHA_ONLY_LEXER.
   It actually returns all 1-grams, 2-grams ... N-grams, where N is
   specified by GRAM_SIZE.  */
extern const bow_lexer_gram *bow_gram_lexer;


/* A lexer that ignores all HTML directives, ignoring all characters
   between angled brackets: < and >. */
extern const bow_lexer_indirect *bow_html_lexer;


/* An unsorted, NULL-terminated array of strings, indicating headers
   which should be removed from an e-mail/newsgroup message.  If this
   pointer is not NULL, the GET_WORD() function should be
   BOW_LEXER_EMAIL_GET_WORD. */
extern char **bow_email_headers_to_remove;

/* A lexer that removes all header lines whose header is one of those
   contained in HEADERS_TO_REMOVE */
extern const bow_lexer_indirect *bow_email_lexer;


/* The default lexer that will be used by various library functions
   like BOW_WV_NEW_FROM_TEXT_FP().  You should set this variable to
   point at whichever lexer you desire.  If you do not set it, it
   will point at bow_alpha_lexer. */
extern bow_lexer *bow_default_lexer;

/* Default instances of the lexers that can be modified by libbow's
   argp cmdline argument processing. */
extern bow_lexer_simple *bow_default_lexer_simple;
extern bow_lexer_indirect *bow_default_lexer_indirect;
extern bow_lexer_gram *bow_default_lexer_gram;
extern bow_lexer_indirect *bow_default_lexer_html;
extern bow_lexer_indirect *bow_default_lexer_email;


/* Functions that may be useful in writing a lexer. */

/* Apply the Porter stemming algorithm to modify WORD.  Return 0 on
   success. */
int bow_stem_porter (char *word);

/* A function wrapper around POSIX's `isalpha' macro. */
int bow_isalpha (int character);

/* A function wrapper around POSIX's `isgraph' macro. */
int bow_isgraph (int character);

/* Return non-zero if WORD is on the stoplist. */
int bow_stoplist_present (const char *word);

/* Add to the stoplist the white-space delineated words from FILENAME.
   Return the number of words added.  If the file could not be opened,
   return -1. */
int bow_stoplist_add_from_file (const char *filename);

/* Add WORD to the stop list. */
void bow_stoplist_add_word (const char *word);


/* Arrays of C struct's that can grow. */

typedef struct _bow_array {
  int length;                   /* number of elements in the array */
  int size;                     /* number of elts for which alloc'ed space */
  int entry_size;               /* number of bytes in each entry */
  void (*free_func)(void*);     /* call this with each entry when freeing */
  int growth_factor;            /* mult, then divide by 1-this when realloc */
  void *entries;                /* the malloc'ed space for the entries */
} bow_array;

extern int bow_array_default_capacity;
extern int bow_array_default_growth_factor;

/* NOTE(review): the FREE_FUNC parameters below are declared with an
   empty, unprototyped parameter list, although the `free_func' member
   of BOW_ARRAY takes a `void*'.  They are left unchanged so that
   existing callers keep compiling. */

/* Allocate, initialize and return a new array structure. */
bow_array *bow_array_new (int capacity, int entry_size, void (*free_func)());

/* Initialize an already allocated array structure. */
void bow_array_init (bow_array *array, int capacity,
                     int entry_size, void (*free_func)());

/* Append an entry to the array.  Return its index. */
int bow_array_append (bow_array *array, void *entry);

/* Return a pointer to the array entry at index INDEX. */
void *bow_array_entry_at_index (bow_array *array, int index);

/* Write the array ARRAY to the file-pointer FP, using the function
   WRITE_FUNC to write each of the entries in ARRAY. */
void bow_array_write (bow_array *array, int (*write_func)(void*,FILE*),
                      FILE *fp);

/* Return a new array, created by reading file-pointer FP, and using
   the function READ_FUNC to read each of the array entries.  The
   returned array will have entry-freeing-function FREE_FUNC. */
bow_array *
bow_array_new_from_data_fp (int (*read_func)(void*,FILE*),
                            void (*free_func)(),
                            FILE *fp);

/* Free the memory held by the array ARRAY. */
void bow_array_free (bow_array *array);


/* Managing int->string and string->int mappings. */

typedef struct _bow_int4str {
  const char **str_array;
  int str_array_length;
  int str_array_size;
  int *str_hash;
  int str_hash_size;
} bow_int4str;

/* Allocate, initialize and return a new int/string mapping
   structure.  The parameter CAPACITY is used as a hint about the
   number of words to expect; if you don't know or don't care about a
   CAPACITY value, pass 0, and a default value will be used. */
bow_int4str *bow_int4str_new (int capacity);

/* Given an integer INDEX, return its corresponding string. */
const char *bow_int2str (bow_int4str *map, int index);

/* Given the char-pointer STRING, return its integer index.  If this is
   the first time we're seeing STRING, add it to the mapping, assign
   it a new index, and return the new index. */
int bow_str2int (bow_int4str *map, const char *string);

/* Given the char-pointer STRING, return its integer index.  If STRING
   is not yet in the mapping, return -1. */
int bow_str2int_no_add (bow_int4str *map, const char *string);

/* Create a new int-str mapping by lexing words from FILE. */
bow_int4str *bow_int4str_new_from_text_file (const char *filename);

/* Write the int-str mapping to file-pointer FP. */
void bow_int4str_write (bow_int4str *map, FILE *fp);

/* Return a new int-str mapping, created by reading file-pointer FP. */
bow_int4str *bow_int4str_new_from_fp (FILE *fp);

/* Return a new int-str mapping, created by reading FILENAME. */
bow_int4str *bow_int4str_new_from_file (const char *filename);

/* Free the memory held by the int-word mapping MAP. */
void bow_int4str_free (bow_int4str *map);


/* Arrays of C struct's that can grow.  Entries can be retrieved
   either by integer index, or by string key. */

typedef struct _bow_sarray {
  bow_array *array;
  bow_int4str *i4k;
} bow_sarray;

extern int bow_sarray_default_capacity;

/* Allocate, initialize and return a new sarray structure. */
bow_sarray *bow_sarray_new (int capacity, int entry_size,
                            void (*free_func)());

/* Initialize a newly allocated sarray structure. */
void bow_sarray_init (bow_sarray *sa, int capacity,
                      int entry_size, void (*free_func)());

/* Append a new entry to the array.  Also make the entry accessible by
   the string KEYSTR.  Returns the index of the new entry. */
int bow_sarray_add_entry_with_keystr (bow_sarray *sa, void *entry,
                                      const char *keystr);

#endif /* __libbow_h_INCLUDE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -