📄 libbow.h
字号:
/* libbow.h - public declarations for the Bag-Of-Words Library, libbow.
   Copyright (C) 1997, 1998, 1999, 2000 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

/* Pronunciation guide: "libbow" rhymes with "lib-low", not "lib-cow". */

#ifndef __libbow_h_INCLUDE
#define __libbow_h_INCLUDE

/* These next two macros are automatically maintained by the Makefile,
   in conjunction with the file ./Version. */
#define BOW_MAJOR_VERSION 1
#define BOW_MINOR_VERSION 0
#define BOW_VERSION BOW_MAJOR_VERSION.BOW_MINOR_VERSION

/* Request 64-bit file offsets.  This is deliberately placed ahead of
   the system headers included below. */
#define _FILE_OFFSET_BITS 64

#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <sys/types.h>		/* for netinet/in.h on SunOS */
#include <sys/stat.h>
#include <netinet/in.h>		/* for machine-independent byte-order */
#include <malloc.h>		/* for malloc() and friends. */
#include <stdlib.h>             /* For malloc() etc. on DEC Alpha */
#include <string.h>		/* for strlen() on DEC Alpha */
#include <limits.h>		/* for PATH_MAX and SHRT_MAX and friends */
#include <float.h>		/* for FLT_MAX and friends */
#include <unistd.h>		/* for SEEK_SET and friends on SunOS */

#if BOW_MCHECK
#include <mcheck.h>
#endif /* BOW_MCHECK */

/* On Windows NT the byte-order conversions are defined as identity
   macros. */
#if defined (Windows_NT) && OS == Windows_NT
#define htonl(a) (a)
#define htons(a) (a)
#define ntohl(a) (a)
#define ntohs(a) (a)
#endif

/* On Linux, replace the standard assert() with one that reports the
   failure through bow_error().  bow_error is not declared in this
   portion of the header; it must be visible wherever assert() is
   expanded. */
#ifdef __linux__
#undef assert
#define assert(expr) \
((void) ((expr) || (bow_error ("Assertion failed %s:%d:" __STRING(expr), __FILE__, __LINE__), NULL)))
#endif

#if !(HAVE_SRANDOM && HAVE_RANDOM) /* for SunOS */
#undef srandom
#define srandom srand
#undef random
#define random rand
#endif

#ifndef HAVE_STRCHR
#define strchr index
#endif

#ifndef HAVE_STRRCHR
#define strrchr rindex
#endif

#if !PATH_MAX			/* for SunOS */
#define PATH_MAX 255
#endif

#if !defined SEEK_SET || !defined(SEEK_CUR) /* for SunOS */
#define SEEK_SET 0
#define SEEK_CUR 1
#define SEEK_END 2
#endif

/* NOTE: MIN, MAX and ABS evaluate their arguments more than once, so
   do not pass expressions with side effects (e.g. `i++'). */
#if !defined(MIN)
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif

#if !defined(MAX)
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif

#if !defined(ABS)
#define ABS(a) (((a) < 0) ? -(a) : (a))
#endif

/* STRINGIFY(s) turns the macro-expanded value of S into a string
   literal; the extra level of indirection through XSTRINGIFY is what
   causes S to be expanded first. */
#ifndef STRINGIFY
#define STRINGIFY(s) XSTRINGIFY(s)
#define XSTRINGIFY(s) #s
#endif

typedef enum { bow_no, bow_yes } bow_boolean;

/* A set of call-backs for iterating over a collection of doubles. */
typedef struct _bow_iterator_double {
  /* Move to the first item in the collection (the optional row
     indicates which collection). */
  void (*reset) (int row_index, void *context);
  /* Move to next item in collection.  Return zero at end. */
  int (*advance)(void *context);
  /* Index at current item, not necessarily contiguous.  Returns
     INT_MIN when invalid. */
  int (*index) (void *context);
  /* Value at current item. */
  double (*value) (void *context);
  /* Collection-specific context */
  void *context;
} bow_iterator_double;


/* Lexing words from a file. */

#define BOW_MAX_WORD_LENGTH 4096

/* A structure for maintaining the context of a lexer.  (If you need
   to create a lexer that uses more context than this, define a new
   structure that includes this structure as its first element;
   BOW_LEX_GRAM, defined below, is an example of this.)  */
typedef struct _bow_lex {
  char *document;
  int document_length;
  int document_position;
} bow_lex;

/* A lexer is represented by a pointer to a structure of this type. */
typedef struct _bow_lexer {
  int sizeof_lex;		/* The size of the bow_lex (or subclass) */
  struct _bow_lexer *next;	/* The next lexer in the "pipe-like" chain */
  /* Pointers to functions for opening, closing and getting words. */
  bow_lex* (*open_text_fp) (struct _bow_lexer *self, FILE *fp,
			    const char *filename);
  bow_lex* (*open_str) (struct _bow_lexer *self, char *buf);
  int (*get_word) (struct _bow_lexer *self, bow_lex *lex,
		   char *buf, int buflen);
  int (*get_raw_word) (struct _bow_lexer *self, bow_lex *lex,
		       char *buf, int buflen);
  int (*postprocess_word) (struct _bow_lexer *self, bow_lex *lex,
			   char *buf, int buflen);
  void (*close) (struct _bow_lexer *self, bow_lex *lex);
} bow_lexer;

/* Lexer global variables.  Default values are in lex-simple.c */

/* How to recognize the beginning and end of a document.  NULL pattern
   means don't scan forward at all.  "" means scan forward to EOF. */
extern const char *bow_lexer_document_start_pattern;
extern const char *bow_lexer_document_end_pattern;
/* NOTE(review): declared as a pointer to int, unlike the plain-int
   settings below -- confirm against the definition in lex-simple.c. */
extern int *bow_lexer_case_sensitive;
extern int (*bow_lexer_stoplist_func)(const char *);
extern int (*bow_lexer_stem_func)(char *);
extern int bow_lexer_toss_words_longer_than;
extern int bow_lexer_toss_words_shorter_than;
extern char *bow_lexer_infix_separator;
extern int bow_lexer_infix_length;
extern int bow_lexer_max_num_words_per_document;

/* The parameters that control lexing.  Many of these may be changed
   with command-line options. */
typedef struct _bow_lexer_parameters {
  int (*true_to_start)(int character);          /* non-zero on char to start */
  int (*false_to_end)(int character);           /* zero on char to end */
  int strip_non_alphas_from_end;                /* boolean */
  int toss_words_containing_non_alphas;	        /* boolean */
  int toss_words_containing_this_many_digits;
} bow_lexer_parameters;

/* Get the raw token from the document buffer by scanning forward
   until we get a start character, and filling the buffer until we get
   an ending character.  The resulting token in the buffer is
   NULL-terminated.  Return the length of the token. */
int bow_lexer_simple_get_raw_word (bow_lexer *self, bow_lex *lex,
				   char *buf, int buflen);

/* Perform all the necessary postprocessing after the initial token
   boundaries have been found: strip non-alphas from end, toss words
   containing non-alphas, toss words containing a certain number of
   digits, toss words appearing in the stop list, stem the word, check
   the stoplist again, toss words of length one.  If the word is
   tossed, return zero, otherwise return the length of the word. */
int bow_lexer_simple_postprocess_word (bow_lexer *self, bow_lex *lex,
				       char *buf, int buflen);

/* Create and return a BOW_LEX, filling the document buffer from
   characters in FP, starting after the START_PATTERN, and ending with
   the END_PATTERN. */
bow_lex *bow_lexer_simple_open_text_fp (bow_lexer *self, FILE *fp,
					const char *filename);

/* Create and return a BOW_LEX, filling the document buffer from
   characters in BUF, starting after the START_PATTERN, and ending with
   the END_PATTERN.  NOTE: BUF is not modified, and it does not need to
   be saved for future use. */
bow_lex *bow_lexer_simple_open_str (bow_lexer *self, char *buf);

/* Close the LEX buffer, freeing the memory held by it. */
void bow_lexer_simple_close (bow_lexer *self, bow_lex *lex);

/* Scan a single token from the LEX buffer, placing it in BUF, and
   returning the length of the token.  BUFLEN is the maximum number of
   characters that will fit in BUF.  If the token won't fit in BUF,
   an error is raised. */
int bow_lexer_simple_get_word (bow_lexer *self, bow_lex *lex,
			       char *buf, int buflen);


/* Here are some simple, ready-to-use lexers that are implemented in
   lex-simple.c */

extern const bow_lexer *bow_simple_lexer;

/* A lexer that throws out all space-delimited strings that have any
   non-alphabetical characters.  For example, the string `obtained
   from http://www.cs.cmu.edu' will result in the tokens `obtained'
   and `from', but the URL will be skipped. */
extern const bow_lexer_parameters *bow_alpha_only_lexer_parameters;

/* A lexer that keeps all alphabetic strings, delimited by
   non-alphabetic characters.  For example, the string
   `http://www.cs.cmu.edu' will result in the tokens `http', `www',
   `cs', `cmu', `edu'. */
extern const bow_lexer_parameters *bow_alpha_lexer_parameters;

/* A lexer that keeps all alphanumeric strings, delimited by
   non-alphanumeric characters.  For example, the string
   `http://www.cs.cmu.edu:8080' will result in the tokens `http',
   `www', `cs', `cmu', `edu', `8080'. */
extern const bow_lexer_parameters *bow_alphanum_lexer_parameters;

/* A lexer that keeps all strings that begin and end with alphabetic
   characters, delimited by white-space.  For example,
   the string `http://www.cs.cmu.edu' will be a single token.
   This does not change the words at all---no down-casing, no stemming,
   no stoplist, no word tossing.  It's ideal for use when a
   --lex-pipe-command is used to do all the tokenizing.  */
extern const bow_lexer_parameters *bow_white_lexer_parameters;

/* A lexer that prepends all tokens by the `Date:' string
   at the beginning of the line. */
extern const bow_lexer *bow_suffixing_lexer;


/* Call-back functions that just call the next lexer. */

/* Open using the next lexer. */
bow_lex *bow_lexer_next_open_text_fp (bow_lexer *self, FILE *fp,
				      const char *filename);

/* Open using the next lexer from a string. */
bow_lex *bow_lexer_next_open_str (bow_lexer *self, char *buf);

/* Get a word using the next lexer */
int bow_lexer_next_get_word (bow_lexer *self, bow_lex *lex,
			     char *buf, int buflen);

/* Get a raw word using the next lexer */
int bow_lexer_next_get_raw_word (bow_lexer *self, bow_lex *lex,
				 char *buf, int buflen);

/* Postprocess a word using the next lexer */
int bow_lexer_next_postprocess_word (bow_lexer *self, bow_lex *lex,
				     char *buf, int buflen);

/* Close the underlying lexer. */
void bow_lexer_next_close (bow_lexer *self, bow_lex *lex);


/* Some declarations for a simple N-gram lexer.  See lex-gram.c */

/* An augmented version of BOW_LEXER that provides N-grams */
typedef struct _bow_lexer_gram {
  bow_lexer lexer;
  int gram_size;
} bow_lexer_gram;

/* An augmented version of BOW_LEX that works for N-grams */
typedef struct _bow_lex_gram {
  bow_lex lex;
  int gram_size_this_time;
} bow_lex_gram;

/* A lexer that returns N-gram tokens using BOW_ALPHA_ONLY_LEXER.
   It actually returns all 1-grams, 2-grams ... N-grams, where N is
   specified by GRAM_SIZE.  */
extern const bow_lexer_gram *bow_gram_lexer;


/* A lexer that ignores all HTML directives, ignoring all characters
   between angled brackets: < and >. */
extern const bow_lexer *bow_html_lexer;


/* An unsorted, NULL-terminated array of strings, indicating headers
   which should be removed from an e-mail/newsgroup message.  If this
   pointer is not NULL, the GET_WORD() function should be
   BOW_LEXER_EMAIL_GET_WORD. */
extern char **bow_email_headers_to_remove;

/* A lexer that removes all header lines which is one of the headers
   contained in HEADERS_TO_REMOVE */
extern const bow_lexer *bow_email_lexer;


/* The default lexer that will be used by various library functions
   like BOW_WV_NEW_FROM_TEXT_FP().  You should set this variable to
   point at whichever lexer you desire.  If you do not set it, it
   will point at bow_alpha_lexer. */
extern bow_lexer *bow_default_lexer;
extern bow_lexer_parameters *bow_default_lexer_parameters;


/* Functions that may be useful in writing a lexer. */

/* Apply the Porter stemming algorithm to modify WORD.  Return 0 on
   success. */
int bow_stem_porter (char *word);

/* A function wrapper around POSIX's `isalpha' macro. */
int bow_isalpha (int character);

/* A function wrapper around POSIX's `isgraph' macro. */
int bow_isgraph (int character);

/* Return non-zero if WORD is on the stoplist. */
int bow_stoplist_present (const char *word);

/* Return non-zero if WORD is on the stoplist, where HASH corresponds
   to int4str.c:_str2id */
int bow_stoplist_present_hash (const char *word, unsigned hash);

/* Add to the stoplist the white-space delineated words from FILENAME.
   Return the number of words added.  If the file could not be opened,
   return -1. */
int bow_stoplist_add_from_file (const char *filename);

/* Empty the default stoplist, and add space-delimited words from
   FILENAME. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -