📄 lex-simple.c
  bow_free (lex->document);
  bow_free (lex);
}

/* Get the raw token from the document buffer by scanning forward until we get
   a start character, and filling the buffer until we get an ending character.
   The resulting token in the buffer is NULL-terminated.  Return the length of
   the token. */
int
old_bow_lexer_simple_get_raw_word (bow_lexer *self, bow_lex *lex,
                                   char *buf, int buflen)
{
  int byte;                     /* characters read from the FP */
  int wordlen;                  /* number of characters in the word so far */

  assert (lex->document_position <= lex->document_length);

  /* Ignore characters until we get a beginning character. */
  do
    {
      byte = (unsigned char) lex->document[lex->document_position++];
      if (byte == 0)
        {
          lex->document_position--;
          return 0;
        }
    }
  while (! PARAMS->true_to_start (byte));

  /* Add the first alphabetic character to the word. */
  buf[0] = (bow_lexer_case_sensitive) ? byte : tolower (byte);

  /* Add all the following satisfying characters to the word. */
  for (wordlen = 1; wordlen < buflen; wordlen++)
    {
      byte = (unsigned char) lex->document[lex->document_position++];
      if (byte == 0)
        break;
      if (! PARAMS->false_to_end (byte))
        break;
      buf[wordlen] = tolower (byte);
    }

  /* Back up to point at the character that caused the end of the word. */
  lex->document_position--;
  assert (lex->document_position <= lex->document_length);

  if (wordlen >= buflen)
    bow_error ("Encountered word longer than buffer length=%d", buflen);

  /* Terminate it. */
  buf[wordlen] = '\0';

  return wordlen;
}

int
bow_lexer_simple_get_raw_word (bow_lexer *self, bow_lex *lex,
                               char *buf, int buflen)
{
  int wordlen;                  /* number of characters in the word so far */
  const char *docptr;
  const char *word_start;

  docptr = lex->document + lex->document_position;

  /* Ignore characters until we get a beginning character. */
  while (*docptr && !isalpha ((unsigned char) *docptr))
    docptr++;
  if (*docptr == '\0')
    return 0;
  word_start = docptr;

  /* Add alphabetics to the word.  Note that the length check below happens
     after the copy, so BUF must be large enough for the longest raw word in
     the document. */
  do
    *buf++ = tolower ((unsigned char) *docptr++);
  while (isalpha ((unsigned char) *docptr));

  /* Now DOCPTR is pointing at the non-alpha immediately after the word.
     Adjust the LEX's pointer into the document for the next word. */
  lex->document_position = docptr - lex->document;
  wordlen = docptr - word_start;

  if (wordlen >= buflen)
    bow_error ("Encountered word longer than buffer length=%d", buflen);

  /* Terminate it. */
  *buf = '\0';
  return wordlen;
}

/* Perform all the necessary postprocessing after the initial token boundaries
   have been found: strip non-alphas from the end, toss words containing
   non-alphas, toss words containing a certain number of digits, toss words
   appearing in the stop list, stem the word, check the stoplist again, toss
   words of length one.  If the word is tossed, return zero; otherwise return
   the length of the word. */
int
bow_lexer_simple_postprocess_word (bow_lexer *self, bow_lex *lex,
                                   char *buf, int buflen)
{
  int wordlen = strlen (buf);

  if (wordlen > bow_lexer_toss_words_longer_than
      /* Toss words shorter than 2 characters. */
      || buf[1] == '\0'
      || (bow_lexer_stoplist_func && bow_lexer_stoplist_func (buf)))
    return 0;

  /* Return the length of the word we found. */
  return wordlen;
}

int
old_bow_lexer_simple_postprocess_word (bow_lexer *self, bow_lex *lex,
                                       char *buf, int buflen)
{
  int wordlen = strlen (buf);

  /* This is done at the end anyway, so it should be slightly faster to skip
     this step, assuming that often none of the PARAM tests below are true. */
#if 0
  /* Toss words that are longer than bow_lexer_toss_words_longer_than. */
  if (bow_lexer_toss_words_longer_than)
    {
      if (wordlen > bow_lexer_toss_words_longer_than)
        return 0;
    }
#endif

  if (PARAMS->strip_non_alphas_from_end)
    {
      /* Strip any non-alphabetic characters off the end of the word. */
      while (wordlen && !isalpha ((unsigned char) buf[wordlen-1]))
        wordlen--;
      /* Terminate it. */
      buf[wordlen] = '\0';
      if (wordlen == 0)
        return 0;
    }

  if (PARAMS->toss_words_containing_non_alphas)
    {
      /* If the word contains any non-alphabetic characters, get another
         word instead. */
      char *bufp;
      for (bufp = buf; *bufp; bufp++)
        {
          if (!isalpha ((unsigned char) *bufp))
            return 0;
        }
    }

  /* If the word contains TOSS_WORDS_CONTAINING_THIS_MANY_DIGITS digits, get
     another word instead.  (Here the variable BYTE holds the count of the
     number of digits.) */
  if (PARAMS->toss_words_containing_this_many_digits)
    {
      int byte;
      char *bufp;
      for (bufp = buf, byte = 0; *bufp; bufp++)
        {
          if (isdigit ((unsigned char) *bufp))
            if (++byte > PARAMS->toss_words_containing_this_many_digits)
              return 0;
        }
    }

  if (bow_lexer_stoplist_func && bow_lexer_stoplist_func (buf))
    return 0;

  /* Apply the stemming algorithm to the word. */
  if (bow_lexer_stem_func)
    {
      bow_lexer_stem_func (buf);
      /* If the result of stemming is on the stoplist, go back and start
         again. */
      if (bow_lexer_stoplist_func && bow_lexer_stoplist_func (buf))
        return 0;
      wordlen = strlen (buf);
    }

  /* If the result of stemming is too short or too long, go back and start
     again. */
  if (wordlen < bow_lexer_toss_words_shorter_than
      || wordlen > bow_lexer_toss_words_longer_than)
    return 0;

  if (bow_xxx_words_only && strstr (buf, "titlexxx") == NULL)
    return 0;

  bow_lexer_num_words_in_document++;
  if (bow_lexer_max_num_words_per_document
      && (bow_lexer_num_words_in_document
          > bow_lexer_max_num_words_per_document))
    return 0;

  /* Return the length of the word we found. */
  return wordlen;
}

/* Scan a single token from the LEX buffer, placing it in BUF, and returning
   the length of the token.  BUFLEN is the maximum number of characters that
   will fit in BUF.  If the token won't fit in BUF, an error is raised. */
int
bow_lexer_simple_get_word (bow_lexer *self, bow_lex *lex,
                           char *buf, int buflen)
{
  int wordlen;                  /* number of characters in the word so far */

  do
    {
      wordlen = self->get_raw_word (self, lex, buf, buflen);
      if (wordlen == 0)
        return 0;
    }
  while ((wordlen = self->postprocess_word (self, lex, buf, buflen)) == 0);

  return wordlen;
}

/* A function wrapper around POSIX's `isalpha' macro. */
int
bow_isalpha (int character)
{
  return isalpha (character);
}

/* A function wrapper around POSIX's `isalnum' macro. */
int
bow_isalphanum (int character)
{
  return isalnum (character);
}

/* A function wrapper around POSIX's `isgraph' macro. */
int
bow_isgraph (int character)
{
  return isgraph (character);
}

int
bow_not_isspace (int character)
{
  return (! isspace (character));
}

const bow_lexer _bow_simple_lexer =
{
  sizeof (bow_lex),
  NULL,
  bow_lexer_simple_open_text_fp,
  bow_lexer_simple_open_str,
  bow_lexer_simple_get_word,
  bow_lexer_simple_get_raw_word,
  bow_lexer_simple_postprocess_word,
  bow_lexer_simple_close
};
const bow_lexer *bow_simple_lexer = &_bow_simple_lexer;

/* A lexer that keeps all alphabetic strings, delimited by non-alphabetic
   characters.  For example, the string `http://www.cs.cmu.edu' will result in
   the tokens `http', `www', `cs', `cmu', `edu'. */
const bow_lexer_parameters _bow_alpha_lexer_parameters =
{
  bow_isalpha,                  /* begin words with an alphabetic char */
  bow_isalpha,                  /* end words at any non-alphabetic char */
  NO,                           /* don't strip non-alphas from end */
  NO,                           /* don't toss words w/ non-alphas */
  0,                            /* don't toss words with digits */
};
const bow_lexer_parameters *bow_alpha_lexer_parameters =
  &_bow_alpha_lexer_parameters;

/* A lexer that keeps all alphanumeric strings, delimited by non-alphanumeric
   characters.  For example, the string `http://www.cs.cmu.edu:8080' will
   result in the tokens `http', `www', `cs', `cmu', `edu', `8080'. */
const bow_lexer_parameters _bow_alphanum_lexer_parameters =
{
  bow_isalphanum,               /* begin words with an alphanumeric char */
  bow_isalphanum,               /* end words at any non-alphanumeric char */
  NO,                           /* don't strip non-alphas from end */
  NO,                           /* don't toss words w/ non-alphas */
  0,                            /* don't toss words with digits */
};
const bow_lexer_parameters *bow_alphanum_lexer_parameters =
  &_bow_alphanum_lexer_parameters;

/* A lexer that throws out all space-delimited strings that have any
   non-alphabetical characters.  For example, the string `obtained from
   http://www.cs.cmu.edu' will result in the tokens `obtained' and `from', but
   the URL will be skipped. */
const bow_lexer_parameters _bow_alpha_only_lexer_parameters =
{
  bow_isalpha,                  /* begin words with an alphabetic char */
  bow_isgraph,                  /* end words at any non-graphic char */
  YES,                          /* strip non-alphas from end */
  YES,                          /* toss words w/ non-alphas */
  3,                            /* toss words with 3 digits */
};
const bow_lexer_parameters *bow_alpha_only_lexer_parameters =
  &_bow_alpha_only_lexer_parameters;

/* A lexer that keeps all strings that begin and end with alphabetic
   characters, delimited by white-space.  For example, the string
   `http://www.cs.cmu.edu' will be a single token.  This does not change the
   words at all---no down-casing, no stemming, no stoplist, no word tossing.
   It's ideal for use when a --lex-pipe-command is used to do all the
   tokenizing. */
const bow_lexer_parameters _bow_white_lexer_parameters =
{
  bow_not_isspace,              /* begin words with any non-whitespace */
  bow_not_isspace,              /* end words at whitespace */
  NO,                           /* don't strip non-alphas from end */
  NO,                           /* don't toss words w/ non-alphas */
  99,                           /* toss words with 99 digits */
};
const bow_lexer_parameters *bow_white_lexer_parameters =
  &_bow_white_lexer_parameters;
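
/* Usage sketch (illustrative only, not part of the original file): how a
   caller might drive the simple lexer defined above to tokenize an in-memory
   string.  This assumes the bow_lexer struct exposes function-pointer members
   named open_str, get_word, and close in the order shown in the
   _bow_simple_lexer initializer; the code in this file only confirms the
   get_raw_word and postprocess_word members, so the other names are
   assumptions.  Guarded with #if 0 so it never affects the build. */
#if 0
#include <stdio.h>

static void
example_tokenize_string (void)
{
  char buf[1024];               /* arbitrary buffer size for this sketch */
  const bow_lexer *lexer = bow_simple_lexer;

  /* Assumed member name: open_str creates a bow_lex over the string. */
  bow_lex *lex = lexer->open_str ((bow_lexer *) lexer,
                                  "Obtained from http://www.cs.cmu.edu");
  if (lex == NULL)
    return;

  /* Assumed member name: get_word dispatches to bow_lexer_simple_get_word,
     which loops get_raw_word + postprocess_word until a token survives the
     filters or the document is exhausted. */
  while (lexer->get_word ((bow_lexer *) lexer, lex, buf,
                          (int) sizeof (buf)) > 0)
    printf ("token: %s\n", buf);        /* e.g. `obtained', `from', `http', ... */

  /* Assumed member name: close frees lex->document and lex, as shown at the
     top of this file. */
  lexer->close ((bow_lexer *) lexer, lex);
}
#endif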