📄 lex-suffixing.c
字号:
#include <bow/libbow.h>
#include <ctype.h>

/* bow_default_lexer_suffixing should be an indirect lexer, with a
   simple lexer as its underlying lexer.  However, I got lazy, and it
   should be considered that the bow_default_lexer_simple is always
   the underlying lexer. */

/* When non-zero, bow_lexer_suffixing_postprocess_word() rewinds to the
   start of the document at the end of the header section and lexes the
   headers a second time, this time without appending the suffix. */
#define HEADER_TWICE 1

/* Non-zero while the lexer is still inside the header section. */
static int suffixing_doing_headers;

/* Non-zero while words get the current header suffix appended (the
   first pass over the headers). */
static int suffixing_appending_headers;

/* The current suffix: "xxx" followed by the lowercased letters of the
   header name, NUL-terminated. */
static char suffixing_suffix[BOW_MAX_WORD_LENGTH];

/* Number of characters stored in SUFFIXING_SUFFIX by the last call to
   suffixing_snarf_suffix().  It is not updated when the suffix is
   emptied elsewhere, so use strlen() for the live length. */
static int suffixing_suffix_length;

int bow_lexer_html_get_raw_word (bow_lexer *self, bow_lex *lex,
                                 char *buf, int buflen);

/* Replace the first '\n' at or after LEX's current position with a
   '\0'.  The scan also stops at an existing '\0' (lines already
   terminated by an earlier pass, see HEADER_TWICE) and at the end of
   the document, in which case nothing is written. */
static void
suffixing_terminate_line (bow_lex *lex)
{
  int end = lex->document_position;

  while (end < lex->document_length
         && lex->document[end] != '\n'
         && lex->document[end] != '\0')
    end++;

  if (end < lex->document_length)
    lex->document[end] = '\0';
}

/* Put the string before the ':' into SUFFIXING_SUFFIX, and replace the
   following newline with a '\0'.

   The suffix is "xxx" followed by the lowercased alphabetic characters
   found between LEX's current position and the next ':'; non-letters
   are skipped.  LEX's position is left on the ':' (or at the end of the
   document if there is no ':').  A header name that does not fit in
   SUFFIXING_SUFFIX trips an assertion; with assertions disabled it is
   truncated instead of overflowing the buffer. */
static void
suffixing_snarf_suffix (bow_lex *lex)
{
  /* Hack to get arrow to work on 23k+ research paper index: if the
     document does not start with a letter, leave everything alone.
     Note that this tests the first character of the document, not the
     character at the current position. */
  if (lex->document_length <= 0
      || !isalpha ((unsigned char) lex->document[0]))
    return;

  suffixing_suffix[0] = 'x';
  suffixing_suffix[1] = 'x';
  suffixing_suffix[2] = 'x';
  suffixing_suffix_length = 3;

  /* Put characters into the suffix until we get to the colon.  The
     scan is bounded by the document length so that input without a
     colon cannot run off the end of the buffer. */
  while (lex->document_position < lex->document_length
         && lex->document[lex->document_position] != ':')
    {
      /* <ctype.h> functions require an unsigned char value. */
      unsigned char c
        = (unsigned char) lex->document[lex->document_position];

      /* The header name is expected to sit on a single line. */
      assert (c != '\n');
      lex->document_position++;

      if (!isalpha (c))
        continue;

      /* Check for room (letter plus terminating NUL) before writing. */
      assert (suffixing_suffix_length < BOW_MAX_WORD_LENGTH - 1);
      if (suffixing_suffix_length < BOW_MAX_WORD_LENGTH - 1)
        suffixing_suffix[suffixing_suffix_length++] = (char) tolower (c);
    }
  suffixing_suffix[suffixing_suffix_length] = '\0';

  /* Terminate the rest of the line so it is lexed as its own string. */
  suffixing_terminate_line (lex);
}

/* Open a lex with bow_lexer_simple_open_text_fp() and prepare it for
   header suffixing.  Returns 0 if the underlying open returns 0, or if
   a non-alphanumeric character occurs before the first ':' of the
   document; otherwise returns the lex with the first header's suffix
   already snarfed. */
bow_lex *
bow_lexer_suffixing_open_text_fp (bow_lexer *self,
                                  FILE *fp,
                                  const char *filename)
{
  bow_lex *ret;
  int i;

  ret = bow_lexer_simple_open_text_fp (self, fp, filename);
  if (!ret)
    return ret;

  /* Make sure that the first line has a header-type suffix.  A newline
     is not alphanumeric, so it is rejected by the same test. */
  for (i = 0; i < ret->document_length && ret->document[i] != ':'; i++)
    {
      if (!isalnum ((unsigned char) ret->document[i]))
        {
          /* NOTE(review): RET is not released on this path; this looks
             like a leak -- confirm whether the lex should be closed
             here before returning. */
          return 0;
        }
    }

  suffixing_doing_headers = 1;
  suffixing_appending_headers = 1;
  suffixing_snarf_suffix (ret);
  return ret;
}

/* Open a lex on the string BUF with bow_lexer_simple_open_str() and
   prepare it for header suffixing.  Unlike the FILE variant, the first
   line is not validated before the suffix is snarfed.  Returns whatever
   the underlying open returns. */
bow_lex *
bow_lexer_suffixing_open_str (bow_lexer *self, char *buf)
{
  bow_lex *ret;

  ret = bow_lexer_simple_open_str (self, buf);
  if (ret)
    {
      suffixing_doing_headers = 1;
      suffixing_appending_headers = 1;
      suffixing_snarf_suffix (ret);
    }
  return ret;
}

/* Postprocess the word in BUF with bow_lexer_next_postprocess_word(),
   then apply the header suffix rules and advance the header state
   machine.  Returns the length of the resulting word, or 0 if the word
   should be skipped.  BUFLEN is the size of BUF. */
int
bow_lexer_suffixing_postprocess_word (bow_lexer *self, bow_lex *lex,
                                      char *buf, int buflen)
{
  int len;

  /* Postprocess the word */
  len = bow_lexer_next_postprocess_word (self, lex, buf, buflen);

  if (len != 0 && suffixing_doing_headers)
    {
      if (suffixing_appending_headers)
        {
          size_t word_len = strlen (buf);
          size_t suffix_len = strlen (suffixing_suffix);

          /* Verify the result fits before writing; with assertions
             disabled the suffix is truncated rather than overflowing
             BUF. */
          assert (buflen > 0 && word_len + suffix_len < (size_t) buflen);
          if (buflen > 0 && word_len < (size_t) buflen)
            strncat (buf, suffixing_suffix,
                     (size_t) buflen - word_len - 1);
          len = (int) strlen (buf);
        }
      else
        {
          /* Skip the `Reference.*:' words the second time through */
          if (strstr (suffixing_suffix, "xxxreference"))
            len = 0;
        }
    }

  /* Set up for the next word.  A '\0' at the current position is the
     end of a header line that was terminated earlier.
     NOTE(review): nothing here checks document_position against
     document_length before this read -- confirm the document buffer is
     NUL-terminated. */
  if (suffixing_doing_headers
      && lex->document[lex->document_position] == '\0')
    {
      if (lex->document_position == (lex->document_length - 1)
          || lex->document[lex->document_position + 1] == '\n')
        {
          /* This was two newlines in a row or the end of the file. */
#if HEADER_TWICE
          if (!suffixing_appending_headers)
            /* Second pass finished: the headers are done. */
            suffixing_doing_headers = 0;
          else
            {
              /* First pass finished: rewind and go through the headers
                 again without appending suffixes. */
              lex->document_position = 0;
              suffixing_appending_headers = 0;
              suffixing_snarf_suffix (lex);
            }
#else
          lex->document_position++;
#endif
          /* The suffix is emptied in every case, including right after
             the re-snarf above. */
          suffixing_suffix[0] = '\0';
        }
      else
        {
          lex->document_position++;
          /* Handle email messages with multi-line headers: a line that
             starts with an alphanumeric character begins a new header,
             anything else continues the previous one. */
          if (isalnum ((unsigned char)
                       lex->document[lex->document_position]))
            suffixing_snarf_suffix (lex);
          else
            /* No need to grab a suffix, but must replace the \n
               with \0 */
            suffixing_terminate_line (lex);
        }
    }

  return len;
}

/* Scan a single token from the LEX buffer, placing it in BUF, and
   returning the length of the token.  BUFLEN is the maximum number of
   characters that will fit in BUF.  Returns 0 when the raw lexer
   produces no word and the lexer is either past the headers or at the
   end of the document. */
int
bow_lexer_suffixing_get_word (bow_lexer *self, bow_lex *lex,
                              char *buf, int buflen)
{
  int wordlen;			/* number of characters in the word so far */

  do
    {
      wordlen = bow_lexer_next_get_raw_word (self, lex, buf, buflen);
      if (wordlen == 0)
        {
          if (suffixing_doing_headers
              && lex->document_position < lex->document_length)
            /* We are just at the end of the headers, not at the end of
               the file.  bow_lexer_suffixing_postprocess_word() will
               deal with this */
            buf[0] = '\0';
          else
            return 0;
        }
      wordlen = bow_lexer_suffixing_postprocess_word (self, lex,
                                                      buf, buflen);
    }
  /* NOTE(review): suffixing_snarf_suffix() lowercases the suffix, so
     the uppercase "URL" test presumably never matches; "url" may have
     been intended.  Left unchanged to preserve the tokens emitted --
     confirm. */
  while (wordlen == 0 || strstr (suffixing_suffix, "URL"));

  wordlen = (int) strlen (buf);
  return wordlen;
}

/* A lexer that appends to each header token a suffix built from the
   name of the header it appears in (e.g. words on a `Date:' line get
   "xxxdate" appended). */
const bow_lexer _bow_suffixing_lexer =
{
  sizeof (bow_lex),
  NULL,
  bow_lexer_suffixing_open_text_fp,
  bow_lexer_suffixing_open_str,
  bow_lexer_suffixing_get_word,
  NULL,
  NULL,
  bow_lexer_simple_close,
};

const bow_lexer *bow_suffixing_lexer = &_bow_suffixing_lexer;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -