📄 fh.c
字号:
/* let each category process the token */ (*word_fun)(ntok, (token_order_t)1, 0); nq = ntok; *nq++ = DIAMOND; } else { if( ++nhow_many > ngram_order ) { nhow_many--; /* move all tokens down by one */ for(nq = ntok + 1; *nq != DIAMOND; nq++) {}; for(nq++, qq = ntok + 1; *nq; *qq++ = *nq++) {}; *qq = '\0'; nq = qq; } qq = ntok; for(n = nhow_many; n > 0; n--) { /* let each category process the token */ (*word_fun)(qq, (token_order_t)n, 0); qq++; /* skip to next token and repeat */ while(*qq != DIAMOND ) { qq++; } } } } p++; } } /* now summarize this line if required */ if( post_line_fun ) { (*post_line_fun)(pptextbuf); } } else { /* since we don't process this line, we reset XML tag state for next time */ xml.state = TEXT; } }}/*********************************************************** * WIDE CHARACTER FILE HANDLING FUNCTIONS * * this is needed for any locale whose character set * * encoding can include NUL bytes inside characters * ***********************************************************/#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H/* returns true if the line should be processed further depends on global mbox state */bool_t w_mbox_line_filter(wchar_t *line) { bool_t process_line = 0; /* by default we skip the line */ /* below we decide if we want to process the line */ switch(mbox.state) { case UNDEF: if( mbox.prev_line_empty && (!wcsncmp(line, L"From ", 5) || ((line[0] == L'-') && (line[1] == L'-') && !iswspace(line[2]))) ) { /* if it starts with From_ or else it looks like --xxxxx, it signals a new header */ mbox.state = HEADER; mbox.checked_content_type = 0; } break; case HEADER: if( *line == L'\n' ) { mbox.state = BODY; } else if( !wcsncmp(line, L"Content-Type:", 13) ) { mbox.checked_content_type = 1; if( wcsstr(line + 13, L"text/") || wcsstr(line + 13, L"TEXT/") ) { mbox.mime_type = 1; /* good */ } else { mbox.mime_type = 0; /* bad */ } } else if( !wcsncmp(line, L"Subject:", 8) || !(wcsncmp(line, L"From:", 5)) ) { /* process subject and from line like body */ process_line = 1; } break; case BODY: if( mbox.prev_line_empty && (!wcsncmp(line, L"From ", 5) || ((line[0] == L'-') && (line[1] == L'-') && !iswspace(line[2])))) { /* if it starts with From_ or else it looks like --xxxxx, it signals a new header */ mbox.state = HEADER; mbox.checked_content_type = 0; } else if( mbox.checked_content_type && mbox.mime_type ) { process_line = 1; } else if( !mbox.checked_content_type ) { /* unless we explicitly are told it's not text, we process it */ /* this will let quoted uuencoded content through :-( */ process_line = 1; } break; } mbox.prev_line_empty = (*line == L'\n') ? 1 : 0; /* for next time */ return process_line;}/* removes tags in the string - modifies in place *//* the name of this function is a misnomer, since it doesn't parse xml properly. But we just want a simple kludge for most html flavours */void w_xml_character_filter(wchar_t *line) { wchar_t *q; q = line; while( *line ) { switch(xml.state) { case TEXT: /* does it look like <x where x is either alpha or punctuation? */ if( line[0] == L'<' ) { if( !wcsncmp(line + 1, L"!--", 3) ) { xml.state = COMMENT; line += 3; } else if( wcsncasecmp(line + 1, L"script", 6) != 0 ) { xml.state = SPECIAL; line += 6; } else if( iswalpha(line[1]) || iswpunct(line[1]) ) { xml.state = TAG; line += 1; } } else { *q++ = *line; } break; case TAG: if( (line[1] == L'>') && (iswalpha(*line) || iswpunct(*line)) ) { xml.state = TEXT; line++; } break; case COMMENT: if( line[0] == L'-' && !wcsncmp(line + 1, L"->", 2) ) { xml.state = TEXT; line += 2; } break; case SPECIAL: if( line[0] == L'<' ) { if( !wcsncasecmp(line + 1, L"/script", 7) ) { xml.state = TEXT; line += 7; } } break; } line++; } *q = L'\0'; /* mark the end of the clean text string */}/* reads a text file as input, converting each lineinto a wide character representation and applies severalfilters. */void w_process_file(FILE *input, int (*line_filter)(wchar_t *), void (*character_filter)(wchar_t *), void (*word_fun)(char *, token_order_t, regex_count_t), char *(*pre_line_fun)(char *), void (*post_line_fun)(char *)) { char *s, *pptextbuf; regex_count_t i; charbuf_len_t k; int eflag; token_order_t z, j, n, order; charbuf_len_t l; charbuf_len_t wclen; wchar_t *wp; mbstate_t tok_shiftstate, input_shiftstate; charbuf_len_t tok_len; char *q, *qq; char tok[MAX_TOKEN_LEN+2]; regmatch_t pmatch[MAX_SUBMATCH]; char *nq; char ntok[MAX_TOKEN_LEN+2]; token_order_t nhow_many; /* initialize the norex state */ ntok[0] = DIAMOND; ntok[1] = '\0'; nq = ntok + 1; nhow_many = 0; memset(&input_shiftstate, 0, sizeof(mbstate_t)); while( !feof(input) ) { /* read in a full line, allocating memory as necessary */ textbuf[0] = '\0'; s = textbuf; l = textbuf_len; k = 1; while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) { textbuf = realloc(textbuf, 2 * textbuf_len); if( !textbuf ) { fprintf(stderr, "error: not enough memory for input line (%d bytes)\n", textbuf_len); exit(0); } s = textbuf + textbuf_len - (k++); l = textbuf_len; textbuf_len *= 2; } /* preprocesses textbuf, optionally censors it */ if( pre_line_fun ) { pptextbuf = (*pre_line_fun)(textbuf); if( !pptextbuf ) { continue; } } else { pptextbuf = textbuf; } /* now convert the line into a wide character string */ if( textbuf_len > wc_textbuf_len ) { wc_textbuf_len = textbuf_len; wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t)); if( !wc_textbuf ) { fprintf(stderr, "error: not enough memory for wide character conversion " "(%ld bytes)\n", (long int)(wc_textbuf_len * sizeof(wchar_t))); exit(0); } } /* convert as much as we can of the line into wide characters */ s = pptextbuf; k = textbuf_len; wp = wc_textbuf; wclen = 0; /* since we ensured textbuf_len <= wctextbuf_len there will never be overflow of wctextbuf below */ while( k > 0 ) { l = mbrtowc(wp, s, k, &input_shiftstate); if( l > 0 ) { wp++; wclen++; k -= l; s += l; } else if( l == 0 ) { break; } else if( l == -1 ) { /* try to be robust */ s++; k--; memset(&input_shiftstate, 0, sizeof(mbstate_t)); } else if( l == -2) { /* couldn't parse a complete character */ break; } } *wp = L'\0'; /* next we check to see if this line should be skipped */ if( (wclen > 0) && (!line_filter || (*line_filter)(wc_textbuf)) ) { /* now filter some of the characters in the current line */ if( character_filter ) { (*character_filter)(wc_textbuf); } /* repeat for each regular expression: find all the instances of a matching substring */#if defined HAVE_LIBBOOST_REGEX for(i = 0; i < regex_count; i++) { k = 0; eflag = 0; /* see if a match */ while( (k < wclen) && (regexec(&re[i].regex, wc_textbuf + k, MAX_SUBMATCH, pmatch, eflag) == 0) ) { /* all the submatches (delimited by brackets in the regex) get converted, concatenated and the result gets word_fun'd */ q = tok; *q++ = DIAMOND; memset(&tok_shiftstate, 0, sizeof(mbstate_t)); for(order = 0, z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { if( !(re[i].submatches & (1<<z)) ) { continue; } else { order++; } /* transcribe the submatch into tok */ for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { if( q < tok + MAX_TOKEN_LEN - MULTIBYTE_EPSILON ) { if( options & (1<<OPTION_CASEN) ) { tok_len = wcrtomb(q, wc_textbuf[k + j], &tok_shiftstate); } else { tok_len = wcrtomb(q, towlower(wc_textbuf[k + j]), &tok_shiftstate); } if( (tok_len > -1) ) { q += tok_len; } } } *q++ = DIAMOND; } *q = '\0'; /* now let each category process the token */ (*word_fun)(tok, order, i + 1); k += pmatch[0].rm_so + 1; /* advance string and repeat */ eflag = REG_NOTBOL; } }#else /* the GNU regex routines expect a multibyte string */ if( textbuf_len > aux_textbuf_len ) { aux_textbuf_len = textbuf_len; aux_textbuf = realloc(aux_textbuf, aux_textbuf_len); if( !aux_textbuf ) { fprintf(stderr, "error: not enough memory for auxiliary text buffer " "(%d bytes)\n", aux_textbuf_len); exit(0); } } l = wcstombs(aux_textbuf, wc_textbuf, aux_textbuf_len - 1); aux_textbuf[aux_textbuf_len - 1] = '\0'; for(i = 0; i < regex_count; i++) { k = 0; eflag = 0; /* see if a match */ while( (k < l) && (regexec(&re[i].regex, aux_textbuf + k, MAX_SUBMATCH, pmatch, eflag) == 0) ) { /* all the submatches (delimited by brackets in the regex) get concatenated and the result gets word_fun'd */ q = tok; *q++ = DIAMOND; for(order = 0, z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { if( !(re[i].submatches & (1<<z)) ) { continue; } else { order++; } /* transcribe the submatch into tok */ for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { if( q < tok + MAX_TOKEN_LEN ) { if( options & (1<<OPTION_CASEN) ) { *q++ = aux_textbuf[k + j]; } else { *q++ = tolower(aux_textbuf[k + j]); } } } *q++ = DIAMOND; /* hope DIAMOND can never be matched inside regex */ } *q = '\0'; /* now let each category process the token */ (*word_fun)(tok, order, i + 1); k += pmatch[0].rm_so + 1; /* advance string and repeat */ eflag = REG_NOTBOL; } } #endif /* default processing: reads tokens, converting them to multibyte representation before passing them to the word_fun */ if( options & (1<<OPTION_NOREGEX) ) { wp = wc_textbuf; memset(&tok_shiftstate, 0, sizeof(mbstate_t)); while( *wp ) { if( iswalpha(*wp) ) { if( !(options & (1<<OPTION_CASEN)) ) { *wp = towlower(*wp); } if( nq < ntok + MAX_TOKEN_LEN - MULTIBYTE_EPSILON ) { tok_len = wcrtomb(nq, *wp, &tok_shiftstate); if( (tok_len > -1) ) { nq += tok_len; } } } else if( *(nq - 1) != DIAMOND ) { /* token boundary */ *nq++ = DIAMOND; *nq = '\0'; if( ngram_order == 1 ) { /* let each category process the token */ (*word_fun)(ntok, (token_order_t)1, 0); nq = ntok; *nq++ = DIAMOND; } else { if( ++nhow_many > ngram_order ) { nhow_many--; /* move all tokens down by one */ for(nq = ntok + 1; *nq != DIAMOND; nq++) {}; for(nq++, qq = ntok + 1; *nq; *qq++ = *nq++) {}; *qq = '\0'; nq = qq; } qq = ntok; for(n = nhow_many; n > 0; n--) { /* let each category process the token */ (*word_fun)(qq, (token_order_t)n, 0); qq++; /* skip to next token and repeat */ while(*qq != DIAMOND ) { qq++; } } } } wp++; } } /* now summarize this line if required */ if( post_line_fun ) { (*post_line_fun)(pptextbuf); } } else { /* since we don't process this line, we reset XML tag state for next time */ xml.state = TEXT; } }}#endif /* DISABLE_WCHAR */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -