📄 lexer.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
字号:
/* $Id: lexer.c,v 1.149 2006/06/20 22:10:41 relson Exp $ */

/**
 * \file lexer.c
 * bogofilter's lexical analyzer (control routines)
 *
 * \date 2003-01-01 split out of lexer.l
 */

#include "common.h"

#include <ctype.h>
#include <stdlib.h>

#include "base64.h"
#include "bogoconfig.h"
#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#ifndef	DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "lexer.h"
#include "memstr.h"
#include "mime.h"
#include "msgcounts.h"
#include "qp.h"
#include "textblock.h"
#include "token.h"
#include "word.h"
#include "xmalloc.h"

/* Global Variables */

extern int yylineno;

bool msg_header = true;
bool have_body  = false;

lexer_t *lexer = NULL;

/* Local Variables */

static lexer_t v3_lexer = {
    yylex,
    &yytext,
    &yyleng
};

lexer_t msg_count_lexer = {
    read_msg_count_line,
    &msg_count_text,
    &msg_count_leng
};

/* Function Prototypes */

static int yy_get_new_line(buff_t *buff);
static int get_decoded_line(buff_t *buff);
static int skip_folded_line(buff_t *buff);

/* Function Definitions */

/**
 * Reset the MIME state, the token module and the v3 lexer, and load the
 * default character set table.
 */
void lexer_init(void)
{
    mime_reset();
    token_init();
    lexer_v3_init(NULL);
    init_charset_table(charset_default);
}

/**
 * Debug helper: write the line number, header/body flag, scanner state,
 * unread byte count and the buffer contents to dbgout.  A newline is
 * appended when the buffer does not already end in one.
 */
static void lexer_display_buffer(buff_t *buff)
{
    fprintf(dbgout, "*** %2d %c%c %2ld ",
	    yylineno-1, msg_header ? 'h' : 'b', yy_get_state(),
	    (long)(buff->t.leng - buff->read));
    buff_puts(buff, 0, dbgout);
    if (buff->t.leng > 0 && buff->t.text[buff->t.leng-1] != '\n')
	fputc('\n', dbgout);
}

/**
 * Check for lines wholly composed of printable characters as they can
 * cause a scanner abort "input buffer overflow, can't enlarge buffer
 * because scanner uses REJECT"
 *
 * \param buf   bytes to inspect
 * \param count number of bytes available in \a buf
 * \return false as soon as a control, whitespace or punctuation byte
 *         other than '_' is seen; true otherwise (scanning stops early
 *         at a NUL byte)
 *
 * \bug this function must go, we need to fix the lexer
 */
static bool long_token(byte *buf, uint count)
{
    uint i;
    for (i=0; i < count; i += 1) {
	byte c = buf[i];
	/* 10/23/05 - fix SIGSEGV with msg.1023.6479.txt
	** evidently caused by 09/07/05 patch for 0.96.2
	*/
	if (c == '\0')
	    break;
	if ((iscntrl(c) || isspace(c) || ispunct(c)) && (c != '_'))
	    return false;
    }
    return true;
}

/**
 * Read the next raw input line through reader_getline.
 *
 * Bumps yylineno for non-empty reads, resets the scanner state when a
 * MIME boundary line is recognised, and skips spam header lines
 * (spam_header_name) while in the top-level message header.
 *
 * \return the byte count of the line, or YY_NULL at end of input;
 *         exits with EX_ERROR when the input stream reports an error.
 *         The EOF value from skip_folded_line() may be passed through.
 */
static int yy_get_new_line(buff_t *buff)
{
    int count = (*reader_getline)(buff);
    const byte *buf = buff->t.text;

    static size_t hdrlen = 0;
    if (hdrlen==0)
	hdrlen=strlen(spam_header_name);

    if (count > 0)
	yylineno += 1;

    if (count == EOF) {
	if (fpin == NULL || !ferror(fpin)) {
	    return YY_NULL;
	}
	else {
	    print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
	    exit(EX_ERROR);
	}
    }

    /* Mime header check needs to be performed on raw input
    ** -- before mime decoding.  Without it, flex aborts:
    ** "fatal flex scanner internal error--end of buffer missed" */
    if (buff->t.leng > 2 &&
	buf[0] == '-' && buf[1] == '-' &&
	got_mime_boundary(&buff->t)) {
	yy_set_state_initial();
    }

    if (count >= 0 && DEBUG_LEXER(0))
	lexer_display_buffer(buff);

    /* skip spam_header ("X-Bogosity:") lines */
    while (msg_header
	   && count != EOF
/* don't skip if inside message/rfc822 */
	   && msg_state->parent == NULL
	   /* only compare when the buffer really holds hdrlen bytes, so
	    * that no stale or out-of-range bytes are examined */
	   && buff->t.leng >= hdrlen
	   && memcmp(buff->t.text,spam_header_name,hdrlen) == 0) {
	count = skip_folded_line(buff);
    }

    return count;
}

/**
 * Read one line and apply MIME decoding and (when enabled) charset
 * conversion to it.
 *
 * When conversion is needed the raw line is read into a private,
 * smaller temporary buffer and the converted text is stored in \a buff.
 *
 * \return number of bytes made available, YY_NULL at end of input;
 *         exits with EX_ERROR on a stream error or allocation failure.
 */
static int get_decoded_line(buff_t *buff)
{
    int count;
    /* default to reading straight into the caller's buffer; this also
     * keeps linebuff defined when DISABLE_UNICODE removes the else arm */
    buff_t *linebuff = buff;

    if (encoding == E_RAW ||
	msg_state->mime_dont_decode ) {
	linebuff = buff;
    }
#ifndef	DISABLE_UNICODE
    else {
	static buff_t *tempbuff = NULL;

	if (tempbuff == NULL) {
	    tempbuff = (buff_t *) calloc(1, sizeof(buff_t));
	    if (tempbuff == NULL) {
		print_error(__FILE__, __LINE__, "out of memory\n");
		exit(EX_ERROR);
	    }
	}

	/* UTF-8 uses up to six octets per character.  Make input buffer
	 * sufficiently small that the UTF-8 text can fit in the output
	 * buffer */
	if (tempbuff->size < buff->size / 6) {
	    xfree(tempbuff->t.text);
	    tempbuff->size = buff->size / 6;
	    tempbuff->t.text = (byte *) xmalloc(tempbuff->size+D);
	}

	tempbuff->t.leng = tempbuff->read = 0;
	linebuff = tempbuff;
    }
#endif

    count = yy_get_new_line(linebuff);

    if (count == EOF) {
	/* same guard as in yy_get_new_line(): fpin may be NULL */
	if (fpin == NULL || !ferror(fpin))
	    return YY_NULL;
	else {
	    print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
	    exit(EX_ERROR);
	}
    }

    /* Save the text on a linked list of lines.
     * Note that we store fixed-length blocks here, not lines.
     * One very long physical line could break up into more
     * than one of these. */

    if (passthrough && passmode == PASS_MEM && count > 0)
	textblock_add(linebuff->t.text+linebuff->read, (size_t) count);

    if ( !msg_header &&
	 !msg_state->mime_dont_decode &&
	 msg_state->mime_type != MIME_TYPE_UNKNOWN)
    {
	word_t temp;
	uint decoded_count;

	temp.leng = (uint) count;
	temp.text = linebuff->t.text+linebuff->read;

	decoded_count = mime_decode(&temp);
	/*change buffer size only if the decoding worked */
	if (decoded_count != 0 && decoded_count < (uint) count) {
	    linebuff->t.leng -= (uint) (count - decoded_count);
	    count = (int) decoded_count;
	    if (DEBUG_LEXER(1))
		lexer_display_buffer(linebuff);
	}
    }

#ifndef	DISABLE_UNICODE
    if (encoding == E_UNICODE &&
	!msg_state->mime_dont_decode)
    {
	iconvert(linebuff, buff);
	/*
	 * iconvert, treating multi-byte sequences, can shrink or enlarge
	 * the output compared to its input.  Correct count.
	 */
	if (count > 0)
	    count = buff->t.leng;
    }
#endif

#ifdef EXCESSIVE_DEBUG
    /* debug */
    fprintf(dbgout, "%d: ", count);
    buff_puts(buff, 0, dbgout);
    fprintf(dbgout, "\n");
#endif

    /* CRLF -> NL */
    if (count >= 2) {
	byte *buf = buff->t.text;
	if (memcmp(buf + count - 2, CRLF, 2) == 0) {
	    count --;
	    *(buf + count - 1) = (byte) '\n';
	}
    }

    if (buff->t.leng < buff->size)     /* for easier debugging - removable */
	Z(buff->t.text[buff->t.leng]);  /* for easier debugging - removable */

    return count;
}

/**
 * Discard the continuation lines of a folded header line.
 *
 * Lines are read until one is found that does not start with SP/HTAB,
 * or that is an end-of-line-only line (end of the message header).
 *
 * \return the count reported by reader_getline for the first line that
 *         is not a continuation; a non-positive count (EOF or empty
 *         read) is returned immediately.
 */
static int skip_folded_line(buff_t *buff)
{
    for (;;) {
	int count;
	buff->t.leng = 0;
	count = reader_getline(buff);

	/* Nothing was read: do not look at the (stale) buffer contents
	 * and do not count a line, otherwise a leftover leading blank
	 * could keep this loop spinning at end of input. */
	if (count <= 0)
	    return count;

	yylineno += 1;

	/* only check for LWSP-char (RFC-822) aka. WSP (RFC-2822),
	 * these only include SP and HTAB */
	if (buff->t.text[0] != ' ' &&
	    buff->t.text[0] != '\t')
	    return count;
	/* Check for empty line which terminates message header */
	if (is_eol((char *)buff->t.text, count))
	    return count;
    }
}

/**
 * Read decoded lines into \a buff until more than \a need bytes beyond
 * offset \a used are present, or the buffer is nearly full.
 *
 * \return number of bytes added, or EOF when get_decoded_line()
 *         reports EOF.
 */
int buff_fill(buff_t *buff, size_t used, size_t need)
{
    int cnt = 0;
    size_t leng = buff->t.leng;
    size_t size = buff->size;

    /* check bytes needed vs. bytes in buff */
    while (size - leng > 2 && need > leng - used) {
	/* too few, read more */
	int add = get_decoded_line(buff);
	if (add == EOF) return EOF;
	if (add == 0) break ;
	cnt += add;
	leng += add;
    }
    return cnt;
}

/**
 * Reset the line counter and select the v3 lexer unless a message
 * count file is being processed.
 */
void yyinit(void)
{
    yylineno = 0;

    if ( !msg_count_file)
	lexer = &v3_lexer;
}

/**
 * Input getter for the scanner.
 *
 * \param buf  scanner buffer to fill
 * \param used passed in by the caller; only consulted by the
 *             "nearly full" test below
 * \param size capacity of \a buf
 * \return number of bytes placed in \a buf (0 instead of EOF)
 */
int yyinput(byte *buf, size_t used, size_t size)
{
    int cnt;
    int count = 0;
    buff_t buff;

    buff_init(&buff, buf, 0, (uint) size);

    /* After reading a line of text, check if it has special characters.
     * If not, trim some, but leave enough to match a max length token.
     * Then read more text.  This will ensure that a really long sequence
     * of alphanumerics, which bogofilter will ignore anyway, doesn't crash
     * the flex lexer.
     */

    while ((cnt = get_decoded_line(&buff)) != 0) {

	count += cnt;

	/* Note: some malformed messages can cause xfgetsl() to report
	** "Invalid buffer size, exiting."  ** and then abort.  This
	** can happen when the parser is in html mode and there's a
	** leading '<' but no closing '>'.
	**
	** The "fix" is to check for a nearly full lexer buffer and
	** discard most of it.
	*/

	/* if not nearly full */
	/* NOTE(review): "used < size * 10" holds for practically every
	 * input, so the trimming below looks nearly unreachable -
	 * confirm the intended threshold before changing it. */
	if (used < 1000 || used < size * 10)
	    break;

	if (count >= MAX_TOKEN_LEN * 2 &&
	    long_token(buff.t.text, (uint) count)) {
	    uint start = buff.t.leng - count;
	    uint length = count - max_token_len;
	    buff_shift(&buff, start, length);
	    count = buff.t.leng;
	}
	else
	    break;
    }

    if (msg_state &&
	msg_state->mime_dont_decode &&
	(msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN)) {
	return (count == EOF ? 0 : count);   /* not decode at all */
    }

#if	defined(CP866) && !defined(ENABLE_ICONV)
    /* EK -  decoding things like &#1084 and charset_table */
    count = decode_and_htmlUNICODE_to_cp866(buf, count);
#endif

    if (replace_nonascii_characters) {
	/* do non-ascii replacement */
	int i;
	for (i = 0; i < count; i++ )
	{
	    byte ch = buf[i];
	    buf[i] = charset_table[ch];
	}
    }

    return (count == EOF ? 0 : count);
}

/**
 * Copy \a len bytes of \a txt into a function-private scratch buffer
 * that is grown on demand and reused across calls.
 *
 * \return pointer to the scratch buffer; it is overwritten by the next
 *         call and must not be freed by the caller.
 */
static char *charset_as_string(const byte *txt, const size_t len)
{
    static char *charset_text = NULL;
    /* capacity is kept as size_t: a narrower type would truncate large
     * lengths and let the memcpy below run past the allocation */
    static size_t charset_leng = 0;

    if (charset_text == NULL) {
	charset_text = xmalloc(len+D);
	charset_leng = len;
    }
    else {
	if (charset_leng < len) {
	    charset_leng = len;
	    charset_text = xrealloc(charset_text, charset_leng+D);
	}
    }

    memcpy(charset_text, txt, len);
    Z(charset_text[len]);			/* for easier debugging - removable */

    return charset_text;
}

/**
 * Decode the "=?charset?B|Q?text?=" encoded words found in \a w.
 *
 * With E_RAW encoding the decoded bytes overwrite the input in place
 * and \a w is returned.  With E_UNICODE encoding the converted text is
 * collected in a function-private buffer whose word is returned; it is
 * overwritten by the next call.
 *
 * Text between encoded words is copied through, except that whitespace
 * separating two adjacent encoded words is dropped.  Text following the
 * last encoded word is copied through as well.  If an encoded word is
 * malformed (a delimiter is missing), decoding stops and the rest of
 * the input is kept undecoded.
 *
 * \param w word to decode; its text is modified
 * \return the decoded word (see above); \a w itself when it contains
 *         no "=?"
 */
word_t *text_decode(word_t *w)
{
    word_t *r = w;
    byte *const beg = w->text;		/* base pointer, fixed */
    byte *const fin = beg + w->leng;	/* end+1 position */

    byte *txt = (byte *) memstr(w->text, w->leng, "=?");	/* input position */
    uint size;							/* output offset */

#ifndef	DISABLE_UNICODE
    size_t max = w->leng * 4;
    static buff_t * buf = NULL;
#endif

    if (txt == NULL)
	return r;

    /* computed only once txt is known to be non-NULL */
    size = (uint) (txt - beg);

#ifndef	DISABLE_UNICODE
    if (encoding == E_UNICODE) {
	if (buf == NULL)
	    buf = buff_new(xmalloc(max+D), 0, max);
	r = &buf->t;				/* Use buf to return unicode result */

	buf->t.leng = 0;
	if (buf->size < max) {
	    buf->size = max;
	    buf->t.text = (byte *) xrealloc(buf->t.text, buf->size+D);
	}

	buf->t.leng = size;
	memcpy(buf->t.text, beg, size );
	Z(buf->t.text[buf->t.leng]);		/* for easier debugging - removable */
    }
#endif

    if (DEBUG_LEXER(2)) {
	fputs("**1**  ", dbgout);
	word_puts(w, 0, dbgout);
	fputs("\n", dbgout);
    }

    while (txt < fin) {
	byte *const word = txt;			/* the "=?" introducer */
	byte *typ = NULL;
	byte *tmp;
	byte *end = NULL;
	uint len;
	bool adjacent;
	char *charset;

	txt += 2;

	/* Locate both delimiters before touching the input, and never
	 * search beyond fin. */
	if (fin - txt > 1)
	    typ = (byte *) memchr((char *)txt+1, '?', (size_t) (fin - (txt+1)));
	/* need room for: '?' type '?' and the "?=" trailer */
	if (typ != NULL && fin - typ >= 5)
	    end = (byte *) memstr((char *)(typ+3), fin-(typ+3), "?=");

	if (end == NULL) {
	    /* malformed encoded word: keep the remainder undecoded */
	    size_t rest = (size_t) (fin - word);

	    if (encoding == E_RAW) {
		memmove(beg+size, word, rest);
		size += (uint) rest;
	    }
#ifndef	DISABLE_UNICODE
	    if (encoding == E_UNICODE) {
		size_t room = (buf->size > buf->t.leng)
		    ? (size_t) (buf->size - buf->t.leng) : 0;
		if (rest > room)
		    rest = room;
		memcpy(buf->t.text + buf->t.leng, word, rest);
		buf->t.leng += (uint) rest;
	    }
#endif
	    break;
	}

	/* typ points at the '?' ending the charset name */
	*typ++ = '\0';						/* nul terminate */
	charset = charset_as_string(txt, typ - txt - 1);

	tmp = typ + 2;						/* start of encoded word */
	len = end - tmp;					/* end: "?=" trailer */

	w->text = tmp;				/* Start of encoded word */
	w->leng = len;				/* Length of encoded word */
	Z(w->text[w->leng]);			/* for easier debugging - removable */

	if (DEBUG_LEXER(2)) {
	    fputs("**2**  ", dbgout);
	    word_puts(w, 0, dbgout);
	    fputs("\n", dbgout);
	}

	switch (tolower(*typ)) {		/* ... encoding type */
	case 'b':
	    if (base64_validate(w))
		len = base64_decode(w);		/* decode base64 */
	    break;
	case 'q':
	    if (qp_validate(w, RFC2047))
		len = qp_decode(w, RFC2047);	/* decode quoted-printable */
	    break;
	default:				/* unknown type: text kept as is */
	    break;
	}

	/* move decoded word to where the encoded used to be */
	if (encoding == E_RAW) {
	    memmove(beg+size, w->text, len);
	    size += len;			/* bump output pointer */
	    Z(beg[size]);			/* for easier debugging - removable */

	    if (DEBUG_LEXER(3))
		fprintf(dbgout, "**3**  %s\n", beg);
	}

#ifndef	DISABLE_UNICODE
	if (encoding == E_UNICODE) {
	    iconv_t cd;
	    buff_t  src;

	    /* convert 'word_t *w' to 'buff_t src' because
	    ** iconvert_cd() needs buff_t pointers
	    */
	    src.t.text = w->text;
	    src.t.leng = len;
	    src.read   = 0;
	    src.size   = len;

	    cd = bf_iconv_open( charset_unicode, charset );
	    iconvert_cd(cd, &src, buf);
	    iconv_close(cd);

	    if (DEBUG_LEXER(3)) {
		fputs("**4**  ", dbgout);
		word_puts(&buf->t, 0, dbgout);
		fputs("\n", dbgout);
	    }
	}
#endif

	txt = end + 2;	/* skip ?= trailer */
	if (txt >= fin)
	    break;

	/* check for next encoded word */
	end = (byte *) memstr((char *)txt, fin-txt, "=?");
	adjacent = end != NULL;

	/* clear adjacent flag if non-whitespace character found between
	 * adjacent encoded words */
	if (adjacent) {
	    tmp = txt;
	    while (adjacent && tmp < end) {
		if (*tmp && strchr(" \t\r\n", *tmp))
		    tmp += 1;
		else
		    adjacent = false;
	    }
	}

	/* we have a next encoded word and we've had only whitespace
	 * between the current and the next */
	if (adjacent)
	    /* just skip whitespace */
	    txt = end;
	else {
	    /* no further encoded word: the trailing text runs to fin */
	    if (end == NULL)
		end = fin;

	    /* copy everything that was between the encoded words */
	    while (txt < end) {
		if (encoding == E_RAW)
		    beg[size++] = *txt++;
#ifndef	DISABLE_UNICODE
		if (encoding == E_UNICODE)
		    buf->t.text[buf->t.leng++] = *txt++;
#endif
	    }
	}
    }

    if (encoding == E_RAW) {
	r->text = beg;
	r->leng = size;
    }

    return r;
}

/*
 * The following sets edit modes for GNU EMACS
 * Local Variables:
 * mode:c
 * End:
 */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -