
📄 token.c

📁 A fast Bayesian spam-mail filtering tool written in C
💻 C
📖 Page 1 of 2
/* $Id: token.c,v 1.155 2007/01/12 00:20:16 relson Exp $ */

/*****************************************************************************

NAME:
   token.c -- post-lexer token processing

   12/08/02 - split out from lexer.l

AUTHOR:
   David Relson <relson@osagesoftware.com>

******************************************************************************/

#include "common.h"

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>

#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#include "mime.h"
#include "msgcounts.h"
#include "word.h"

#include "token.h"
#include "xmemrchr.h"

#define MSG_COUNT_PADDING 2 * 10	/* space for 2 10-digit numbers */

/* Local Variables */

word_t *msg_addr;	/* First IP Address in Received: statement */
word_t *msg_id;		/* Message ID */
word_t *queue_id;	/* Message's first queue ID */

static token_t save_class = NONE;
static word_t *ipsave;

static byte  *yylval_text;
static size_t yylval_text_size;
static word_t yylval;

static word_t *w_to   = NULL;	/* To:          */
static word_t *w_from = NULL;	/* From:        */
static word_t *w_rtrn = NULL;	/* Return-Path: */
static word_t *w_subj = NULL;	/* Subject:     */
static word_t *w_recv = NULL;	/* Received:    */
static word_t *w_head = NULL;	/* Header:      */
static word_t *w_mime = NULL;	/* Mime:        */
static word_t *w_ip   = NULL;	/* ip:          */
static word_t *w_url  = NULL;	/* url:         */

/* Global Variables */

bool block_on_subnets = false;

static word_t *token_prefix = NULL;
static uint32_t token_prefix_len;

#define NONBLANK "spc:invalid_end_of_header"
static word_t *nonblank_line = NULL;

static uint tok_count         = 0;
static uint init_token        = 1;
static word_t *p_multi_words  = NULL;
static byte   *p_multi_buff   = NULL;
static byte   *p_multi_text   = NULL;
static word_t **w_token_array = NULL;

/* Function Prototypes */

static void    token_clear(void);

static token_t parse_new_token(word_t *token);
static void    add_token_to_array(word_t *token);
static void    build_token_from_array(word_t *token);
static uint    token_copy_leng(const char *str, uint leng, byte *dest);

/* Function Definitions */

static void init_token_array(void)
{
    uint i;
    byte *text;
    word_t *words;

    p_multi_words = calloc( max_token_len, sizeof(word_t) );
    p_multi_buff  = malloc( max_multi_token_len+D );
    p_multi_text  = calloc( max_token_len+1+D, multi_token_count );
    w_token_array = calloc( multi_token_count, sizeof(*w_token_array) );

    text = p_multi_text;
    words = p_multi_words;
    for (i = 0; i < multi_token_count; i += 1) {
        words->leng = 0;
        words->text = text;
        w_token_array[i] = words;
        words += 1;
        text += max_token_len+1+D;
    }
}

static void free_token_array(void)
{
    free(p_multi_words);
    free(p_multi_text );
    free(w_token_array);
}

static void token_set( word_t *token, byte *text, uint leng )
{
    token->leng = leng;
    memcpy(token->text, text, leng);		/* include nul terminator */
    token->text[leng] = '\0';			/* ensure nul termination */
}

static inline void token_copy( word_t *dst, word_t *src )
{
    token_set(dst, src->text, src->leng);
}

static void build_prefixed_token( word_t *prefix, word_t *token,
                                  word_t *temp, uint32_t temp_size )
{
    uint len = token->leng + prefix->leng;

    if (len >= temp_size)
        len = temp_size - prefix->leng - 1;

    temp->leng = len;
    memmove(temp->text+prefix->leng, token->text, len-prefix->leng);
    memcpy(temp->text, prefix->text, prefix->leng);
    Z(temp->text[temp->leng]);

    token->leng = temp->leng;
    token->text = temp->text;
}

#define WRAP(n)	((n) % multi_token_count)

token_t get_token(word_t *token)
{
    token_t cls;

    bool fSingle = (tok_count < 2 ||
                    tok_count <= init_token ||
                    multi_token_count <= init_token);

    if (fSingle) {
        cls = parse_new_token(token);
        if (multi_token_count > 1)
            add_token_to_array(token);
    }
    else {
        cls = TOKEN;
        build_token_from_array(token);
    }

    if (token_prefix != NULL) {
        /* IP addresses get special prefix */
        if (save_class != IPADDR) {
            build_prefixed_token(token_prefix, token, &yylval, yylval_text_size);
        }
        else {
            word_t *prefix = (wordlist_version >= IP_PREFIX) ? w_ip : w_url;
            build_prefixed_token(prefix, token, &yylval, yylval_text_size);
        }
        /* if excessive length caused by prefix, get another token */
        if (fSingle && token->leng > max_token_len)
            cls = get_token(token);
    }

    return cls;
}

token_t parse_new_token(word_t *token)
{
    token_t cls = NONE;
    unsigned char *cp;
    bool done = false;

    /* If saved IPADDR, truncate last octet */
    if ( block_on_subnets && save_class == IPADDR )
    {
        byte *t = xmemrchr(ipsave->text, '.', ipsave->leng);
        if (t == NULL)
            save_class = NONE;
        else
        {
            ipsave->leng = (uint) (t - ipsave->text);
            token_set( token, ipsave->text, ipsave->leng);
            cls = save_class;
            done = true;
        }
    }

    while (!done) {
        uint leng;
        byte *text;

        cls = (*lexer->yylex)();

        token->leng = (uint)   *lexer->yyleng;
        token->text = (byte *) *lexer->yytext;
        Z(token->text[token->leng]);	/* for easier debugging - removable */

        leng = token->leng;
        text = token->text;

        if (DEBUG_TEXT(2)) {
            word_puts(token, 0, dbgout);
            fputc('\n', dbgout);
        }

        if (cls == NONE) /* End of message */
            break;

        switch (cls) {

        case EOH:	/* end of header - bogus if not empty */
            if (leng > max_token_len)
                continue;

            if (msg_state->mime_type == MIME_MESSAGE)
                mime_add_child(msg_state);
            if (leng == 2)
                continue;
            else {	/* "spc:invalid_end_of_header" */
                token_copy( &yylval, nonblank_line);
                done = true;
            }
            break;

        case BOUNDARY:	/* don't return boundary tokens to the user */
            continue;

        case VERP:	/* Variable Envelope Return Path */
        {
            byte *st = (byte *)text;
            byte *in;
            byte *fst = NULL;
            byte *lst = NULL;

            for (in = st; *in != '\0'; in += 1) {
                if (*in == '-') {
                    if (fst == NULL)
                        fst = in;
                    lst = in;
                }
            }

            if (fst != NULL && lst != NULL && lst - fst  > 3) {
                byte *ot = fst;
                *ot++ = '-';
                *ot++ = '#';
                for (in = lst; *in != '\0'; in += 1, ot += 1)
                    *ot = *in;
                token->leng = leng = (uint) (ot - st);
            }
            Z(token->text[token->leng]);	/* for easier debugging - removable */
        }
        break;

        case HEADKEY:
        {
            if (!header_line_markup || *text == '\0')
                continue;
            else {
                const char *delim = strchr((const char *)text, ':');
                leng = (uint) (delim - (const char *)text);
                if (leng > max_token_len)
                    continue;
                token_set( &yylval, text, leng);
            }
        }
        /*@fallthrough@*/

        case TOKEN:	/* ignore anything when not reading text MIME types */
            if (leng < min_token_len)
                continue;

        case MONEY:	/* 2 character money is OK */
            if (leng > max_token_len)
                continue;

            token->text = text;
            token->leng = leng;

            if (token_prefix == NULL) {
                switch (msg_state->mime_type) {
                case MIME_TEXT:
                case MIME_TEXT_HTML:
                case MIME_TEXT_PLAIN:
                case MIME_MULTIPART:
                    break;
                case MIME_MESSAGE:
                case MIME_APPLICATION:
                case MIME_IMAGE:
                    continue;
                default:
                    continue;
                }
            }
            break;

        case MESSAGE_ID:
            /* special token;  saved for formatted output, but not returned to bogofilter */
            /** \bug: the parser MUST be aligned with lexer_v3.l! */
            if (leng < max_token_len)
            {
                while (!isspace(text[0])) {
                    text += 1;
                    leng -= 1;
                }
                while (isspace(text[0])) {
                    text += 1;
                    leng -= 1;
                }
                token_set( msg_id, text, leng);
            }
            continue;

        case QUEUE_ID:
            /* special token;  saved for formatted output, but not returned to bogofilter */
            /** \bug: the parser MUST be aligned with lexer_v3.l! */
            if (*queue_id->text == '\0' &&
                leng < max_token_len )
            {
                while (isspace(text[0])) {
                    text += 1;
                    leng -= 1;
                }
                if (memcmp(text, "id", 2) == 0) {
                    text += 2;
                    leng -= 2;
                }
                while (isspace(text[0])) {
                    text += 1;
                    leng -= 1;
                }
                if (text[0] == '<') {
                    text += 1;
                    leng -= 1;
                }
                if (text[leng-1] == '>') {
                    leng -= 1;
                }
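
Note: the listing above ends mid-function because only page 1 of 2 is shown. The header/MIME prefixing that build_prefixed_token() performs is easier to see in isolation. The standalone sketch below is illustrative only, not bogofilter's actual API: the make_prefixed() helper, the buffer size, and the example tags are assumptions. It prepends a tag such as "subj:" or "ip:" to a token the way the listing does: prefix copied first, token text placed behind it, result truncated to fit the destination buffer.

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for build_prefixed_token(): copy "prefix" in front of
 * "token", truncating so the result plus its NUL fits in a buffer of size cap.
 * All names here are illustrative; bogofilter itself works on word_t structs. */
static size_t make_prefixed(char *dst, size_t cap,
                            const char *prefix, const char *token)
{
    size_t plen = strlen(prefix);
    size_t tlen = strlen(token);

    if (plen >= cap)                  /* nothing useful fits */
        return 0;
    if (plen + tlen >= cap)           /* truncate the token part */
        tlen = cap - plen - 1;

    memcpy(dst, prefix, plen);        /* tag goes first ...            */
    memcpy(dst + plen, token, tlen);  /* ... token text right behind   */
    dst[plen + tlen] = '\0';
    return plen + tlen;
}

int main(void)
{
    char buf[32];

    make_prefixed(buf, sizeof(buf), "subj:", "viagra");
    printf("%s\n", buf);              /* subj:viagra */

    make_prefixed(buf, sizeof(buf), "ip:", "192.168.0");
    printf("%s\n", buf);              /* ip:192.168.0 (subnet form used by block_on_subnets) */
    return 0;
}

Tagging tokens this way lets the filter count a word seen in a Subject: line separately from the same word seen in the message body.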
