⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 msgtok.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
字号:
/*
 * RFC822 message tokenizer (really feature generator) for spam filter.
 *
 * See Paul Graham's musings on spam filtering for theory.
 *
 * Reads a mailbox from standard input (or the named file) and writes
 * one feature per line on standard output:
 *	*From*		start of a new message
 *	word		a keyword, prefixed with From*, To*, Subject* or
 *			Return-Path* when it occurs in that header
 *	stem*word	progressively simplified forms of the keyword
 */
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include <ctype.h>
#include "dfa.h"

void buildre(Dreprog*[3]);
int debug;			/* -D: echo each consumed token on fd 2 */
char *refile = "/mail/lib/classify.re";	/* -r: compiled regexp file */
int maxtoklen = 20;		/* -n: longest keyword (in bytes) that is emitted */
int trim(char*);

/*
 * Print the command synopsis on fd 2 and exit with status "usage".
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}

/*
 * Tokenize the input and print the features.
 *
 * The input is scanned through a sliding window (msg).  At every
 * position the three regexps loaded by buildre are tried; by the
 * handling below, re[0] recognizes the "From " line that starts a
 * message, re[1] a keyword and re[2] text to be ignored.
 */
void
main(int argc, char **argv)
{
	int i, hdr, n, eof, off, bol;
	Dreprog *re[3];
	int m[3];
	char *p, *ep, *tag;
	Biobuf bout, bin;
	char msg[1024+1];
	/*
	 * Room for the longest tag ("Return-Path*", 12 bytes), a token as
	 * long as the whole window, and the terminating NUL, so that a
	 * large -n cannot overflow the buffer.  Assumes dregexec never
	 * reports a match longer than the NUL-terminated text at p.
	 */
	char buf[sizeof msg + 16];

	ARGBEGIN{
	case 'D':
		debug = 1;
		break;
	case 'n':
		maxtoklen = atoi(EARGF(usage()));
		break;
	case 'r':
		refile = EARGF(usage());
		break;
	default:
		usage();
	}ARGEND;

	if(argc > 1)
		usage();

	/* must follow option parsing, otherwise -r has no effect */
	buildre(re);

	if(argc == 1){
		/* reopen the named file as fd 0 */
		close(0);
		if(open(argv[0], OREAD) < 0)
			sysfatal("open %s: %r", argv[0]);
	}

	tag = nil;
	Binit(&bin, 0, OREAD);
	Binit(&bout, 1, OWRITE);
	ep = msg;
	p = msg;
	eof = 0;
	off = 0;	/* input offset of msg[0], for diagnostics */
	hdr = 1;
	for(;;){
		/* replenish buffer */
		if(ep - p < 512 && !eof){
			if(p > msg + 1){
				/* slide down, keeping one byte before p for the p[-1] tests */
				n = ep - p;
				memmove(msg, p-1, ep-(p-1));
				off += (p-1) - msg;
				p = msg+1;
				ep = p + n;
			}
			n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
			if(n < 0)
				sysfatal("read error: %r");
			if(n == 0)
				eof = 1;
			ep += n;
			*ep = 0;
		}
		if(p >= ep)
			break;

		/* NUL bytes in the input would end the string early; skip them */
		if(*p == 0){
			p++;
			continue;
		}

		/*
		 * At the start of a header line, work out which tag (if any)
		 * applies to the keywords on it.  An empty line ends the header.
		 * p > msg keeps p[-1] inside the buffer on the very first byte.
		 */
		if(hdr && p > msg && p[-1]=='\n'){
			if(p[0]=='\n')
				hdr = 0;
			else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
				tag = "From*";
			else if(cistrncmp(p-1, "\nto:", 4) == 0)
				tag = "To*";
			else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
				tag = "Subject*";
			else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
				tag = "Return-Path*";
			else
				tag = nil;
		}

		/* third argument is presumably a beginning-of-line flag; see dfa.h */
		bol = p==msg || p[-1]=='\n';
		m[0] = dregexec(re[0], p, bol);
		m[1] = dregexec(re[1], p, bol);
		m[2] = dregexec(re[2], p, bol);

		/* n = longest match */
		n = m[0];
		if(n < m[1])
			n = m[1];
		if(n < m[2])
			n = m[2];
		if(n <= 0){
			fprint(2, "«%s» %.2ux", p, p[0]);
			sysfatal("no regexps matched at %ld", off + (p-msg));
		}

		if(m[0] >= m[1] && m[0] >= m[2]){
			/* "From " marks start of new message */
			Bprint(&bout, "*From*\n");
			n = m[0];
			hdr = 1;
		}else if(m[2] > 1){
			/* ignore */
			n = m[2];
		}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
			/* keyword */
			/* should do UTF-aware lowercasing, too much bother */
			if(tag){
				i = strlen(tag);
				memmove(buf, tag, i);
				memmove(buf+i, p, m[1]);
				buf[i+m[1]] = 0;
			}else{
				memmove(buf, p, m[1]);
				buf[m[1]] = 0;
			}
			Bprint(&bout, "%s\n", buf);
			/* trim rewrites buf in place until nothing is left to simplify */
			while(trim(buf) >= 0)
				Bprint(&bout, "stem*%s\n", buf);
			n = m[1];
		}else{
			n = m[2];
			/*
			 * If re[2] did not match here, m[1] is the (positive)
			 * longest match but is too short or too long to emit:
			 * skip it so that p always moves forward.
			 */
			if(n <= 0)
				n = m[1];
		}
		if(debug)
			fprint(2, "%.*s¦", utfnlen(p, n), p);
		p += n;
	}
	Bterm(&bout);
	exits(0);
}

/*
 * Load the three compiled regexps, in order, from refile into re[0..2].
 * Any failure to open or read the file is fatal.
 */
void
buildre(Dreprog *re[3])
{
	Biobuf *b;

	if((b = Bopen(refile, OREAD)) == nil)
		sysfatal("open %s: %r", refile);

	re[0] = Breaddfa(b);
	re[1] = Breaddfa(b);
	re[2] = Breaddfa(b);

	if(re[0]==nil || re[1]==nil || re[2]==nil)
		sysfatal("Breaddfa: %r");
	Bterm(b);
}

/* perhaps this belongs in the tokenizer */
/*
 * Simplify the token s in place by one step.
 *
 * Everything up to the first alphabetic character at or after the
 * first '*' (i.e. a tag prefix and leading punctuation) is dropped
 * first; this alone does not count as a step.  Then exactly one of
 * the following is applied, in this order:
 *	free!!! -> free!	(several trailing ASCII non-letters: keep one)
 *	free!   -> free		(one trailing ASCII non-letter: drop it)
 *	FREE    -> Free		(lowercase everything after the first byte)
 *	Free    -> free		(lowercase the first byte)
 *
 * Returns 0 if a step was applied, -1 if s starts with '*', has fewer
 * than two bytes left after the leading strip, or cannot be simplified
 * any further.
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;

	if(*s == '*')
		return -1;

	/* strip leading punctuation */
	p = strchr(s, '*');
	if(p == nil)
		p = s;
	while(*p && !isalpha(*p))
		p++;
	if(strlen(p) < 2)
		return -1;
	memmove(s, p, strlen(p)+1);

	/* strip suffix of punctuation; bytes >= 0x80 (UTF-8) are left alone */
	p = s+strlen(s);
	op = p;
	while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}

		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first byte is upper case; mix1: some later byte is */
	mix = mix1 = 0;
	if(isupper(s[0]))
		mix = 1;
	for(p=s+1; *p; p++)
		if(isupper(*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper(*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}

	return -1;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -