⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dfa.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
字号:
#include <u.h>#include <libc.h>#include <bin.h>#include <bio.h>#include <regexp.h>#include "/sys/src/libregexp/regcomp.h"#include "dfa.h"void rdump(Reprog*);void dump(Dreprog*);/* * Standard NFA determinization and DFA minimization. */typedef struct Deter Deter;typedef struct Reiset Reiset;void ddump(Deter*);/* state of determinization */struct Deter{	jmp_buf kaboom;	/* jmp on error */	Bin *bin;		/* bin for temporary allocations */	Reprog *p;	/* program being determinized */	uint ninst;		/* number of instructions in program */	Reiset *alloc;	/* chain of all Reisets */	Reiset **last;	Reiset **hash;	/* hash of all Reisets */	uint nhash;	Reiset *tmp;	/* temporaries for walk */	uchar *bits;	Rune *c;		/* ``interesting'' characters */	uint nc;};/* set of Reinsts: perhaps we should use a bit list instead of the indices? */struct Reiset{	uint *inst;		/* indices of instructions in set */	uint ninst;		/* size of set */	Reiset *next;	/* d.alloc chain */	Reiset *hash;	/* d.hash chain */	Reiset **delta;	/* where to go on each interesting char */	uint id;		/* assigned id during minimization */	uint isfinal;	/* is an accepting (final) state */};static Reiset*ralloc(Deter *d, int ninst){	Reiset *t;	t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);	if(t == nil)		longjmp(d->kaboom, 1);	t->delta = (Reiset**)&t[1];	t->inst = (uint*)&t->delta[2*d->nc];	return t;}/* find the canonical form a given Reiset */static Reiset*findreiset(Deter *d, Reiset *s){	int i, szinst;	uint h;	Reiset *t;	h = 0;	for(i=0; i<s->ninst; i++)		h = h*1000003 + s->inst[i];	h %= d->nhash;	szinst = s->ninst*sizeof(s->inst[0]);	for(t=d->hash[h]; t; t=t->hash)		if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)			return t;	t = ralloc(d, s->ninst);	t->hash = d->hash[h];	d->hash[h] = t;	*d->last = t;	d->last = &t->next;	t->next = 0;	t->ninst = s->ninst;	memmove(t->inst, s->inst, szinst);	/* delta is filled in later */	return t;}/* convert bits to a real reiset */static Reiset*bits2reiset(Deter *d, uchar *bits){	int k;	Reiset *s;	s = d->tmp;	s->ninst = 0;	for(k=0; k<d->ninst; k++)		if(bits[k])			s->inst[s->ninst++] = k;	return findreiset(d, s);}/* add n to state set; if n < k, need to go around again */static intadd(int n, uchar *bits, int k){	if(bits[n])		return 0;	bits[n] = 1;	return n < k;}/* update bits to follow all the empty (non-character-related) transitions possible */static voidfollowempty(Deter *d, uchar *bits, int bol, int eol){	int again, k;	Reinst *i;	do{		again = 0;		for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){			if(!bits[k])				continue;			switch(i->type){			case RBRA:			case LBRA:				again |= add(i->next - d->p->firstinst, bits, k);				break;			case OR:				again |= add(i->left - d->p->firstinst, bits, k);				again |= add(i->right - d->p->firstinst, bits, k);				break;			case BOL:				if(bol)					again |= add(i->next - d->p->firstinst, bits, k);				break;			case EOL:				if(eol)					again |= add(i->next - d->p->firstinst, bits, k);				break;			}		}	}while(again);	/*	 * Clear bits for useless transitions.  We could do this during	 * the switch above, but then we have no guarantee of termination	 * if we get a loop in the regexp.	 */	for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){		if(!bits[k])			continue;		switch(i->type){		case RBRA:		case LBRA:		case OR:		case BOL:		case EOL:			bits[k] = 0;			break;		}	}}/* * Where does s go if it sees rune r? * Eol is true if a $ matches the string at the position just after r. */static Reiset*transition(Deter *d, Reiset *s, Rune r, uint eol){	int k;	uchar *bits;	Reinst *i, *inst0;	Rune *rp, *ep;	bits = d->bits;	memset(bits, 0, d->ninst);	inst0 = d->p->firstinst;	for(k=0; k < s->ninst; k++){		i = inst0 + s->inst[k];		switch(i->type){		default:			werrstr("bad reprog: got type %d", i->type);			longjmp(d->kaboom, 1);		case RBRA:		case LBRA:		case OR:		case BOL:		case EOL:			werrstr("internal error: got type %d", i->type);			longjmp(d->kaboom, 1);		case RUNE:			if(r == i->r)				bits[i->next - inst0] = 1;			break;		case ANY:			if(r != L'\n')				bits[i->next - inst0] = 1;			break;		case ANYNL:			bits[i->next - inst0] = 1;			break;		case NCCLASS:			if(r == L'\n')				break;			/* fall through */		case CCLASS:			ep = i->cp->end;			for(rp = i->cp->spans; rp < ep; rp += 2)				if(rp[0] <= r && r <= rp[1])					break;			if((rp < ep) ^! (i->type == CCLASS))				bits[i->next - inst0] = 1;			break;		case END:			break;		}	}	followempty(d, bits, r=='\n', eol);	return bits2reiset(d, bits);}static intcountinst(Reprog *pp){	int n;	Reinst *l;	n = 0;	l = pp->firstinst;	while(l++->type)		n++;	return n;}static voidset(Deter *d, u32int **tab, Rune r){	u32int *u;	if((u = tab[r/4096]) == nil){		u = binalloc(&d->bin, 4096/8, 1);		if(u == nil)			longjmp(d->kaboom, 1);		tab[r/4096] = u;	}	u[(r%4096)/32] |= 1<<(r%32);}/* * Compute the list of important characters.  * Other characters behave like the ones that surround them. */static voidfindchars(Deter *d, Reprog *p){	u32int *tab[65536/4096], *u, x;	Reinst *i;	Rune *rp, *ep;	int k, m, n, a;	memset(tab, 0, sizeof tab);	set(d, tab, 0);	set(d, tab, 0xFFFF);	for(i=p->firstinst; i->type; i++){		switch(i->type){		case ANY:			set(d, tab, L'\n'-1);			set(d, tab, L'\n');			set(d, tab, L'\n'+1);			break;		case RUNE:			set(d, tab, i->r-1);			set(d, tab, i->r);			set(d, tab, i->r+1);			break;		case NCCLASS:			set(d, tab, L'\n'-1);			set(d, tab, L'\n');			set(d, tab, L'\n'+1);			/* fall through */		case CCLASS:			ep = i->cp->end;			for(rp = i->cp->spans; rp < ep; rp += 2){				set(d, tab, rp[0]-1);				set(d, tab, rp[0]);				set(d, tab, rp[1]);				set(d, tab, rp[1]+1);			}			break;		}	}	n = 0;	for(k=0; k<nelem(tab); k++){		if((u = tab[k]) == nil)			continue;		for(m=0; m<4096/32; m++){			if((x = u[m]) == 0)				continue;			for(a=0; a<32; a++)				if(x&(1<<a))					n++;		}	}	d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);	if(d->c == 0)		longjmp(d->kaboom, 1);	d->nc = n;	n = 0;	for(k=0; k<nelem(tab); k++){		if((u = tab[k]) == nil)			continue;		for(m=0; m<4096/32; m++){			if((x = u[m]) == 0)				continue;			for(a=0; a<32; a++)				if(x&(1<<a))					d->c[n++] = k*4096+m*32+a;		}	}	d->c[n] = 0;	if(n != d->nc)		abort();}/* * convert the Deter and Reisets into a Dreprog. * if dp and c are nil, just return the count of Drecases needed. */static intbuildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c){	int i, j, id, n, nn;	Dreinst *di;	Reiset *s;	nn = 0;	di = 0;	for(i=0; i<nid; i++){		s = id2set[i];		if(c){			di = &dp->inst[i];			di->isfinal = s->isfinal;		}		n = 0;		id = -1;		for(j=0; j<2*d->nc; j++){			if(s->delta[j]->id != id){				id = s->delta[j]->id;				if(c){					c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];					c[n].next = &dp->inst[id];				}				n++;			}		}		if(c){			if(n == 1 && c[0].next == di)				di->isloop = 1;			di->c = c;			di->nc = n;			c += n;		}		nn += n;	}	return nn;}Dreprog*dregcvt(Reprog *p){	uchar *bits;	uint again, n, nid, id;	Deter d;	Reiset **id2set, *s, *t, *start[4];	Dreprog *dp;	Drecase *c;	memset(&d, 0, sizeof d);	if(setjmp(d.kaboom)){		binfree(&d.bin);		return nil;	}	d.p = p;	d.ninst = countinst(p);	d.last = &d.alloc;	n = d.ninst;	/* round up to power of two; this loop is the least of our efficiency problems */	while(n&(n-1))		n++;	d.nhash = n;	d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);	/* get list of important runes */	findchars(&d, p);#ifdef DUMP	print("relevant chars are: «%S»\n", d.c+1);#endif	d.bits = bits = binalloc(&d.bin, d.ninst, 0);	d.tmp = ralloc(&d, d.ninst);	/*	 * Convert to DFA	 */	/* 4 start states, depending on initial bol, eol */	for(n=0; n<4; n++){		memset(bits, 0, d.ninst);		bits[p->startinst - p->firstinst] = 1;		followempty(&d, bits, n&1, n&2);		start[n] = bits2reiset(&d, bits);	}	/* explore the reiset space */	for(s=d.alloc; s; s=s->next)		for(n=0; n<2*d.nc; n++)			s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);#ifdef DUMP	nid = 0;	for(s=d.alloc; s; s=s->next)		s->id = nid++;	ddump(&d);#endif	/*	 * Minimize.	 */	/* first class division is final or not */	for(s=d.alloc; s; s=s->next){		s->isfinal = 0;		for(n=0; n<s->ninst; n++)			if(p->firstinst[s->inst[n]].type == END)				s->isfinal = 1;		s->id = s->isfinal;	}	/* divide states with different transition tables in id space */	nid = 2;	do{		again = 0;		for(s=d.alloc; s; s=s->next){			id = -1;			for(t=s->next; t; t=t->next){				if(s->id != t->id)					continue;				for(n=0; n<2*d.nc; n++){					/* until we finish the for(t) loop, s->id and id are same */					if((s->delta[n]->id == t->delta[n]->id)					|| (s->delta[n]->id == s->id && t->delta[n]->id == id)					|| (s->delta[n]->id == id && t->delta[n]->id == s->id))						continue;					break;				}				if(n == 2*d.nc)					continue;				if(id == -1)					id = nid++;				t->id = id;				again = 1;			}		}	}while(again);#ifdef DUMP	ddump(&d);#endif	/* build dreprog */	id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);	if(id2set == nil)		longjmp(d.kaboom, 1);	for(s=d.alloc; s; s=s->next)		id2set[s->id] = s;	n = buildprog(&d, id2set, nid, nil, nil);	dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);	if(dp == nil)		longjmp(d.kaboom, 1);	c = (Drecase*)&dp->inst[nid];	buildprog(&d, id2set, nid, dp, c);	for(n=0; n<4; n++)		dp->start[n] = &dp->inst[start[n]->id];	dp->ninst = nid;	binfree(&d.bin);	return dp;}intdregexec(Dreprog *p, char *s, int bol){	Rune r;	ulong rr;	Dreinst *i;	Drecase *c, *ec;	int best, n;	char *os;	i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];	best = -1;	os = s;	for(; *s; s+=n){		if(i->isfinal)			best = s - os;		if(i->isloop){			if(i->isfinal)				return strlen(os);			else				return best;		}		if((*s&0xFF) < Runeself){			r = *s;			n = 1;		}else			n = chartorune(&r, s);		c = i->c;		ec = c+i->nc;		rr = r;		if(s[n] == '\n' || s[n] == '\0')			rr |= 0x10000;		for(; c<ec; c++){			if(c->start > rr){				i = c[-1].next;				goto Out;			}		}		i = ec[-1].next;	Out:;	}	if(i->isfinal)		best = s - os;	return best;}#ifdef DUMPvoidddump(Deter *d){	int i, id;	Reiset *s;	for(s=d->alloc; s; s=s->next){		print("%d ", s->id);		id = -1;		for(i=0; i<2*d->nc; i++){			if(id != s->delta[i]->id){				if(i==0)					print(" [");				else if(i/d->nc)					print(" [%C$", d->c[i%d->nc]);				else					print(" [%C", d->c[i%d->nc]);				print(" %d]", s->delta[i]->id);				id = s->delta[i]->id;			}		}		print("\n");	}}voidrdump(Reprog *pp){	Reinst *l;	Rune *p;	l = pp->firstinst;	do{		print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,			l->left-pp->firstinst, l->right-pp->firstinst);		if(l->type == RUNE)			print("\t%C\n", l->r);		else if(l->type == CCLASS || l->type == NCCLASS){			print("\t[");			if(l->type == NCCLASS)				print("^");			for(p = l->cp->spans; p < l->cp->end; p += 2)				if(p[0] == p[1])					print("%C", p[0]);				else					print("%C-%C", p[0], p[1]);			print("]\n");		} else			print("\n");	}while(l++->type);}voiddump(Dreprog *pp){	int i, j;	Dreinst *l;	print("start %ld %ld %ld %ld\n",		pp->start[0]-pp->inst,		pp->start[1]-pp->inst,		pp->start[2]-pp->inst,		pp->start[3]-pp->inst);	for(i=0; i<pp->ninst; i++){		l = &pp->inst[i];		print("%d:", i);		for(j=0; j<l->nc; j++){			print(" [");			if(j == 0)				if(l->c[j].start != 1)					abort();			if(j != 0)				print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");			print("-");			if(j != l->nc-1)				print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");			print("] %ld", l->c[j].next - pp->inst);		}		if(l->isfinal)			print(" final");		if(l->isloop)			print(" loop");		print("\n");	}}voidmain(int argc, char **argv){	int i;	Reprog *p;	Dreprog *dp;	i = 1;		p = regcomp(argv[i]);		if(p == 0){			print("=== %s: bad regexp\n", argv[i]);		}	//	print("=== %s\n", argv[i]);	//	rdump(p);		dp = dregcvt(p);		print("=== dfa\n");		dump(dp);		for(i=2; i<argc; i++)		print("match %d\n", dregexec(dp, argv[i], 0));	exits(0);}#endifvoidBprintdfa(Biobuf *b, Dreprog *p){	int i, j, nc;	Bprint(b, "# dreprog\n");	nc = 0;	for(i=0; i<p->ninst; i++)		nc += p->inst[i].nc;	Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc,		p->start[0]-p->inst, p->start[1]-p->inst,		p->start[2]-p->inst, p->start[3]-p->inst);	for(i=0; i<p->ninst; i++){		Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);		for(j=0; j<p->inst[i].nc; j++)			Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);		Bprint(b, "\n");	}}static char*egetline(Biobuf *b, int c, jmp_buf jb){	char *p;	p = Brdline(b, c);	if(p == nil)		longjmp(jb, 1);	p[Blinelen(b)-1] = '\0';	return p;}static voidegetc(Biobuf *b, int c, jmp_buf jb){	if(Bgetc(b) != c)		longjmp(jb, 1);}static integetnum(Biobuf *b, int want, jmp_buf jb){	int c;	int n, first;	n = 0;	first = 1;	while((c = Bgetc(b)) != Beof){		if(c < '0' || c > '9'){			if(want == 0){				Bungetc(b);				c = 0;			}			if(first || c != want){				werrstr("format error");				longjmp(jb, 1);			}			return n;		}		n = n*10 + c - '0';		first = 0;	}	werrstr("unexpected eof");	longjmp(jb, 1);	return -1;}Dreprog*Breaddfa(Biobuf *b){	char *s;	int ninst, nc;	jmp_buf jb;	Dreprog *p;	Drecase *c;	Dreinst *l;	int j, k;	p = nil;	if(setjmp(jb)){		free(p);		return nil;	}	s = egetline(b, '\n', jb);	if(strcmp(s, "# dreprog") != 0){		werrstr("format error");		longjmp(jb, 1);	}	ninst = egetnum(b, ' ', jb);	nc = egetnum(b, ' ', jb);	p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);	if(p == nil)		longjmp(jb, 1);	c = (Drecase*)&p->inst[ninst];	p->start[0] = &p->inst[egetnum(b, ' ', jb)];	p->start[1] = &p->inst[egetnum(b, ' ', jb)];	p->start[2] = &p->inst[egetnum(b, ' ', jb)];	p->start[3] = &p->inst[egetnum(b, '\n', jb)];	for(j=0; j<ninst; j++){		l = &p->inst[j];		l->isfinal = egetnum(b, ' ', jb);		l->isloop = egetnum(b, ' ', jb);		l->nc = egetnum(b, 0, jb);		l->c = c;		for(k=0; k<l->nc; k++){			egetc(b, ' ', jb);			c->start = egetnum(b, ' ', jb);			c->next = &p->inst[egetnum(b, 0, jb)];			c++;		}		egetc(b, '\n', jb);	}	return p;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -