📄 dfa.c

📁 linux平台中
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Some primitives for operating on sets of positions. *//* Copy one set to another; the destination must be large enough. */static voidcopy (position_set const *src, position_set *dst){  int i;  for (i = 0; i < src->nelem; ++i)    dst->elems[i] = src->elems[i];  dst->nelem = src->nelem;}/* Insert a position in a set.  Position sets are maintained in sorted   order according to index.  If position already exists in the set with   the same index then their constraints are logically or'd together.   S->elems must point to an array large enough to hold the resulting set. */static voidinsert (position p, position_set *s){  int i;  position t1, t2;  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)    continue;  if (i < s->nelem && p.index == s->elems[i].index)    s->elems[i].constraint |= p.constraint;  else    {      t1 = p;      ++s->nelem;      while (i < s->nelem)	{	  t2 = s->elems[i];	  s->elems[i++] = t1;	  t1 = t2;	}    }}/* Merge two sets of positions into a third.  The result is exactly as if   the positions of both sets were inserted into an initially empty set. */static voidmerge (position_set const *s1, position_set const *s2, position_set *m){  int i = 0, j = 0;  m->nelem = 0;  while (i < s1->nelem && j < s2->nelem)    if (s1->elems[i].index > s2->elems[j].index)      m->elems[m->nelem++] = s1->elems[i++];    else if (s1->elems[i].index < s2->elems[j].index)      m->elems[m->nelem++] = s2->elems[j++];    else      {	m->elems[m->nelem] = s1->elems[i++];	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;      }  while (i < s1->nelem)    m->elems[m->nelem++] = s1->elems[i++];  while (j < s2->nelem)    m->elems[m->nelem++] = s2->elems[j++];}/* Delete a position from a set. */static voiddelete (position p, position_set *s){  int i;  for (i = 0; i < s->nelem; ++i)    if (p.index == s->elems[i].index)      break;  if (i < s->nelem)    for (--s->nelem; i < s->nelem; ++i)      s->elems[i] = s->elems[i + 1];}/* Find the index of the state corresponding to the given position set with   the given preceding context, or create a new state if there is no such   state.  Newline and letter tell whether we got here on a newline or   letter, respectively. */static intstate_index (struct dfa *d, position_set const *s, int newline, int letter){  int hash = 0;  int constraint;  int i, j;  newline = newline ? 1 : 0;  letter = letter ? 1 : 0;  for (i = 0; i < s->nelem; ++i)    hash ^= s->elems[i].index + s->elems[i].constraint;  /* Try to find a state that exactly matches the proposed one. */  for (i = 0; i < d->sindex; ++i)    {      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem	  || newline != d->states[i].newline || letter != d->states[i].letter)	continue;      for (j = 0; j < s->nelem; ++j)	if (s->elems[j].constraint	    != d->states[i].elems.elems[j].constraint	    || s->elems[j].index != d->states[i].elems.elems[j].index)	  break;      if (j == s->nelem)	return i;    }  /* We'll have to create a new state. */  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);  d->states[i].hash = hash;  MALLOC(d->states[i].elems.elems, position, s->nelem);  copy(s, &d->states[i].elems);  d->states[i].newline = newline;  d->states[i].letter = letter;  d->states[i].backref = 0;  d->states[i].constraint = 0;  d->states[i].first_end = 0;#ifdef MBS_SUPPORT  if (MB_CUR_MAX > 1)    d->states[i].mbps.nelem = 0;#endif  for (j = 0; j < s->nelem; ++j)    if (d->tokens[s->elems[j].index] < 0)      {	constraint = s->elems[j].constraint;	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))	  d->states[i].constraint |= constraint;	if (! d->states[i].first_end)	  d->states[i].first_end = d->tokens[s->elems[j].index];      }    else if (d->tokens[s->elems[j].index] == BACKREF)      {	d->states[i].constraint = NO_CONSTRAINT;	d->states[i].backref = 1;      }  ++d->sindex;  return i;}/* Find the epsilon closure of a set of positions.  If any position of the set   contains a symbol that matches the empty string in some context, replace   that position with the elements of its follow labeled with an appropriate   constraint.  Repeat exhaustively until no funny positions are left.   S->elems must be large enough to hold the result. */static voidepsclosure (position_set *s, struct dfa const *d){  int i, j;  int *visited;  position p, old;  MALLOC(visited, int, d->tindex);  for (i = 0; i < d->tindex; ++i)    visited[i] = 0;  for (i = 0; i < s->nelem; ++i)    if (d->tokens[s->elems[i].index] >= NOTCHAR	&& d->tokens[s->elems[i].index] != BACKREF#ifdef MBS_SUPPORT	&& d->tokens[s->elems[i].index] != ANYCHAR	&& d->tokens[s->elems[i].index] != MBCSET#endif	&& d->tokens[s->elems[i].index] < CSET)      {	old = s->elems[i];	p.constraint = old.constraint;	delete(s->elems[i], s);	if (visited[old.index])	  {	    --i;	    continue;	  }	visited[old.index] = 1;	switch (d->tokens[old.index])	  {	  case BEGLINE:	    p.constraint &= BEGLINE_CONSTRAINT;	    break;	  case ENDLINE:	    p.constraint &= ENDLINE_CONSTRAINT;	    break;	  case BEGWORD:	    p.constraint &= BEGWORD_CONSTRAINT;	    break;	  case ENDWORD:	    p.constraint &= ENDWORD_CONSTRAINT;	    break;	  case LIMWORD:	    p.constraint &= LIMWORD_CONSTRAINT;	    break;	  case NOTLIMWORD:	    p.constraint &= NOTLIMWORD_CONSTRAINT;	    break;	  default:	    break;	  }	for (j = 0; j < d->follows[old.index].nelem; ++j)	  {	    p.index = d->follows[old.index].elems[j].index;	    insert(p, s);	  }	/* Force rescan to start at the beginning. */	i = -1;      }  free(visited);}/* Perform bottom-up analysis on the parse tree, computing various functions.   Note that at this point, we're pretending constructs like \< are real   characters rather than constraints on what can follow them.   Nullable:  A node is nullable if it is at the root of a regexp that can   match the empty string.   *  EMPTY leaves are nullable.   * No other leaf is nullable.   * A QMARK or STAR node is nullable.   * A PLUS node is nullable if its argument is nullable.   * A CAT node is nullable if both its arguments are nullable.   * An OR node is nullable if either argument is nullable.   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)   that could correspond to the first character of a string matching the   regexp rooted at the given node.   * EMPTY leaves have empty firstpos.   * The firstpos of a nonempty leaf is that leaf itself.   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its     argument.   * The firstpos of a CAT node is the firstpos of the left argument, union     the firstpos of the right if the left argument is nullable.   * The firstpos of an OR node is the union of firstpos of each argument.   Lastpos:  The lastpos of a node is the set of positions that could   correspond to the last character of a string matching the regexp at   the given node.   * EMPTY leaves have empty lastpos.   * The lastpos of a nonempty leaf is that leaf itself.   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its     argument.   * The lastpos of a CAT node is the lastpos of its right argument, union     the lastpos of the left if the right argument is nullable.   * The lastpos of an OR node is the union of the lastpos of each argument.   Follow:  The follow of a position is the set of positions that could   correspond to the character following a character matching the node in   a string matching the regexp.  At this point we consider special symbols   that match the empty string in some context to be just normal characters.   Later, if we find that a special symbol is in a follow set, we will   replace it with the elements of its follow, labeled with an appropriate   constraint.   * Every node in the firstpos of the argument of a STAR or PLUS node is in     the follow of every node in the lastpos.   * Every node in the firstpos of the second argument of a CAT node is in     the follow of every node in the lastpos of the first argument.   Because of the postfix representation of the parse tree, the depth-first   analysis is conveniently done by a linear scan with the aid of a stack.   Sets are stored as arrays of the elements, obeying a stack-like allocation   scheme; the number of elements in each set deeper in the stack can be   used to determine the address of a particular set's array. */voiddfaanalyze (struct dfa *d, int searchflag){  int *nullable;		/* Nullable stack. */  int *nfirstpos;		/* Element count stack for firstpos sets. */  position *firstpos;		/* Array where firstpos elements are stored. */  int *nlastpos;		/* Element count stack for lastpos sets. */  position *lastpos;		/* Array where lastpos elements are stored. */  int *nalloc;			/* Sizes of arrays allocated to follow sets. */  position_set tmp;		/* Temporary set for merging sets. */  position_set merged;		/* Result of merging sets. */  int wants_newline;		/* True if some position wants newline info. */  int *o_nullable;  int *o_nfirst, *o_nlast;  position *o_firstpos, *o_lastpos;  int i, j;  position *pos;#ifdef DEBUG  fprintf(stderr, "dfaanalyze:\n");  for (i = 0; i < d->tindex; ++i)    {      fprintf(stderr, " %d:", i);      prtok(d->tokens[i]);    }  putc('\n', stderr);#endif  d->searchflag = searchflag;  MALLOC(nullable, int, d->depth);  o_nullable = nullable;  MALLOC(nfirstpos, int, d->depth);  o_nfirst = nfirstpos;  MALLOC(firstpos, position, d->nleaves);  o_firstpos = firstpos, firstpos += d->nleaves;  MALLOC(nlastpos, int, d->depth);  o_nlast = nlastpos;  MALLOC(lastpos, position, d->nleaves);  o_lastpos = lastpos, lastpos += d->nleaves;  MALLOC(nalloc, int, d->tindex);  for (i = 0; i < d->tindex; ++i)    nalloc[i] = 0;  MALLOC(merged.elems, position, d->nleaves);  CALLOC(d->follows, position_set, d->tindex);  for (i = 0; i < d->tindex; ++i)#ifdef DEBUG    {				/* Nonsyntactic #ifdef goo... */#endif    switch (d->tokens[i])      {      case EMPTY:	/* The empty set is nullable. */	*nullable++ = 1;	/* The firstpos and lastpos of the empty leaf are both empty. */	*nfirstpos++ = *nlastpos++ = 0;	break;      case STAR:      case PLUS:	/* Every element in the firstpos of the argument is in the follow	   of every element in the lastpos. */	tmp.nelem = nfirstpos[-1];	tmp.elems = firstpos;	pos = lastpos;	for (j = 0; j < nlastpos[-1]; ++j)	  {	    merge(&tmp, &d->follows[pos[j].index], &merged);	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,				 nalloc[pos[j].index], merged.nelem - 1);	    copy(&merged, &d->follows[pos[j].index]);	  }      case QMARK:	/* A QMARK or STAR node is automatically nullable. */	if (d->tokens[i] != PLUS)	  nullable[-1] = 1;	break;      case CAT:	/* Every element in the firstpos of the second argument is in the	   follow of every element in the lastpos of the first argument. */	tmp.nelem = nfirstpos[-1];	tmp.elems = firstpos;	pos = lastpos + nlastpos[-1];	for (j = 0; j < nlastpos[-2]; ++j)	  {	    merge(&tmp, &d->follows[pos[j].index], &merged);	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,				 nalloc[pos[j].index], merged.nelem - 1);	    copy(&merged, &d->follows[pos[j].index]);	  }	/* The firstpos of a CAT node is the firstpos of the first argument,	   union that of the second argument if the first is nullable. */	if (nullable[-2])	  nfirstpos[-2] += nfirstpos[-1];	else	  firstpos += nfirstpos[-1];	--nfirstpos;	/* The lastpos of a CAT node is the lastpos of the second argument,	   union that of the first argument if the second is nullable. */	if (nullable[-1])	  nlastpos[-2] += nlastpos[-1];	else	  {	    pos = lastpos + nlastpos[-2];	    for (j = nlastpos[-1] - 1; j >= 0; --j)	      pos[j] = lastpos[j];	    lastpos += nlastpos[-2];	    nlastpos[-2] = nlastpos[-1];	  }	--nlastpos;	/* A CAT node is nullable if both arguments are nullable. */	nullable[-2] = nullable[-1] && nullable[-2];	--nullable;	break;      case OR:      case ORTOP:	/* The firstpos is the union of the firstpos of each argument. */	nfirstpos[-2] += nfirstpos[-1];	--nfirstpos;	/* The lastpos is the union of the lastpos of each argument. */	nlastpos[-2] += nlastpos[-1];	--nlastpos;	/* An OR node is nullable if either argument is nullable. */	nullable[-2] = nullable[-1] || nullable[-2];	--nullable;	break;      default:	/* Anything else is a nonempty position.  (Note that special	   constructs like \< are treated as nonempty strings here;	   an "epsilon closure" effectively makes them nullable later.	   Backreferences have to get a real position so we can detect	   transitions on them later.  But they are nullable. */	*nullable++ = d->tokens[i] == BACKREF;	/* This position is in its own firstpos and lastpos. */	*nfirstpos++ = *nlastpos++ = 1;	--firstpos, --lastpos;	firstpos->index = lastpos->index = i;	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;	/* Allocate the follow set for this position. */	nalloc[i] = 1;	MALLOC(d->follows[i].elems, position, nalloc[i]);	break;      }#ifdef DEBUG    /* ... balance the above nonsyntactic #ifdef goo... */      fprintf(stderr, "node %d:", i);      prtok(d->tokens[i]);      putc('\n', stderr);      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");      fprintf(stderr, " firstpos:");      for (j = nfirstpos[-1] - 1; j >= 0; --j)	{	  fprintf(stderr, " %d:", firstpos[j].index);	  prtok(d->tokens[firstpos[j].index]);	}      fprintf(stderr, "\n lastpos:");      for (j = nlastpos[-1] - 1; j >= 0; --j)	{	  fprintf(stderr, " %d:", lastpos[j].index);	  prtok(d->tokens[lastpos[j].index]);	}      putc('\n', stderr);    }#endif  /* For each follow set that is the follow set of a real position, replace     it with its epsilon closure. */  for (i = 0; i < d->tindex; ++i)    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF#ifdef MBS_SUPPORT        || d->tokens[i] == ANYCHAR        || d->tokens[i] == MBCSET#endif	|| d->tokens[i] >= CSET)      {#ifdef DEBUG	fprintf(stderr, "follows(%d:", i);	prtok(d->tokens[i]);	fprintf(stderr, "):");	for (j = d->follows[i].nelem - 1; j >= 0; --j)	  {	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -