📄 dfa.c

📁 早期freebsd实现
💻 C
📖 第 1 页 / 共 4 页
字号:
/* Recursive descent parser for regular expressions. */static _token tok;		/* Lookahead token. */static depth;			/* Current depth of a hypothetical stack				   holding deferred productions.  This is				   used to determine the depth that will be				   required of the real stack later on in				   reganalyze(). *//* Add the given token to the parse tree, maintaining the depth count and   updating the maximum depth if necessary. */static voidaddtok(t)     _token t;{  REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex);  reg->tokens[reg->tindex++] = t;  switch (t)    {    case _QMARK:    case _STAR:    case _PLUS:      break;    case _CAT:    case _OR:      --depth;      break;    default:      ++reg->nleaves;    case _EMPTY:      ++depth;      break;    }  if (depth > reg->depth)    reg->depth = depth;}/* The grammar understood by the parser is as follows.   start:     regexp     _ALLBEGLINE regexp     regexp _ALLENDLINE     _ALLBEGLINE regexp _ALLENDLINE   regexp:     regexp _OR branch     branch   branch:     branch closure     closure   closure:     closure _QMARK     closure _STAR     closure _PLUS     atom   atom:     <normal character>     _SET     _BACKREF     _BEGLINE     _ENDLINE     _BEGWORD     _ENDWORD     _LIMWORD     _NOTLIMWORD     <empty>   The parser builds a parse tree in postfix form in an array of tokens. */#ifdef __STDC__static void regexp(void);#elsestatic void regexp();#endifstatic voidatom(){  if (tok >= 0 && (tok < _NOTCHAR || tok >= _SET || tok == _BACKREF      || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD      || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD))    {      addtok(tok);      tok = lex();    }  else if (tok == _LPAREN)    {      tok = lex();      regexp();      if (tok != _RPAREN)	reg_error("Unbalanced (");      tok = lex();    }  else    addtok(_EMPTY);}static voidclosure(){  atom();  while (tok == _QMARK || tok == _STAR || tok == _PLUS)    {      addtok(tok);      tok = lex();    }}static voidbranch(){  closure();  while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0)    {      closure();      addtok(_CAT);    }}static voidregexp(){  branch();  while (tok == _OR)    {      tok = lex();      branch();      addtok(_OR);    }}/* Main entry point for the parser.  S is a string to be parsed, len is the   length of the string, so s can include NUL characters.  R is a pointer to   the struct regexp to parse into. */voidregparse(s, len, r)     const char *s;     size_t len;     struct regexp *r;{  reg = r;  lexstart = lexptr = s;  lexleft = len;  caret_allowed = 1;  closure_allowed = 0;  if (! syntax_bits_set)    reg_error("No syntax specified");  tok = lex();  depth = r->depth;  if (tok == _ALLBEGLINE)    {      addtok(_BEGLINE);      tok = lex();      regexp();      addtok(_CAT);    }  else    regexp();  if (tok == _ALLENDLINE)    {      addtok(_ENDLINE);      addtok(_CAT);      tok = lex();    }  if (tok != _END)    reg_error("Unbalanced )");  addtok(_END - r->nregexps);  addtok(_CAT);  if (r->nregexps)    addtok(_OR);  ++r->nregexps;}/* Some primitives for operating on sets of positions. *//* Copy one set to another; the destination must be large enough. */static voidcopy(src, dst)     const _position_set *src;     _position_set *dst;{  int i;  for (i = 0; i < src->nelem; ++i)    dst->elems[i] = src->elems[i];  dst->nelem = src->nelem;}/* Insert a position in a set.  Position sets are maintained in sorted   order according to index.  If position already exists in the set with   the same index then their constraints are logically or'd together.   S->elems must point to an array large enough to hold the resulting set. */static voidinsert(p, s)     _position p;     _position_set *s;{  int i;  _position t1, t2;  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)    ;  if (i < s->nelem && p.index == s->elems[i].index)    s->elems[i].constraint |= p.constraint;  else    {      t1 = p;      ++s->nelem;      while (i < s->nelem)	{	  t2 = s->elems[i];	  s->elems[i++] = t1;	  t1 = t2;	}    }}/* Merge two sets of positions into a third.  The result is exactly as if   the positions of both sets were inserted into an initially empty set. */static voidmerge(s1, s2, m)     _position_set *s1;     _position_set *s2;     _position_set *m;{  int i = 0, j = 0;  m->nelem = 0;  while (i < s1->nelem && j < s2->nelem)    if (s1->elems[i].index > s2->elems[j].index)      m->elems[m->nelem++] = s1->elems[i++];    else if (s1->elems[i].index < s2->elems[j].index)      m->elems[m->nelem++] = s2->elems[j++];    else      {	m->elems[m->nelem] = s1->elems[i++];	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;      }  while (i < s1->nelem)    m->elems[m->nelem++] = s1->elems[i++];  while (j < s2->nelem)    m->elems[m->nelem++] = s2->elems[j++];}/* Delete a position from a set. */static voiddelete(p, s)     _position p;     _position_set *s;{  int i;  for (i = 0; i < s->nelem; ++i)    if (p.index == s->elems[i].index)      break;  if (i < s->nelem)    for (--s->nelem; i < s->nelem; ++i)      s->elems[i] = s->elems[i + 1];}/* Find the index of the state corresponding to the given position set with   the given preceding context, or create a new state if there is no such   state.  Newline and letter tell whether we got here on a newline or   letter, respectively. */static intstate_index(r, s, newline, letter)     struct regexp *r;     _position_set *s;     int newline;     int letter;{  int lhash = 0;  int constraint;  int i, j;  newline = newline ? 1 : 0;  letter = letter ? 1 : 0;  for (i = 0; i < s->nelem; ++i)    lhash ^= s->elems[i].index + s->elems[i].constraint;  /* Try to find a state that exactly matches the proposed one. */  for (i = 0; i < r->sindex; ++i)    {      if (lhash != r->states[i].hash || s->nelem != r->states[i].elems.nelem	  || newline != r->states[i].newline || letter != r->states[i].letter)	continue;      for (j = 0; j < s->nelem; ++j)	if (s->elems[j].constraint	    != r->states[i].elems.elems[j].constraint	    || s->elems[j].index != r->states[i].elems.elems[j].index)	  break;      if (j == s->nelem)	return i;    }  /* We'll have to create a new state. */  REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex);  r->states[i].hash = lhash;  MALLOC(r->states[i].elems.elems, _position, s->nelem);  copy(s, &r->states[i].elems);  r->states[i].newline = newline;  r->states[i].letter = letter;  r->states[i].backref = 0;  r->states[i].constraint = 0;  r->states[i].first_end = 0;  for (j = 0; j < s->nelem; ++j)    if (r->tokens[s->elems[j].index] < 0)      {	constraint = s->elems[j].constraint;	if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)	    || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)	    || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)	    || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))	  r->states[i].constraint |= constraint;	if (! r->states[i].first_end)	  r->states[i].first_end = r->tokens[s->elems[j].index];      }    else if (r->tokens[s->elems[j].index] == _BACKREF)      {	r->states[i].constraint = _NO_CONSTRAINT;	r->states[i].backref = 1;      }  ++r->sindex;  return i;}/* Find the epsilon closure of a set of positions.  If any position of the set   contains a symbol that matches the empty string in some context, replace   that position with the elements of its follow labeled with an appropriate   constraint.  Repeat exhaustively until no funny positions are left.   S->elems must be large enough to hold the result. */static voidepsclosure(s, r)     _position_set *s;     struct regexp *r;{  int i, j;  int *visited;  _position p, old;  MALLOC(visited, int, r->tindex);  for (i = 0; i < r->tindex; ++i)    visited[i] = 0;  for (i = 0; i < s->nelem; ++i)    if (r->tokens[s->elems[i].index] >= _NOTCHAR	&& r->tokens[s->elems[i].index] != _BACKREF	&& r->tokens[s->elems[i].index] < _SET)      {	old = s->elems[i];	p.constraint = old.constraint;	delete(s->elems[i], s);	if (visited[old.index])	  {	    --i;	    continue;	  }	visited[old.index] = 1;	switch (r->tokens[old.index])	  {	  case _BEGLINE:	    p.constraint &= _BEGLINE_CONSTRAINT;	    break;	  case _ENDLINE:	    p.constraint &= _ENDLINE_CONSTRAINT;	    break;	  case _BEGWORD:	    p.constraint &= _BEGWORD_CONSTRAINT;	    break;	  case _ENDWORD:	    p.constraint &= _ENDWORD_CONSTRAINT;	    break;	  case _LIMWORD:	    p.constraint &= _ENDWORD_CONSTRAINT;	    break;	  case _NOTLIMWORD:	    p.constraint &= _NOTLIMWORD_CONSTRAINT;	    break;	  default:	    break;	  }	for (j = 0; j < r->follows[old.index].nelem; ++j)	  {	    p.index = r->follows[old.index].elems[j].index;	    insert(p, s);	  }	/* Force rescan to start at the beginning. */	i = -1;      }  free(visited);}/* Perform bottom-up analysis on the parse tree, computing various functions.   Note that at this point, we're pretending constructs like \< are real   characters rather than constraints on what can follow them.   Nullable:  A node is nullable if it is at the root of a regexp that can   match the empty string.   *  _EMPTY leaves are nullable.   * No other leaf is nullable.   * A _QMARK or _STAR node is nullable.   * A _PLUS node is nullable if its argument is nullable.   * A _CAT node is nullable if both its arguments are nullable.   * An _OR node is nullable if either argument is nullable.   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)   that could correspond to the first character of a string matching the   regexp rooted at the given node.   * _EMPTY leaves have empty firstpos.   * The firstpos of a nonempty leaf is that leaf itself.   * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its     argument.   * The firstpos of a _CAT node is the firstpos of the left argument, union     the firstpos of the right if the left argument is nullable.   * The firstpos of an _OR node is the union of firstpos of each argument.   Lastpos:  The lastpos of a node is the set of positions that could   correspond to the last character of a string matching the regexp at   the given node.   * _EMPTY leaves have empty lastpos.   * The lastpos of a nonempty leaf is that leaf itself.   * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its     argument.   * The lastpos of a _CAT node is the lastpos of its right argument, union     the lastpos of the left if the right argument is nullable.   * The lastpos of an _OR node is the union of the lastpos of each argument.   Follow:  The follow of a position is the set of positions that could   correspond to the character following a character matching the node in   a string matching the regexp.  At this point we consider special symbols   that match the empty string in some context to be just normal characters.   Later, if we find that a special symbol is in a follow set, we will   replace it with the elements of its follow, labeled with an appropriate   constraint.   * Every node in the firstpos of the argument of a _STAR or _PLUS node is in     the follow of every node in the lastpos.   * Every node in the firstpos of the second argument of a _CAT node is in     the follow of every node in the lastpos of the first argument.   Because of the postfix representation of the parse tree, the depth-first   analysis is conveniently done by a linear scan with the aid of a stack.   Sets are stored as arrays of the elements, obeying a stack-like allocation   scheme; the number of elements in each set deeper in the stack can be   used to determine the address of a particular set's array. */voidreganalyze(r, searchflag)     struct regexp *r;     int searchflag;{  int *nullable;		/* Nullable stack. */  int *nfirstpos;		/* Element count stack for firstpos sets. */  _position *firstpos;		/* Array where firstpos elements are stored. */  int *nlastpos;		/* Element count stack for lastpos sets. */  _position *lastpos;		/* Array where lastpos elements are stored. */  int *nalloc;			/* Sizes of arrays allocated to follow sets. */  _position_set tmp;		/* Temporary set for merging sets. */  _position_set merged;		/* Result of merging sets. */  int wants_newline;		/* True if some position wants newline info. */  int *o_nullable;  int *o_nfirst, *o_nlast;  _position *o_firstpos, *o_lastpos;  int i, j;  _position *pos;  r->searchflag = searchflag;  MALLOC(nullable, int, r->depth);  o_nullable = nullable;  MALLOC(nfirstpos, int, r->depth);  o_nfirst = nfirstpos;  MALLOC(firstpos, _position, r->nleaves);  o_firstpos = firstpos, firstpos += r->nleaves;  MALLOC(nlastpos, int, r->depth);  o_nlast = nlastpos;  MALLOC(lastpos, _position, r->nleaves);  o_lastpos = lastpos, lastpos += r->nleaves;  MALLOC(nalloc, int, r->tindex);  for (i = 0; i < r->tindex; ++i)    nalloc[i] = 0;  MALLOC(merged.elems, _position, r->nleaves);  CALLOC(r->follows, _position_set, r->tindex);  for (i = 0; i < r->tindex; ++i)    switch (r->tokens[i])      {      case _EMPTY:	/* The empty set is nullable. */	*nullable++ = 1;	/* The firstpos and lastpos of the empty leaf are both empty. */	*nfirstpos++ = *nlastpos++ = 0;	break;      case _STAR:      case _PLUS:	/* Every element in the firstpos of the argument is in the follow	   of every element in the lastpos. */	tmp.nelem = nfirstpos[-1];	tmp.elems = firstpos;	pos = lastpos;	for (j = 0; j < nlastpos[-1]; ++j)	  {	    merge(&tmp, &r->follows[pos[j].index], &merged);	    REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,				 nalloc[pos[j].index], merged.nelem - 1);	    copy(&merged, &r->follows[pos[j].index]);	  }      case _QMARK:	/* A _QMARK or _STAR node is automatically nullable. */	if (r->tokens[i] != _PLUS)	  nullable[-1] = 1;	break;      case _CAT:	/* Every element in the firstpos of the second argument is in the	   follow of every element in the lastpos of the first argument. */	tmp.nelem = nfirstpos[-1];	tmp.elems = firstpos;	pos = lastpos + nlastpos[-1];	for (j = 0; j < nlastpos[-2]; ++j)	  {	    merge(&tmp, &r->follows[pos[j].index], &merged);	    REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,				 nalloc[pos[j].index], merged.nelem - 1);	    copy(&merged, &r->follows[pos[j].index]);	  }	/* The firstpos of a _CAT node is the firstpos of the first argument,	   union that of the second argument if the first is nullable. */	if (nullable[-2])	  nfirstpos[-2] += nfirstpos[-1];	else	  firstpos += nfirstpos[-1];	--nfirstpos;	/* The lastpos of a _CAT node is the lastpos of the second argument,	   union that of the first argument if the second is nullable. */	if (nullable[-1])	  nlastpos[-2] += nlastpos[-1];	else	  {	    pos = lastpos + nlastpos[-2];	    for (j = nlastpos[-1] - 1; j >= 0; --j)	      pos[j] = lastpos[j];	    lastpos += nlastpos[-2];	    nlastpos[-2] = nlastpos[-1];	  }	--nlastpos;	/* A _CAT node is nullable if both arguments are nullable. */	nullable[-2] = nullable[-1] && nullable[-2];	--nullable;
💿 文件大小 40554 K
👤 上传用户 luyibo54618
📂 所属分类 Linux/Unix编程
📄 代码行数 2,292 行
💻 语言类型 C语言
🏷️ 相关标签

#freebsd
更多freebsd资源 →
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -