📄 awkcompiler.java

📁 Java 正则表达式,简单、好用。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	  }	  atom = new QuestionNode(atom._clone(startPosition));	  count = max-min;	  if(count == 1)	    catNode._right = atom;	  else {	    catNode._right = new CatNode();	    catNode = (CatNode)catNode._right;	    catNode._left = atom;	    while(--count > 1) {	      atom = atom._clone(startPosition);	      catNode._right = new CatNode();	      catNode       = (CatNode)catNode._right;	      catNode._left  = atom;	    }	    catNode._right = atom._clone(startPosition);	  }	}      }    } else      throw	new MalformedPatternException("Parse error: unexpected character " +		__lookahead + " in interval at position "	+ __bytesRead);    __position = startPosition[0];    return root;  }  private SyntaxNode __backslashToken() throws MalformedPatternException {    SyntaxNode current;    char token;    int number;    __match('\\');    if(__lookahead == 'x'){      __match('x');      // Parse a hexadecimal number      current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2),			     __position++);    } else if(__lookahead == 'c') {      __match('c');      // Create a control character      token = Character.toUpperCase(__lookahead);      token = (char)(token > 63 ? token - 64 : token + 64);      current = new TokenNode(token, __position++);      __match(__lookahead);    } else if(__lookahead >= '0' && __lookahead <= '9') {      __match(__lookahead);      if(__lookahead >= '0' && __lookahead <= '9'){	// We have an octal character or a multi-digit backreference.	// Assume octal character for now.	__putback();	number = __parseUnsignedInteger(10, 2, 3);	number = Integer.parseInt(Integer.toString(number), 8);	current =  _newTokenNode((char)number, __position++);      } else {	// We have either \0, an escaped digit, or a backreference.	__putback();	if(__lookahead == '0'){	  // \0 matches the null character	  __match('0');	  current = new TokenNode('\0', __position++);	} else {	  // Either an escaped digit or backreference.	  
number = Character.digit(__lookahead, 10);	  current =  _newTokenNode(__lookahead, __position++);	  __match(__lookahead);	}      }    } else if(__lookahead == 'b') {      // Inside of a character class the \b means backspace, otherwise      // it means a word boundary      //if(__inCharacterClass)      // \b always means backspace      current = new TokenNode('\b', __position++);      /*      else 	current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,				position++);				*/      __match('b');    } /*else if(__lookahead == 'B' && !__inCharacterClass){      current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,			      position++);      __match('B');    } */ else {      CharacterClassNode characterSet;      token = __lookahead;      switch(__lookahead){      case 'n' : token = '\n'; break;      case 'r' : token = '\r'; break;      case 't' : token = '\t'; break;      case 'f' : token = '\f'; break;      }      switch(token) {      case 'd' :	characterSet = new CharacterClassNode(__position++);	characterSet._addTokenRange('0', '9');	current = characterSet;	break;      case 'D' :	characterSet = new NegativeCharacterClassNode(__position++);	characterSet._addTokenRange('0', '9');	current = characterSet;	break;      case 'w' :	characterSet = new CharacterClassNode(__position++);	characterSet._addTokenRange('0', '9');	characterSet._addTokenRange('a', 'z');	characterSet._addTokenRange('A', 'Z');	characterSet._addToken('_');	current = characterSet;	break;      case 'W' :	characterSet = new NegativeCharacterClassNode(__position++);	characterSet._addTokenRange('0', '9');	characterSet._addTokenRange('a', 'z');	characterSet._addTokenRange('A', 'Z');	characterSet._addToken('_');	current = characterSet;	break;      case 's' :	characterSet = new CharacterClassNode(__position++);	characterSet._addToken(' ');	characterSet._addToken('\f');	characterSet._addToken('\n');	characterSet._addToken('\r');	characterSet._addToken('\t');	current = characterSet;	
break;      case 'S' :	characterSet = new NegativeCharacterClassNode(__position++);	characterSet._addToken(' ');	characterSet._addToken('\f');	characterSet._addToken('\n');	characterSet._addToken('\r');	characterSet._addToken('\t');	current = characterSet;	break;	default  : current = _newTokenNode(token, __position++); break;      }      __match(__lookahead);    }    return current;  }  private SyntaxNode __atom() throws MalformedPatternException {    SyntaxNode current;    if(__lookahead == '(') {      __match('(');      ++__openParen;      current = __regex();      __match(')');      ++__closeParen;    } else if(__lookahead == '[')      current = __characterClass();    else if(__lookahead == '.') {      CharacterClassNode characterSet;      __match('.');      characterSet = new NegativeCharacterClassNode(__position++);      characterSet._addToken('\n');      current = characterSet;    } else if(__lookahead == '\\') {      current = __backslashToken();    } /*else if(__lookahead == '^') {      current =	new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);      __match('^');    } else if(__lookahead == '$') {      current =	new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);      __match('$');    } */ else if(!__isMetachar(__lookahead)) {      current = _newTokenNode(__lookahead, __position++);      __match(__lookahead);    } else      throw	new MalformedPatternException("Parse error: unexpected character " +				__lookahead + " at position " + __bytesRead);    return current;  }  private SyntaxNode __characterClass() throws MalformedPatternException {    char lastToken, token;    SyntaxNode node;    CharacterClassNode current;    __match('[');    __inCharacterClass = true;    if(__lookahead == '^'){      __match('^');      current = new NegativeCharacterClassNode(__position++);    } else      current = new CharacterClassNode(__position++);    while(__lookahead != ']' && __lookahead != _END_OF_INPUT) {      if(__lookahead == '\\'){	
node = __backslashToken();	--__position;	// __backslashToken() (actually newTokenNode()) does not take care of        // case insensitivity when __inCharacterClass is true.	if(node instanceof TokenNode){	  lastToken = ((TokenNode)node)._token;	  current._addToken(lastToken);	  if(!__caseSensitive)	    current._addToken(_toggleCase(lastToken));	} else {	  CharacterClassNode slash;	  slash = (CharacterClassNode)node;	  // This could be made more efficient by manipulating the	  // characterSet elements of the CharacterClassNodes but	  // for the moment, this is more clear.	  for(token=0; token < LeafNode._NUM_TOKENS; token++){	    if(slash._matches(token))	      current._addToken(token);	  }	  // A byproduct of this act is that when a '-' occurs after	  // a \d, \w, etc. it is not interpreted as a range and no	  // parse exception is thrown.	  // This is considered a feature and not a bug for now.	  continue;	}      } else {	lastToken = __lookahead;	current._addToken(__lookahead);	if(!__caseSensitive)	  current._addToken(_toggleCase(__lookahead));	__match(__lookahead);      }      // In Perl, a - is a token if it occurs at the beginning      // or end of the character class.  Anywhere else, it indicates      // a range.      // A byproduct of this implementation is that if a '-' occurs      // after the end of a range, it is interpreted as a '-' and no      // exception is thrown. e.g., the second dash in [a-z-x]      // This is considered a feature and not a bug for now.      
if(__lookahead == '-'){	__match('-');	if(__lookahead == ']'){	  current._addToken('-');	  break;	} else if(__lookahead == '\\') {	  node = __backslashToken();	  --__position;	  if(node instanceof TokenNode)	    token = ((TokenNode)node)._token;	  else	    throw new MalformedPatternException(	   "Parse error: invalid range specified at position " + __bytesRead);	} else {	  token = __lookahead;	  __match(__lookahead);	}	if(token < lastToken)	  throw new MalformedPatternException(	 "Parse error: invalid range specified at position " + __bytesRead);	current._addTokenRange(lastToken + 1, token);	if(!__caseSensitive)	  current._addTokenRange(_toggleCase((char)(lastToken + 1)),				_toggleCase(token));      }    }    __match(']');    __inCharacterClass = false;    return current;  }  SyntaxNode _newTokenNode(char token, int position){    if(!__inCharacterClass && !__caseSensitive &&       (_isUpperCase(token) || _isLowerCase(token))){      CharacterClassNode node = new CharacterClassNode(position);      node._addToken(token);      node._addToken(_toggleCase(token));      return node;    }    return new TokenNode(token, position);  }  SyntaxTree _parse(char[] expression) throws MalformedPatternException {    SyntaxTree tree;    __openParen = __closeParen = 0;    __regularExpression = expression;    __bytesRead = 0;    __expressionLength = expression.length;    __inCharacterClass = false;    __position = 0;    __match(__lookahead); // Call match to read first input.    
if(__lookahead == '^') {      __beginAnchor = true;      __match(__lookahead);    }    if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') {      --__expressionLength;      __endAnchor = true;    }    if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) {      CatNode root;      root = new CatNode();      root._left  = __regex();      // end marker      root._right =	new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++);      tree = new SyntaxTree(root, __position);    } else       tree = new	SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1);    tree._computeFollowPositions();    return tree;  }  /**   * Compiles an Awk regular expression into an AwkPattern instance that   * can be used by an AwkMatcher object to perform pattern matching.   * <p>   * @param pattern  An Awk regular expression to compile.   * @param options  A set of flags giving the compiler instructions on   *                 how to treat the regular expression.  Currently the   *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.   * @return A Pattern instance constituting the compiled regular expression.   *         This instance will always be an AwkPattern and can be reliably   *         be casted to an AwkPattern.   * @exception MalformedPatternException  If the compiled expression   *  is not a valid Awk regular expression.   
*/  public Pattern compile(char[] pattern, int options)       throws MalformedPatternException  {    SyntaxTree tree;    AwkPattern regexp;    __beginAnchor = __endAnchor = false;    __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);    tree   = _parse(pattern);    regexp = new AwkPattern(new String(pattern), tree);    regexp._options = options;    regexp._hasBeginAnchor = __beginAnchor;    regexp._hasEndAnchor   = __endAnchor;    return regexp;  }  /**   * Compiles an Awk regular expression into an AwkPattern instance that   * can be used by an AwkMatcher object to perform pattern matching.   * <p>   * @param pattern  An Awk regular expression to compile.   * @param options  A set of flags giving the compiler instructions on   *                 how to treat the regular expression.  Currently the   *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.   * @return A Pattern instance constituting the compiled regular expression.   *         This instance will always be an AwkPattern and can be reliably   *         be casted to an AwkPattern.   * @exception MalformedPatternException  If the compiled expression   *  is not a valid Awk regular expression.   */  public Pattern compile(String pattern, int options)       throws MalformedPatternException  {    SyntaxTree tree;    AwkPattern regexp;    __beginAnchor = __endAnchor = false;    __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);    tree   = _parse(pattern.toCharArray());    regexp = new AwkPattern(pattern, tree);    regexp._options = options;    regexp._hasBeginAnchor = __beginAnchor;    regexp._hasEndAnchor   = __endAnchor;    return regexp;  }  /**   * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>   * <p>   * @param pattern  A regular expression to compile.   * @return A Pattern instance constituting the compiled regular expression.   *         This instance will always be an AwkPattern and can be reliably   *         be casted to an AwkPattern.   
* @exception MalformedPatternException  If the given expression
   *  is not a valid Awk regular expression.
   */
  public Pattern compile(char[] pattern) throws MalformedPatternException {
    final int defaultOptions = DEFAULT_MASK;
    return compile(pattern, defaultOptions);
  }

  /**
   * Compiles a regular expression given as a String using the default
   * options; equivalent to
   * <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
   * <p>
   * @param pattern  A regular expression to compile.
   * @return A Pattern instance constituting the compiled regular expression.
   *         This instance will always be an AwkPattern and can reliably
   *         be cast to an AwkPattern.
   * @exception MalformedPatternException  If the given expression
   *  is not a valid Awk regular expression.
   */
  public Pattern compile(String pattern) throws MalformedPatternException {
    final int defaultOptions = DEFAULT_MASK;
    return compile(pattern, defaultOptions);
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -