📄 awkcompiler.java
字号:
} atom = new QuestionNode(atom._clone(startPosition)); count = max-min; if(count == 1) catNode._right = atom; else { catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; while(--count > 1) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = atom._clone(startPosition); } } } } else throw new MalformedPatternException("Parse error: unexpected character " + __lookahead + " in interval at position " + __bytesRead); __position = startPosition[0]; return root; } private SyntaxNode __backslashToken() throws MalformedPatternException { SyntaxNode current; char token; int number; __match('\\'); if(__lookahead == 'x'){ __match('x'); // Parse a hexadecimal number current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2), __position++); } else if(__lookahead == 'c') { __match('c'); // Create a control character token = Character.toUpperCase(__lookahead); token = (char)(token > 63 ? token - 64 : token + 64); current = new TokenNode(token, __position++); __match(__lookahead); } else if(__lookahead >= '0' && __lookahead <= '9') { __match(__lookahead); if(__lookahead >= '0' && __lookahead <= '9'){ // We have an octal character or a multi-digit backreference. // Assume octal character for now. __putback(); number = __parseUnsignedInteger(10, 2, 3); number = Integer.parseInt(Integer.toString(number), 8); current = _newTokenNode((char)number, __position++); } else { // We have either \0, an escaped digit, or a backreference. __putback(); if(__lookahead == '0'){ // \0 matches the null character __match('0'); current = new TokenNode('\0', __position++); } else { // Either an escaped digit or backreference. number = Character.digit(__lookahead, 10); current = _newTokenNode(__lookahead, __position++); __match(__lookahead); } } } else if(__lookahead == 'b') { // Inside of a character class the \b means backspace, otherwise // it means a word boundary //if(__inCharacterClass) // \b always means backspace current = new TokenNode('\b', __position++); /* else current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN, position++); */ __match('b'); } /*else if(__lookahead == 'B' && !__inCharacterClass){ current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN, position++); __match('B'); } */ else { CharacterClassNode characterSet; token = __lookahead; switch(__lookahead){ case 'n' : token = '\n'; break; case 'r' : token = '\r'; break; case 't' : token = '\t'; break; case 'f' : token = '\f'; break; } switch(token) { case 'd' : characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'D' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'w' : characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 'W' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 's' : characterSet = new CharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; case 'S' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; default : current = _newTokenNode(token, __position++); break; } __match(__lookahead); } return current; } private SyntaxNode __atom() throws MalformedPatternException { SyntaxNode current; if(__lookahead == '(') { __match('('); ++__openParen; current = __regex(); __match(')'); ++__closeParen; } else if(__lookahead == '[') current = __characterClass(); else if(__lookahead == '.') { CharacterClassNode characterSet; __match('.'); characterSet = new NegativeCharacterClassNode(__position++); characterSet._addToken('\n'); current = characterSet; } else if(__lookahead == '\\') { current = __backslashToken(); } /*else if(__lookahead == '^') { current = new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++); __match('^'); } else if(__lookahead == '$') { current = new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++); __match('$'); } */ else if(!__isMetachar(__lookahead)) { current = _newTokenNode(__lookahead, __position++); __match(__lookahead); } else throw new MalformedPatternException("Parse error: unexpected character " + __lookahead + " at position " + __bytesRead); return current; } private SyntaxNode __characterClass() throws MalformedPatternException { char lastToken, token; SyntaxNode node; CharacterClassNode current; __match('['); __inCharacterClass = true; if(__lookahead == '^'){ __match('^'); current = new NegativeCharacterClassNode(__position++); } else current = new CharacterClassNode(__position++); while(__lookahead != ']' && __lookahead != _END_OF_INPUT) { if(__lookahead == '\\'){ node = __backslashToken(); --__position; // __backslashToken() (actually newTokenNode()) does not take care of // case insensitivity when __inCharacterClass is true. if(node instanceof TokenNode){ lastToken = ((TokenNode)node)._token; current._addToken(lastToken); if(!__caseSensitive) current._addToken(_toggleCase(lastToken)); } else { CharacterClassNode slash; slash = (CharacterClassNode)node; // This could be made more efficient by manipulating the // characterSet elements of the CharacterClassNodes but // for the moment, this is more clear. for(token=0; token < LeafNode._NUM_TOKENS; token++){ if(slash._matches(token)) current._addToken(token); } // A byproduct of this act is that when a '-' occurs after // a \d, \w, etc. it is not interpreted as a range and no // parse exception is thrown. // This is considered a feature and not a bug for now. continue; } } else { lastToken = __lookahead; current._addToken(__lookahead); if(!__caseSensitive) current._addToken(_toggleCase(__lookahead)); __match(__lookahead); } // In Perl, a - is a token if it occurs at the beginning // or end of the character class. Anywhere else, it indicates // a range. // A byproduct of this implementation is that if a '-' occurs // after the end of a range, it is interpreted as a '-' and no // exception is thrown. e.g., the second dash in [a-z-x] // This is considered a feature and not a bug for now. if(__lookahead == '-'){ __match('-'); if(__lookahead == ']'){ current._addToken('-'); break; } else if(__lookahead == '\\') { node = __backslashToken(); --__position; if(node instanceof TokenNode) token = ((TokenNode)node)._token; else throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); } else { token = __lookahead; __match(__lookahead); } if(token < lastToken) throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); current._addTokenRange(lastToken + 1, token); if(!__caseSensitive) current._addTokenRange(_toggleCase((char)(lastToken + 1)), _toggleCase(token)); } } __match(']'); __inCharacterClass = false; return current; } SyntaxNode _newTokenNode(char token, int position){ if(!__inCharacterClass && !__caseSensitive && (_isUpperCase(token) || _isLowerCase(token))){ CharacterClassNode node = new CharacterClassNode(position); node._addToken(token); node._addToken(_toggleCase(token)); return node; } return new TokenNode(token, position); } SyntaxTree _parse(char[] expression) throws MalformedPatternException { SyntaxTree tree; __openParen = __closeParen = 0; __regularExpression = expression; __bytesRead = 0; __expressionLength = expression.length; __inCharacterClass = false; __position = 0; __match(__lookahead); // Call match to read first input. if(__lookahead == '^') { __beginAnchor = true; __match(__lookahead); } if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') { --__expressionLength; __endAnchor = true; } if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) { CatNode root; root = new CatNode(); root._left = __regex(); // end marker root._right = new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++); tree = new SyntaxTree(root, __position); } else tree = new SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1); tree._computeFollowPositions(); return tree; } /** * Compiles an Awk regular expression into an AwkPattern instance that * can be used by an AwkMatcher object to perform pattern matching. * <p> * @param pattern An Awk regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. Currently the * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(char[] pattern, int options) throws MalformedPatternException { SyntaxTree tree; AwkPattern regexp; __beginAnchor = __endAnchor = false; __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); tree = _parse(pattern); regexp = new AwkPattern(new String(pattern), tree); regexp._options = options; regexp._hasBeginAnchor = __beginAnchor; regexp._hasEndAnchor = __endAnchor; return regexp; } /** * Compiles an Awk regular expression into an AwkPattern instance that * can be used by an AwkMatcher object to perform pattern matching. * <p> * @param pattern An Awk regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. Currently the * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(String pattern, int options) throws MalformedPatternException { SyntaxTree tree; AwkPattern regexp; __beginAnchor = __endAnchor = false; __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); tree = _parse(pattern.toCharArray()); regexp = new AwkPattern(pattern, tree); regexp._options = options; regexp._hasBeginAnchor = __beginAnchor; regexp._hasEndAnchor = __endAnchor; return regexp; } /** * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b> * <p> * @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(char[] pattern) throws MalformedPatternException { return compile(pattern, DEFAULT_MASK); } /** * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b> * <p> * @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(String pattern) throws MalformedPatternException { return compile(pattern, DEFAULT_MASK); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -