📄 regexparser.java
字号:
/* * Copyright 1999-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package com.sun.org.apache.xerces.internal.impl.xpath.regex;import java.util.Locale;import java.util.MissingResourceException;import java.util.ResourceBundle;import java.util.Vector;/** * A Regular Expression Parser. * * @xerces.internal * * @version $Id: RegexParser.java,v 1.2.6.1 2005/09/06 11:46:34 neerajbj Exp $ */class RegexParser { static final int T_CHAR = 0; static final int T_EOF = 1; static final int T_OR = 2; // '|' static final int T_STAR = 3; // '*' static final int T_PLUS = 4; // '+' static final int T_QUESTION = 5; // '?' static final int T_LPAREN = 6; // '(' static final int T_RPAREN = 7; // ')' static final int T_DOT = 8; // '.' static final int T_LBRACKET = 9; // '[' static final int T_BACKSOLIDUS = 10; // '\' static final int T_CARET = 11; // '^' static final int T_DOLLAR = 12; // '$' static final int T_LPAREN2 = 13; // '(?:' static final int T_LOOKAHEAD = 14; // '(?=' static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' static final int T_LOOKBEHIND = 16; // '(?<=' static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' static final int T_INDEPENDENT = 18; // '(?>' static final int T_SET_OPERATIONS = 19; // '(?[' static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class static final int T_COMMENT = 21; // '(?#' static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] static final int T_CONDITION = 23; // '(?(' static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class static class ReferencePosition { int refNumber; int position; ReferencePosition(int n, int pos) { this.refNumber = n; this.position = pos; } } int offset; String regex; int regexlen; int options; ResourceBundle resources; int chardata; int nexttoken; static protected final int S_NORMAL = 0; static protected final int S_INBRACKETS = 1; static protected final int S_INXBRACKETS = 2; int context = S_NORMAL; int parennumber = 1; boolean hasBackReferences; Vector references = null; public RegexParser() { this.setLocale(Locale.getDefault()); } public RegexParser(Locale locale) { this.setLocale(locale); } public void setLocale(Locale locale) { try { this.resources = ResourceBundle.getBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale); } catch (MissingResourceException mre) { throw new RuntimeException("Installation Problem??? Couldn't load messages: " +mre.getMessage()); } } final ParseException ex(String key, int loc) { return new ParseException(this.resources.getString(key), loc); } private final boolean isSet(int flag) { return (this.options & flag) == flag; } synchronized Token parse(String regex, int options) throws ParseException { this.options = options; this.offset = 0; this.setContext(S_NORMAL); this.parennumber = 1; this.hasBackReferences = false; this.regex = regex; if (this.isSet(RegularExpression.EXTENDED_COMMENT)) this.regex = REUtil.stripExtendedComment(this.regex); this.regexlen = this.regex.length(); this.next(); Token ret = this.parseRegex(); if (this.offset != this.regexlen) throw ex("parser.parse.1", this.offset); if (this.references != null) { for (int i = 0; i < this.references.size(); i ++) { ReferencePosition position = (ReferencePosition)this.references.elementAt(i); if (this.parennumber <= position.refNumber) throw ex("parser.parse.2", position.position); } this.references.removeAllElements(); } return ret; } /* public RegularExpression createRegex(String regex, int options) throws ParseException { Token tok = this.parse(regex, options); return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); } */ protected final void setContext(int con) { this.context = con; } final int read() { return this.nexttoken; } final void next() { if (this.offset >= this.regexlen) { this.chardata = -1; this.nexttoken = T_EOF; return; } int ret; int ch = this.regex.charAt(this.offset++); this.chardata = ch; if (this.context == S_INBRACKETS) { // In a character class, this.chardata has one character, that is to say, // a pair of surrogates is composed and stored to this.chardata. switch (ch) { case '\\': ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; case '-': if (this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { this.offset++; ret = T_XMLSCHEMA_CC_SUBTRACTION; } else ret = T_CHAR; break; case '[': if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { this.offset++; ret = T_POSIX_CHARCLASS_START; break; } // Through down default: if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { int low = this.regex.charAt(this.offset); if (REUtil.isLowSurrogate(low)) { this.chardata = REUtil.composeFromSurrogates(ch, low); this.offset ++; } } ret = T_CHAR; } this.nexttoken = ret; return; } switch (ch) { case '|': ret = T_OR; break; case '*': ret = T_STAR; break; case '+': ret = T_PLUS; break; case '?': ret = T_QUESTION; break; case ')': ret = T_RPAREN; break; case '.': ret = T_DOT; break; case '[': ret = T_LBRACKET; break; case '^': ret = T_CARET; break; case '$': ret = T_DOLLAR; break; case '(': ret = T_LPAREN; if (this.offset >= this.regexlen) break; if (this.regex.charAt(this.offset) != '?') break; if (++this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-1); ch = this.regex.charAt(this.offset++); switch (ch) { case ':': ret = T_LPAREN2; break; case '=': ret = T_LOOKAHEAD; break; case '!': ret = T_NEGATIVELOOKAHEAD; break; case '[': ret = T_SET_OPERATIONS; break; case '>': ret = T_INDEPENDENT; break; case '<': if (this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-3); ch = this.regex.charAt(this.offset++); if (ch == '=') { ret = T_LOOKBEHIND; } else if (ch == '!') { ret = T_NEGATIVELOOKBEHIND; } else throw ex("parser.next.3", this.offset-3); break; case '#': while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset++); if (ch == ')') break; } if (ch != ')') throw ex("parser.next.4", this.offset-1); ret = T_COMMENT; break; default: if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options this.offset --; ret = T_MODIFIERS; break; } else if (ch == '(') { // conditional ret = T_CONDITION; // this.offsets points the next of '('. break; } throw ex("parser.next.2", this.offset-2); } break; case '\\': ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; default: ret = T_CHAR; } this.nexttoken = ret; } /** * regex ::= term (`|` term)* * term ::= factor+ * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' * | atom (('*' | '+' | '?' | minmax ) '?'? )?) * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block */ Token parseRegex() throws ParseException { Token tok = this.parseTerm(); Token parent = null; while (this.read() == T_OR) { this.next(); // '|' if (parent == null) { parent = Token.createUnion(); parent.addChild(tok); tok = parent; } tok.addChild(this.parseTerm()); } return tok; } /** * term ::= factor+ */ Token parseTerm() throws ParseException { int ch = this.read(); if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { return Token.createEmpty(); } else { Token tok = this.parseFactor(); Token concat = null; while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { if (concat == null) { concat = Token.createConcat(); concat.addChild(tok); tok = concat; } concat.addChild(this.parseFactor()); //tok = Token.createConcat(tok, this.parseFactor()); } return tok; } } // ---------------------------------------------------------------- Token processCaret() throws ParseException { this.next(); return Token.token_linebeginning; } Token processDollar() throws ParseException { this.next(); return Token.token_lineend; } Token processLookahead() throws ParseException { this.next(); Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processNegativelookahead() throws ParseException { this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processLookbehind() throws ParseException { this.next(); Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processNegativelookbehind() throws ParseException { this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processBacksolidus_A() throws ParseException { this.next(); return Token.token_stringbeginning; } Token processBacksolidus_Z() throws ParseException { this.next(); return Token.token_stringend2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -