📄 regcomp.java
字号:
/* * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * * Free Software Foundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson *//* * XXX: anchored expressions should have flags for quick matching. */package com.caucho.quercus.lib.regexp;import java.util.*;import java.util.logging.*;import com.caucho.quercus.env.StringValue;import com.caucho.quercus.env.StringBuilderValue;import com.caucho.util.*;/** * Regular expression compilation. 
*/class Regcomp { private static final Logger log = Logger.getLogger(Regcomp.class.getName()); private static final L10N L = new L10N(RegexpNode.class); // #2526, JIT issues with Integer.MAX_VALUE private static final int INTEGER_MAX = Integer.MAX_VALUE - 1; static final int MULTILINE = 0x1; static final int SINGLE_LINE = 0x2; static final int IGNORE_CASE = 0x4; static final int IGNORE_WS = 0x8; static final int GLOBAL = 0x10; static final int ANCHORED = 0x20; static final int END_ONLY = 0x40; static final int UNGREEDY = 0x80; static final int STRICT = 0x100; static final HashMap<String,Integer> _characterClassMap = new HashMap<String,Integer>(); int _nGroup; int _nLoop; int _maxGroup; int _flags; HashMap<Integer,StringValue> _groupNameMap = new HashMap<Integer,StringValue>(); HashMap<StringValue,Integer> _groupNameReverseMap = new HashMap<StringValue,Integer>(); RegexpNode _groupTail; boolean _isLookbehind; boolean _isOr; Regcomp(int flags) { _flags = flags; } boolean isGreedy() { return (_flags & UNGREEDY) != UNGREEDY; } boolean isIgnoreCase() { return (_flags & IGNORE_CASE) == IGNORE_CASE; } boolean isIgnoreWs() { return (_flags & IGNORE_WS) == IGNORE_WS; } boolean isMultiline() { return (_flags & MULTILINE) == MULTILINE; } boolean isDollarEndOnly() { return (_flags & END_ONLY) == END_ONLY; } int nextLoopIndex() { return _nLoop++; } RegexpNode parse(PeekStream pattern) throws IllegalRegexpException { _nGroup = 1; RegexpNode begin = null; if ((_flags & ANCHORED) != 0) begin = RegexpNode.ANCHOR_BEGIN; RegexpNode value = parseRec(pattern, begin); int ch; while ((ch = pattern.read()) == '|') { value = RegexpNode.Or.create(value, parseRec(pattern, begin)); } value = value != null ? value.getHead() : RegexpNode.N_END; if (_maxGroup < _nGroup) _maxGroup = _nGroup; if (log.isLoggable(Level.FINEST)) log.finest("regexp[] " + value); return value; } /** * Recursively compile a RegexpNode. 
*
   * Reads one character from the pattern, builds the node for that token,
   * and recurses to compile the rest of the pattern. Quantifiers wrap the
   * node passed in as "tail"; most other tokens are parsed into a new node
   * that becomes the tail of the recursive call, and the result is joined
   * to the incoming tail with concat().
   *
   * A '|' or ')' is pushed back onto the stream so that the caller handling
   * the alternation or the group can read it.
   *
   * Original author's sketch of the node chains:
   *
   * head -> node
   *         v -- rest
   *         ...
   *         v -- rest
   *         node
   *
   * last -> node
   *         v -- rest
   *         ...
   *         v -- rest
   *         node
   *
   * @param pattern the pattern stream, positioned at the next token
   * @param tail the node parsed immediately before this token, or null
   *        when nothing precedes it
   * @return the compiled node; may be null when the input is empty
   * @throws IllegalRegexpException on a malformed pattern
   */
  private RegexpNode parseRec(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();
    RegexpNode next;
    RegexpNode groupTail;

    switch (ch) {
    case -1:
      // end of the pattern
      return tail != null ? tail.getHead() : null;

    case '?':
      // zero-or-one repetition of the preceding node
      if (tail == null)
        throw error(L.l("'?' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 0, 1);

      return parseRec(pattern, tail.getTail());

    case '*':
      // zero-or-more repetition of the preceding node
      if (tail == null)
        throw error(L.l("'*' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 0, INTEGER_MAX);

      return parseRec(pattern, tail.getTail());

    case '+':
      // one-or-more repetition of the preceding node
      if (tail == null)
        throw error(L.l("'+' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 1, INTEGER_MAX);

      return parseRec(pattern, tail.getTail());

    case '{':
      // '{' is a literal unless it follows a node and is followed by a digit
      if (tail == null
          || ! ('0' <= pattern.peek() && pattern.peek() <= '9')) {
        next = parseString('{', pattern);

        return concat(tail, parseRec(pattern, next));
      }

      return parseRec(pattern, parseBrace(pattern, tail).getTail());

    case '.':
      if ((_flags & SINGLE_LINE) == 0)
        next = RegexpNode.DOT;
      else
        next = RegexpNode.ANY_CHAR;

      return concat(tail, parseRec(pattern, next));

    case '|':
      // leave the '|' for the caller that handles the alternation
      pattern.ungetc(ch);

      // NOTE(review): tail can be null here (parse() passes null when
      // ANCHORED is not set, and the lookaround branches below pass null
      // with _groupTail cleared), so a pattern or lookaround starting
      // with '|' looks like it throws NullPointerException -- confirm.
      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        return tail.getHead();

    case '(':
      {
        switch (pattern.peek()) {
        case '?':
          pattern.read();

          switch (pattern.peek()) {
          case ':':
            // (?: -- parsed as group number 0
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);

          case '#':
            // (?# -- comment group, skipped entirely
            parseCommentGroup(pattern);

            return parseRec(pattern, tail);

          case '(':
            // (?( -- conditional
            return parseConditional(pattern, tail);

          case '=':
          case '!':
            // (?= and (?! -- positive and negative lookahead
            ch = pattern.read();

            boolean isPositive = (ch == '=');

            // parse the lookahead body without an enclosing group tail
            groupTail = _groupTail;
            _groupTail = null;

            next = parseRec(pattern, null);

            // NOTE(review): next is null when the first alternative is
            // empty, so next.createOr() would fail -- confirm.
            while ((ch = pattern.read()) == '|') {
              RegexpNode nextHead = parseRec(pattern, null);
              next = next.createOr(nextHead);
            }

            if (isPositive)
              next = new RegexpNode.Lookahead(next);
            else
              next = new RegexpNode.NotLookahead(next);

            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));

            _groupTail = groupTail;

            return concat(tail, parseRec(pattern, next));

          case '<':
            // (?<= and (?<! -- positive and negative lookbehind
            pattern.read();

            switch (pattern.read()) {
            case '=':
              isPositive = true;
              break;
            case '!':
              isPositive = false;
              break;
            default:
              throw error(L.l("expected '=' or '!'"));
            }

            // parse the lookbehind body without an enclosing group tail
            groupTail = _groupTail;
            _groupTail = null;

            next = parseRec(pattern, null);

            // an empty alternative is left as null instead of being wrapped
            if (next == null) {
            }
            else if (isPositive)
              next = new RegexpNode.Lookbehind(next);
            else
              next = new RegexpNode.NotLookbehind(next);

            // each further alternative gets its own lookbehind wrapper
            while ((ch = pattern.read()) == '|') {
              RegexpNode second = parseRec(pattern, null);

              if (second == null) {
              }
              else if (isPositive)
                second = new RegexpNode.Lookbehind(second);
              else
                second = new RegexpNode.NotLookbehind(second);

              if (second != null)
                next = next.createOr(second);
            }

            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));

            _groupTail = groupTail;

            return concat(tail, parseRec(pattern, next));

            // XXX: once-only subpatterns (mostly an optimization feature)
          case '>':
            // parsed exactly like (?: above
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);

          case 'P':
            // (?P<name> and (?P=name
            pattern.read();
            return parseNamedGroup(pattern, tail);

          case 'm': case 's': case 'i': case 'x': case 'g':
          case 'U': case 'X':
            {
              // inline options; remember the flags in effect before them
              int flags = _flags;

              while ((ch = pattern.read()) > 0 && ch != ')') {
                switch (ch) {
                case 'm': _flags |= MULTILINE; break;
                case 's': _flags |= SINGLE_LINE; break;
                case 'i': _flags |= IGNORE_CASE; break;
                case 'x': _flags |= IGNORE_WS; break;
                case 'g': _flags |= GLOBAL; break;
                case 'U': _flags |= UNGREEDY; break;
                case 'X': _flags |= STRICT; break;
                case ':':
                  {
                    // (?opts: -- the saved flags are handed to parseGroup,
                    // presumably so it can restore them; verify there
                    return parseGroup(pattern, tail, 0, flags);
                  }
                default:
                  throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) ch)));
                }
              }

              if (ch != ')')
                throw error(L.l("expected ')' at '{0}'",
                                String.valueOf((char) ch)));

              // (?opts) -- the options apply while the rest is parsed
              RegexpNode node = parseRec(pattern, tail);

              _flags = flags;

              return node;
            }

          default:
            throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) pattern.peek())));
          }

        default:
          // plain '(' -- capturing group with the next group number
          return parseGroup(pattern, tail, _nGroup++, _flags);
        }
      }

    case ')':
      // leave the ')' for the caller that opened the group
      pattern.ungetc(ch);

      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        return tail;

    case '[':
      next = parseSet(pattern);

      return concat(tail, parseRec(pattern, next));

    case '\\':
      next = parseSlash(pattern);

      return concat(tail, parseRec(pattern, next));

    case '^':
      if (isMultiline())
        next = RegexpNode.ANCHOR_BEGIN_OR_NEWLINE;
      else
        next = RegexpNode.ANCHOR_BEGIN;

      return concat(tail, parseRec(pattern, next));

    case '$':
      if (isMultiline())
        next = RegexpNode.ANCHOR_END_OR_NEWLINE;
      else if (isDollarEndOnly())
        next = RegexpNode.ANCHOR_END_ONLY;
      else
        next = RegexpNode.ANCHOR_END;

      return concat(tail, parseRec(pattern, next));

    case ' ': case '\n': case '\t': case '\r':
      if (isIgnoreWs()) {
        // skip the whole run of whitespace
        // NOTE(review): Character.isSpace is deprecated in the JDK
        while (Character.isSpace((char) pattern.peek()))
          pattern.read();

        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);

        return concat(tail, parseRec(pattern, next));
      }

    case '#':
      if (isIgnoreWs()) {
        // '#' starts a comment that runs to the end of the line
        while ((ch = pattern.read()) > 0 && ch!= '\n') {
        }

        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);

        return concat(tail, parseRec(pattern, next));
      }

    default:
      // anything else starts a literal string
      next = parseString(ch, pattern);

      return concat(tail, parseRec(pattern, next));
    }
  }

  /**
   * Skips a comment group: consumes characters up to and including the
   * next ')' or until the end of the pattern.
   */
  private void parseCommentGroup(PeekStream pattern)
  {
    int ch;

    // (?#...)
Comment while ((ch = pattern.read()) >= 0 && ch != ')') { } } private RegexpNode parseNamedGroup(PeekStream pattern, RegexpNode tail) throws IllegalRegexpException { int ch = pattern.read(); if (ch == '=') { StringBuilder sb = new StringBuilder(); while ((ch = pattern.read()) != ')' && ch >= 0) { sb.append((char) ch); } if (ch != ')') throw error(L.l("expected ')'")); String name = sb.toString(); Integer v = _groupNameReverseMap.get(new StringBuilderValue(name)); if (v != null) { RegexpNode next = new RegexpNode.GroupRef(v); return concat(tail, parseRec(pattern, next)); } else throw error(L.l("'{0}' is an unknown regexp group", name)); } else if (ch == '<') { StringBuilder sb = new StringBuilder(); while ((ch = pattern.read()) != '>' && ch >= 0) { sb.append((char) ch); } if (ch != '>') throw error(L.l("expected '>'")); String name = sb.toString(); int group = _nGroup++; _groupNameMap.put(group, new StringBuilderValue(name)); _groupNameReverseMap.put(new StringBuilderValue(name), group); return parseGroup(pattern, tail, group, _flags); } else throw error(L.l("Expected '(?:P=name' or '(?:P<name' for named group")); } private RegexpNode parseConditional(PeekStream pattern, RegexpNode tail) throws IllegalRegexpException { int ch = pattern.read(); if (ch != '(') throw error(L.l("expected '('")); RegexpNode.ConditionalHead groupHead = null;; RegexpNode groupTail = null; if ('1' <= (ch = pattern.peek()) && ch <= '9') { int value = 0; while ('0' <= (ch = pattern.read()) && ch <= '9') { value = 10 * value + ch - '0'; } if (ch != ')') throw error(L.l("expected ')'")); if (_nGroup <= value) throw error(L.l("conditional value less than number of groups")); groupHead = new RegexpNode.ConditionalHead(value);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -