📄 perl5compiler.java
字号:
package org.apache.oro.text.regex;/* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2000 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" * must not be used to endorse or promote products derived from this * software without prior written permission. For written * permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache" * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their * name, without prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. * * Portions of this software are based upon software originally written * by Daniel F. Savarese. We appreciate his contributions. *//** * The Perl5Compiler class is used to create compiled regular expressions * conforming to the Perl5 regular expression syntax. It generates * Perl5Pattern instances upon compilation to be used in conjunction * with a Perl5Matcher instance. Please see the user's guide for more * information about Perl5 regular expressions. @author <a href="mailto:dfs@savarese.org">Daniel F. 
Savarese</a> @version $Id: Perl5Compiler.java,v 1.1 2004/01/10 00:58:23 mikedemmer Exp $ * @see PatternCompiler * @see MalformedPatternException * @see Perl5Pattern * @see Perl5Matcher */public final class Perl5Compiler implements PatternCompiler { private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2, __SPSTART = 0x4, __TRYAGAIN = 0x8; private static final char __CASE_INSENSITIVE = 0x0001, __GLOBAL = 0x0002, __KEEP = 0x0004, __MULTILINE = 0x0008, __SINGLELINE = 0x0010, __EXTENDED = 0x0020, __READ_ONLY = 0x8000; private static final String __META_CHARS = "^$.[()|?+*\\"; private static final String __HEX_DIGIT = "0123456789abcdef0123456789ABCDEFx"; private CharStringPointer __input; private boolean __sawBackreference; private char[] __modifierFlags = { 0 }; // IMPORTANT: __numParentheses starts out equal to 1 during compilation. // It is always one greater than the number of parentheses encountered // so far in the regex. That is because it refers to the number of groups // to save, and the entire match is always saved (group 0) private int __numParentheses, __programSize, __cost; // When doing the second pass and actually generating code, __programSize // keeps track of the current offset. private char[] __program; /** * The default mask for the {@link #compile compile} methods. * It is equal to 0. * The default behavior is for a regular expression to be case sensitive * and to not specify if it is multiline or singleline. When MULITLINE_MASK * and SINGLINE_MASK are not defined, the <b>^</b>, <b>$</b>, and <b>.</b> * metacharacters are * interpreted according to the value of isMultiline() in Perl5Matcher. * The default behavior of Perl5Matcher is to treat the Perl5Pattern * as though MULTILINE_MASK were enabled. If isMultiline() returns false, * then the pattern is treated as though SINGLINE_MASK were set. 
However, * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks * will ALWAYS override whatever behavior is specified by the setMultiline() * in Perl5Matcher. */ public static final int DEFAULT_MASK = 0; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should be case insensitive. */ public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should treat input as having * multiple lines. This option affects the interpretation of * the <b>^</b> and <b>$</b> metacharacters. When this mask is used, * the <b>^</b> metacharacter matches at the beginning of every line, * and the <b>$</b> metacharacter matches at the end of every line. * Additionally the <b> . </b> metacharacter will not match newlines when * an expression is compiled with <b> MULTILINE_MASK </b>, which is its * default behavior. * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be * used together. */ public static final int MULTILINE_MASK = __MULTILINE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should treat input as being * a single line. This option affects the interpretation of * the <b>^</b> and <b>$</b> metacharacters. When this mask is used, * the <b>^</b> metacharacter matches at the beginning of the input, * and the <b>$</b> metacharacter matches at the end of the input. * The <b>^</b> and <b>$</b> metacharacters will not match at the beginning * and end of lines occurring between the begnning and end of the input. * Additionally, the <b> . </b> metacharacter will match newlines when * an expression is compiled with <b> SINGLELINE_MASK </b>, unlike its * default behavior. * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be * used together. 
*/ public static final int SINGLELINE_MASK = __SINGLELINE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should be treated as a Perl5 * extended pattern (i.e., a pattern using the <b>/x</b> modifier). This * option tells the compiler to ignore whitespace that is not backslashed or * within a character class. It also tells the compiler to treat the * <b>#</b> character as a metacharacter introducing a comment as in * Perl. In other words, the <b>#</b> character will comment out any * text in the regular expression between it and the next newline. * The intent of this option is to allow you to divide your patterns * into more readable parts. It is provided to maintain compatibility * with Perl5 regular expressions, although it will not often * make sense to use it in Java. */ public static final int EXTENDED_MASK = __EXTENDED; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate that the resulting Perl5Pattern should be treated as a * read only data structure by Perl5Matcher, making it safe to share * a single Perl5Pattern instance among multiple threads without needing * synchronization. Without this option, Perl5Matcher reserves the right * to store heuristic or other information in Perl5Pattern that might * accelerate future matches. When you use this option, Perl5Matcher will * not store or modify any information in a Perl5Pattern. Use this option * when you want to share a Perl5Pattern instance among multiple threads * using different Perl5Matcher instances. */ public static final int READ_ONLY_MASK = __READ_ONLY; /** * Given a character string, returns a Perl5 expression that interprets * each character of the original string literally. In other words, all * special metacharacters are quoted/escaped. This method is useful for * converting user input meant for literal interpretation into a safe * regular expression representing the literal input. 
* <p>
   * In effect, this method is the analog of the Perl5 quotemeta() builtin
   * method.
   * <p>
   * @param expression The expression to convert.
   * @return A String containing a Perl5 regular expression corresponding to
   *         a literal interpretation of the pattern.
   */
  public static final String quotemeta(char[] expression) {
    int ch;
    StringBuffer buffer;

    // Worst case every character needs a backslash in front of it.
    buffer = new StringBuffer(2*expression.length);

    for(ch = 0; ch < expression.length; ch++) {
      // Anything that is not a word character gets escaped.
      if(!OpCode._isWordCharacter(expression[ch]))
        buffer.append('\\');
      buffer.append(expression[ch]);
    }

    return buffer.toString();
  }

  /**
   * Given a character string, returns a Perl5 expression that interprets
   * each character of the original string literally.  In other words, all
   * special metacharacters are quoted/escaped.  This method is useful for
   * converting user input meant for literal interpretation into a safe
   * regular expression representing the literal input.
   * <p>
   * In effect, this method is the analog of the Perl5 quotemeta() builtin
   * method.
   * <p>
   * @param expression The expression to convert.
   * @return A String containing a Perl5 regular expression corresponding to
   *         a literal interpretation of the pattern.
   */
  public static final String quotemeta(String expression) {
    return quotemeta(expression.toCharArray());
  }

  // Returns true if ch is one of the single-character quantifiers
  // '*', '+' or '?'.
  private static boolean __isSimpleRepetitionOp(char ch) {
    return (ch == '*' || ch == '+' || ch == '?');
  }

  // Returns true if the character at offset is '*', '+' or '?', or if a
  // well-formed curly-brace repetition starts at offset.  An offset
  // outside of the array yields false.
  private static boolean __isComplexRepetitionOp(char[] ch, int offset) {
    if(offset < ch.length && offset >= 0)
      return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?'
              || (ch[offset] == '{' && __parseRepetition(ch, offset)));
    return false;
  }

  // determines if {\d+,\d*} is the next part of the string
  // The comma is optional, so the accepted forms are {n}, {n,} and {n,m}.
  private static boolean __parseRepetition(char[] str, int offset) {
    if(str[offset] != '{')
      return false;
    ++offset;

    // At least one digit is required right after the opening brace.
    if(offset >= str.length || !Character.isDigit(str[offset]))
      return false;

    while(offset < str.length && Character.isDigit(str[offset]))
      ++offset;

    if(offset < str.length && str[offset] == ',')
      ++offset;

    while(offset < str.length && Character.isDigit(str[offset]))
      ++offset;

    if(offset >= str.length || str[offset] != '}')
      return false;

    return true;
  }

  // Parses up to maxLength hexadecimal digits starting at str[offset] and
  // returns their value.  The number of characters consumed is stored in
  // scanned[0].  Parsing stops at the first character not found in
  // __HEX_DIGIT or at the end of the array.
  // NOTE(review): __HEX_DIGIT also ends in 'x', which is therefore accepted
  // here and contributes 0 (index 32 & 15) -- presumably intentional;
  // confirm before changing.
  private static int __parseHex(char[] str, int offset, int maxLength,
                                int[] scanned)
  {
    int val = 0, index;

    scanned[0] = 0;
    while(offset < str.length && maxLength-- > 0 &&
          (index = __HEX_DIGIT.indexOf(str[offset])) != -1) {
      val <<= 4;
      // Lower and upper case digits sit 16 positions apart in __HEX_DIGIT,
      // so masking the index gives the digit value for both.
      val |= (index & 15);
      ++offset;
      ++scanned[0];
    }

    return val;
  }

  // Parses up to maxLength octal digits ('0' through '7') starting at
  // str[offset] and returns their value.  The number of characters
  // consumed is stored in scanned[0].
  private static int __parseOctal(char[] str, int offset, int maxLength,
                                  int[] scanned)
  {
    int val = 0;

    scanned[0] = 0;
    while(offset < str.length &&
          maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') {
      val <<= 3;
      val |= (str[offset] - '0');
      --maxLength;
      ++offset;
      ++scanned[0];
    }

    return val;
  }

  // Sets in flags[0] the bit that corresponds to the modifier character ch.
  // Characters other than i, g, o, m, s and x leave flags untouched.
  private static void __setModifierFlag(char[] flags, char ch) {
    switch(ch) {
    case 'i' : flags[0] |= __CASE_INSENSITIVE; return;
    case 'g' : flags[0] |= __GLOBAL; return;
    case 'o' : flags[0] |= __KEEP; return;
    case 'm' : flags[0] |= __MULTILINE; return;
    case 's' : flags[0] |= __SINGLELINE; return;
    case 'x' : flags[0] |= __EXTENDED; return;
    }
  }

  // Emit a specific character code.
  // While __program is null only the size is counted; nothing is stored.
  private void __emitCode(char code) {
    if(__program != null)
      __program[__programSize] = code;

    ++__programSize;
  }

  // Emit an operator with no arguments.
  // Return an offset into the __program array as a pointer to node.
private int __emitNode(char operator) { int offset; offset = __programSize; if(__program == null) __programSize+=2; else { __program[__programSize++] = operator; __program[__programSize++] = OpCode._NULL_POINTER; } return offset; } // Emit an operator with arguments. // Return an offset into the __programarray as a pointer to node. private int __emitArgNode(char operator, char arg) { int offset; offset = __programSize; if(__program== null) __programSize+=3; else { __program[__programSize++] = operator; __program[__programSize++] = OpCode._NULL_POINTER; __program[__programSize++] = arg; } return offset; } // Insert an operator at a given offset. private void __programInsertOperator(char operator, int operand) { int src, dest, offset; offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0); if(__program== null) { __programSize+=(2 + offset); return; } src = __programSize; __programSize+=(2 + offset); dest = __programSize; while(src > operand) { --src; --dest; __program[dest] = __program[src]; } __program[operand++] = operator; __program[operand++] = OpCode._NULL_POINTER; while(offset-- > 0) __program[operand++] = OpCode._NULL_POINTER; } private void __programAddTail(int current, int value) { int scan, temp, offset; if(__program== null || current == OpCode._NULL_OFFSET) return; scan = current; while(true) { temp = OpCode._getNext(__program, scan); if(temp == OpCode._NULL_OFFSET) break; scan = temp; } if(__program[scan] == OpCode._BACK) offset = scan - value; else offset = value - scan; __program[scan + 1] = (char)offset; } private void __programAddOperatorTail(int current, int value) { if(__program== null || current == OpCode._NULL_OFFSET || OpCode._opType[__program[current]] != OpCode._BRANCH) return; __programAddTail(OpCode._getNextOperator(current), value); } private char __getNextChar() { char ret, value; ret = __input._postIncrement(); while(true) { value = __input._getValue(); if(value == '(' && __input._getValueRelative(1) == '?' 
&& __input._getValueRelative(2) == '#') { // Skip comments while(value != CharStringPointer._END_OF_STRING && value != ')') value = __input._increment();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -