📄 re.java
字号:
* If it ever is fixed, test #137 in RETest.txt should be updated.
* </ul>
*
* </font>
*
* @see recompile
* @see RECompiler
*
* @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
* @version $Id: RE.java,v 1.1.1.1 2002/01/31 03:14:36 rcm Exp $
*/
public class RE
{
/**
 * Specifies normal, case-sensitive matching behaviour (no flag bits set).
 */
public static final int MATCH_NORMAL = 0x0000;
/**
 * Flag bit indicating that matching should be case-independent (folded).
 */
public static final int MATCH_CASEINDEPENDENT = 0x0001;
/**
 * Flag bit: newlines should match as BOL/EOL (^ and $).
 */
public static final int MATCH_MULTILINE = 0x0002;
/**
 * Flag bit: consider all input a single body of text - newlines are matched by '.'.
 */
public static final int MATCH_SINGLELINE = 0x0004;
/************************************************
* *
* The format of a node in a program is: *
* *
* [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
* *
* char OPCODE - instruction *
* char OPDATA - modifying data *
* char OPNEXT - next node (relative offset) *
* *
************************************************/
// Opcode constants. Columns of the original table: Opcode, Char, Opdata/Operand, Meaning.
// In each trailing comment the bracketed part (when present) is the Opdata/Operand column
// and the remaining text is the meaning.
// ---------- ---------- --------------- --------------------------------------------------
static final char OP_END = 'E'; // end of program
static final char OP_BOL = '^'; // match only if at beginning of line
static final char OP_EOL = '$'; // match only if at end of line
static final char OP_ANY = '.'; // match any single character except newline
static final char OP_ANYOF = '['; // [count/ranges] match any char in the list of ranges
static final char OP_BRANCH = '|'; // [node] match this alternative or the next one
static final char OP_ATOM = 'A'; // [length/string] length of string followed by string itself
static final char OP_STAR = '*'; // [node] kleene closure
static final char OP_PLUS = '+'; // [node] positive closure
static final char OP_MAYBE = '?'; // [node] optional closure
static final char OP_ESCAPE = '\\'; // [escape] special escape code char class (escape is E_* code)
static final char OP_OPEN = '('; // [number] nth opening paren
static final char OP_CLOSE = ')'; // [number] nth closing paren
static final char OP_BACKREF = '#'; // [number] reference nth already matched parenthesized string
static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
static final char OP_RELUCTANTSTAR = '8'; // [none/expr] reluctant '*' (mnemonic for char is unshifted '*')
static final char OP_RELUCTANTPLUS = '='; // [none/expr] reluctant '+' (mnemonic for char is unshifted '+')
static final char OP_RELUCTANTMAYBE = '/'; // [none/expr] reluctant '?' (mnemonic for char is unshifted '?')
static final char OP_POSIXCLASS = 'P'; // [classid] one of the posix character classes
// Escape codes (the E_* values referred to by the OP_ESCAPE comment).
// Upper-case codes are the negated form of the matching lower-case code.
static final char E_ALNUM = 'w'; // Alphanumeric
static final char E_NALNUM = 'W'; // Non-alphanumeric
static final char E_BOUND = 'b'; // Word boundary
static final char E_NBOUND = 'B'; // Non-word boundary
static final char E_SPACE = 's'; // Whitespace
static final char E_NSPACE = 'S'; // Non-whitespace
static final char E_DIGIT = 'd'; // Digit
static final char E_NDIGIT = 'D'; // Non-digit
// Posix character classes (class ids; presumably the operand of OP_POSIXCLASS - confirm in the matcher).
// NOTE: several ids reuse the same char values as the E_* escape codes above ('w', 'b', 's', 'd').
static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
static final char POSIX_CLASS_BLANK = 'b'; // Blanks
static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
static final char POSIX_CLASS_DIGIT = 'd'; // Digits
static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
static final char POSIX_CLASS_SPACE = 's'; // Spaces
static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
// Limits
static final int maxNode = 65536; // Maximum number of nodes in a program
static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
// Node layout constants: a node is [OPCODE][OPDATA][OPNEXT], one char each, followed by any operand
static final int offsetOpcode = 0; // Opcode offset (first character)
static final int offsetOpdata = 1; // Opdata offset (second char)
static final int offsetNext = 2; // Next index offset (third char)
static final int nodeSize = 3; // Node size (in chars)
/** Line separator, read once from the "line.separator" system property. */
static final String NEWLINE = System.getProperty("line.separator");
// State of current program
REProgram program; // Compiled regular expression 'program'
CharacterIterator search; // The string being matched against
int idx; // Current index in string being searched
int matchFlags; // Match behaviour flags (RE.MATCH_* values)
// Parenthesized subexpressions
int parenCount; // Number of subexpressions matched (num open parens + 1)
int start0; // Cache of start[0]
int end0; // Cache of end[0]
int start1; // Cache of start[1]
int end1; // Cache of end[1]
int start2; // Cache of start[2]
int end2; // Cache of end[2]
int[] startn; // Lazy-alloced array of sub-expression starts
int[] endn; // Lazy-alloced array of sub-expression ends
// Backreferences
int[] startBackref; // Lazy-alloced array of backref starts
int[] endBackref; // Lazy-alloced array of backref ends
/**
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
* @param pattern The regular expression pattern to compile.
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see recompile
*/
public RE(String pattern) throws RESyntaxException
{
    // No flags supplied: delegate to the (String, int) constructor with the default,
    // case-sensitive matching behaviour.
    this(pattern, RE.MATCH_NORMAL);
}
/**
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
* @param pattern The regular expression pattern to compile.
* @param matchFlags The matching style
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see recompile
*/
public RE(String pattern, int matchFlags) throws RESyntaxException
{
    // Compile the pattern with a fresh RECompiler and pass the resulting program together
    // with the caller's flags straight to the (REProgram, int) constructor. This avoids
    // first initialising the flags to MATCH_NORMAL through the one-argument constructor
    // and then overwriting them with a second setMatchFlags call.
    this(new RECompiler().compile(pattern), matchFlags);
}
/**
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data. Permits special flags to be passed in to modify matching
* behaviour.
* @param program Compiled regular expression program (see RECompiler and/or recompile)
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
* MATCH_SINGLELINE // All input is a single body of text; newlines are matched by .
*
* </pre>
*
* @see RECompiler
* @see REProgram
* @see recompile
*/
public RE(REProgram program, int matchFlags)
{
    // Both values are installed through the public setters rather than by writing
    // the fields directly.
    this.setProgram(program);
    this.setMatchFlags(matchFlags);
}
/**
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data.
* @param program Compiled regular expression program
* @see RECompiler
* @see recompile
*/
public RE(REProgram program)
{
    // Pre-compiled program with the default, case-sensitive matching behaviour.
    this(program, RE.MATCH_NORMAL);
}
/**
* Constructs a regular expression matcher with no initial program.
* This is likely to be an uncommon practice, but is still supported.
*/
public RE()
{
    // The cast is required: a bare null would be ambiguous between the
    // (String, int) and (REProgram, int) constructors.
    this((REProgram) null, RE.MATCH_NORMAL);
}
/**
* Converts a 'simplified' regular expression to a full regular expression
* @param pattern The pattern to convert
* @return The full regular expression
*/
public static String simplePatternToFullRegularExpression(String pattern)
{
    // Characters that are special in a full regular expression and must therefore be
    // backslash-escaped to keep their literal meaning. '*' is deliberately absent: it is
    // the only wildcard of the simplified syntax and is expanded instead of escaped.
    final String escapedChars = ".[]\\+?{}$^|()";

    final int length = pattern.length();
    StringBuffer full = new StringBuffer();
    int pos = 0;
    while (pos < length)
    {
        final char ch = pattern.charAt(pos++);
        if (ch == '*')
        {
            // Simplified "match anything" becomes "any character, any number of times".
            full.append(".*");
        }
        else
        {
            if (escapedChars.indexOf(ch) >= 0)
            {
                full.append('\\');
            }
            // Emitted for ordinary characters and, after the backslash, for escaped ones.
            full.append(ch);
        }
    }
    return full.toString();
}
/**
* Sets match behaviour flags which alter the way RE does matching.
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
* MATCH_SINGLELINE // All input is a single body of text; newlines are matched by .
*
* </pre>
*
*/
public void setMatchFlags(int matchFlags)
{
// Stores the value as-is; no validation of the flag bits is performed here.
this.matchFlags = matchFlags;
}
/**
* Returns the current match behaviour flags.
* @return Current match behaviour flags (RE.MATCH_*).
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
* MATCH_SINGLELINE // All input is a single body of text; newlines are matched by .
*
* </pre>
*
* @see #setMatchFlags
*
*/
public int getMatchFlags()
{
    // Plain accessor: hands back whatever was last stored in the matchFlags field.
    return this.matchFlags;
}
/**
* Sets the current regular expression program used by this matcher object.
* @param program Regular expression program compiled by RECompiler.
* @see RECompiler
* @see REProgram
* @see recompile
*/
public void setProgram(REProgram program)
{
// Only the reference is replaced; null is accepted (the no-argument constructor passes null).
this.program = program;
}
/**
* Returns the current regular expression program in use by this matcher object.
* @return Regular expression program
* @see #setProgram
*/
public REProgram getProgram()
{
    // Plain accessor: may be null when no program has been set.
    return this.program;
}
/**
* Returns the number of parenthesized subexpressions available after a successful match.
* @return Number of available parenthesized subexpressions
*/
public int getParenCount()
{
    // Plain accessor for the parenCount field.
    return this.parenCount;
}
/**
* Gets the contents of a parenthesized subexpression after a successful match.
* @param which Nesting level of subexpression
* @return String
*/
public String getParen(int which)
{
    // Levels at or beyond parenCount are not available.
    if (which >= parenCount)
    {
        return null;
    }

    // A negative start index means there is nothing to extract for this level.
    final int begin = getParenStart(which);
    if (begin < 0)
    {
        return null;
    }

    // Extract the text between the recorded start and end indices.
    return search.substring(begin, getParenEnd(which));
}
/**
* Returns the start index of a given paren level.
* @param which Nesting level of subexpression
* @return String index
*/
public final int getParenStart(int which)
{
if (which < parenCount)
{
switch (which)
{
case 0:
return start0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -