📄 recompiler.java
字号:
}
// Must have close brace
if (idx >= len || pattern.charAt(idx++) != '}')
{
syntaxError("Missing close brace");
}
}
/**
* Match an escape sequence. Handles quoted chars and octal escapes as well
* as normal escape characters. Always advances the input stream by the
* right amount. This code "understands" the subtle difference between an
* octal escape and a backref. You can access the type of ESC_CLASS or
* ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
* @return ESC_* code or character if simple escape
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
char escape() throws RESyntaxException
{
// "Shouldn't" happen
if (pattern.charAt(idx) != '\\')
{
internalError();
}
// Escape shouldn't occur as last character in string!
if (idx + 1 == len)
{
syntaxError("Escape terminates string");
}
// Switch on character after backslash
idx += 2;
char escapeChar = pattern.charAt(idx - 1);
switch (escapeChar)
{
case RE.E_BOUND:
case RE.E_NBOUND:
return ESC_COMPLEX;
case RE.E_ALNUM:
case RE.E_NALNUM:
case RE.E_SPACE:
case RE.E_NSPACE:
case RE.E_DIGIT:
case RE.E_NDIGIT:
return ESC_CLASS;
case 'u':
case 'x':
{
// Exact required hex digits for escape type
int hexDigits = (escapeChar == 'u' ? 4 : 2);
// Parse up to hexDigits characters from input
int val = 0;
for ( ; idx < len && hexDigits-- > 0; idx++)
{
// Get char
char c = pattern.charAt(idx);
// If it's a hexadecimal digit (0-9)
if (c >= '0' && c <= '9')
{
// Compute new value
val = (val << 4) + c - '0';
}
else
{
// If it's a hexadecimal letter (a-f)
c = Character.toLowerCase(c);
if (c >= 'a' && c <= 'f')
{
// Compute new value
val = (val << 4) + (c - 'a') + 10;
}
else
{
// If it's not a valid digit or hex letter, the escape must be invalid
// because hexDigits of input have not been absorbed yet.
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
}
}
}
return (char)val;
}
case 't':
return '\t';
case 'n':
return '\n';
case 'r':
return '\r';
case 'f':
return '\f';
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
// An octal escape starts with a 0 or has two digits in a row
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
{
// Handle \nnn octal escapes
int val = escapeChar - '0';
if (idx < len && Character.isDigit(pattern.charAt(idx)))
{
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
if (idx < len && Character.isDigit(pattern.charAt(idx)))
{
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
}
}
return (char)val;
}
// It's actually a backreference (\[1-9]), not an escape
return ESC_BACKREF;
default:
// Simple quoting of a character
return escapeChar;
}
}
/**
 * Compile a character class.
 * <p>
 * On entry <code>idx</code> must point at the opening '['.  Two forms are handled:
 * a POSIX class written as <code>[:name:]</code>, which yields an OP_POSIXCLASS
 * node, and an ordinary bracket expression made of single characters, ranges
 * (<code>a-z</code>), escapes and '^' negation, which yields an OP_ANYOF node
 * followed by the emitted (min, max) pairs of every collected range.
 * On return <code>idx</code> points just past the closing ']'.
 * @return Index of class node
 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 */
int characterClass() throws RESyntaxException
{
    // Check for bad calling or empty class
    if (pattern.charAt(idx) != '[')
    {
        internalError();
    }

    // Check for unterminated or empty class
    if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
    {
        syntaxError("Empty or unterminated class");
    }

    // Check for POSIX character class
    if (idx < len && pattern.charAt(idx) == ':')
    {
        // Skip colon
        idx++;

        // POSIX character classes are denoted with lowercase ASCII strings
        int idxStart = idx;
        while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
        {
            idx++;
        }

        // Should be a ":]" to terminate the POSIX character class
        if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
        {
            // Get character class
            String charClass = pattern.substring(idxStart, idx);

            // Select the POSIX class id
            Character i = (Character)hashPOSIX.get(charClass);
            if (i != null)
            {
                // Move past colon and right bracket
                idx += 2;

                // Return new POSIX character class node
                return node(RE.OP_POSIXCLASS, i.charValue());
            }
            syntaxError("Invalid POSIX character class '" + charClass + "'");
        }
        syntaxError("Invalid POSIX character class syntax");
    }

    // Try to build a class.  Create OP_ANYOF node
    int ret = node(RE.OP_ANYOF, 0);

    // Parse class declaration
    final char CHAR_INVALID = Character.MAX_VALUE;  // marks "no usable previous character"
    char last = CHAR_INVALID;                       // previous simple char (candidate range start)
    char simpleChar = 0;                            // simple char produced by the current iteration
    boolean include = true;                         // toggled by each '^'
    boolean definingRange = false;                  // true between a '-' and its end character
    int idxFirst = idx;
    char rangeStart = Character.MIN_VALUE;
    char rangeEnd;
    RERange range = new RERange();
    while (idx < len && pattern.charAt(idx) != ']')
    {

        switchOnCharacter:

        // Switch on character
        switch (pattern.charAt(idx))
        {
            case '^':
                include = !include;
                // A leading '^' starts from the full character range
                if (idx == idxFirst)
                {
                    range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
                }
                idx++;
                continue;

            case '\\':
            {
                // Escape always advances the stream
                char c;
                switch (c = escape ())
                {
                    case ESC_COMPLEX:
                    case ESC_BACKREF:

                        // Word boundaries and backrefs not allowed in a character class!
                        syntaxError("Bad character class");

                    case ESC_CLASS:

                        // Classes can't be an endpoint of a range
                        if (definingRange)
                        {
                            syntaxError("Bad character class");
                        }

                        // Handle specific type of class (some are ok)
                        switch (pattern.charAt(idx - 1))
                        {
                            case RE.E_NSPACE:
                            case RE.E_NDIGIT:
                            case RE.E_NALNUM:
                                syntaxError("Bad character class");

                            case RE.E_SPACE:
                                range.include('\t', include);
                                range.include('\r', include);
                                range.include('\f', include);
                                range.include('\n', include);
                                range.include('\b', include);
                                range.include(' ', include);
                                break;

                            case RE.E_ALNUM:
                                range.include('a', 'z', include);
                                range.include('A', 'Z', include);
                                range.include('_', include);

                                // Fall through!

                            case RE.E_DIGIT:
                                range.include('0', '9', include);
                                break;
                        }

                        // Make last char invalid (can't be a range start)
                        last = CHAR_INVALID;
                        break;

                    default:

                        // Escape is simple so treat as a simple char
                        simpleChar = c;
                        break switchOnCharacter;
                }
            }
            continue;

            case '-':

                // Start a range if one isn't already started
                if (definingRange)
                {
                    syntaxError("Bad class range");
                }
                definingRange = true;

                // If no last character, start of range is 0
                rangeStart = (last == CHAR_INVALID ? 0 : last);

                // Premature end of range. define up to Character.MAX_VALUE
                if ((idx + 1) < len && pattern.charAt(++idx) == ']')
                {
                    simpleChar = Character.MAX_VALUE;
                    break;
                }
                continue;

            default:
                simpleChar = pattern.charAt(idx++);
                break;
        }

        // Handle simple character simpleChar
        if (definingRange)
        {
            // if we are defining a range make it now
            rangeEnd = simpleChar;

            // Actually create a range if the range is ok
            if (rangeStart >= rangeEnd)
            {
                syntaxError("Bad character class");
            }
            range.include(rangeStart, rangeEnd, include);

            // We are done defining the range
            last = CHAR_INVALID;
            definingRange = false;
        }
        else
        {
            // If simple character and not start of range, include it.
            // idx has already been advanced past simpleChar (by idx++ in the
            // default case or by escape()), so the character that decides
            // whether simpleChar starts a range is the one AT idx.
            if (idx >= len || pattern.charAt(idx) != '-')
            {
                range.include(simpleChar, include);
            }
            last = simpleChar;
        }
    }

    // Shouldn't be out of input
    if (idx == len)
    {
        syntaxError("Unterminated character class");
    }

    // Absorb the ']' end of class marker
    idx++;

    // Emit character class definition: range count in the node's operand slot,
    // followed by one (min, max) pair per range
    instruction[ret + RE.offsetOpdata] = (char)range.num;
    for (int i = 0; i < range.num; i++)
    {
        emit((char)range.minRange[i]);
        emit((char)range.maxRange[i]);
    }
    return ret;
}
/**
* Absorb an atomic character string. This method is a little tricky because
* it can un-include the last character of string if a closure operator follows.
* This is correct because *+? have higher precedence than concatenation (thus
* ABC* means AB(C*) and NOT (ABC)*).
* @return Index of new atom node
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int atom() throws RESyntaxException
{
// Create a string node
int ret = node(RE.OP_ATOM, 0);
// Length of atom
int lenAtom = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -