📄 re.java
字号:
// ALTERNATION OPERATOR // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) // not available if RE_LIMITED_OPS is set // TODO: the '\n' literal here should be a test against REToken.newline, // which unfortunately may be more than a single character. if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot))) || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) ) && !syntax.get(RESyntax.RE_LIMITED_OPS)) { // make everything up to here be a branch. create vector if nec. addToken(currentToken); RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength); minimumLength = 0; maximumLength = 0; if (branches == null) { branches = new Vector(); } branches.addElement(theBranch); firstToken = lastToken = currentToken = null; } // INTERVAL OPERATOR: // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) // // OPEN QUESTION: // what is proper interpretation of '{' at start of string? // // This method used to check "repeat.empty.token" to avoid such regexp // as "(a*){2,}", but now "repeat.empty.token" is allowed. else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) { int newIndex = getMinMax(pattern,index,minMax,syntax); if (newIndex > index) { if (minMax.first > minMax.second) throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex); if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex); if (currentToken instanceof RETokenRepeated) throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex); if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex); index = newIndex; currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); } else { addToken(currentToken); currentToken = new RETokenChar(subIndex,unit.ch,insens); } } // LIST OPERATOR: // [...] | [^...] else if ((unit.ch == '[') && !(unit.bk || quot)) { // Create a new RETokenOneOf ParseCharClassResult result = parseCharClass( subIndex, pattern, index, pLength, cflags, syntax, 0); addToken(currentToken); currentToken = result.token; index = result.index; } // SUBEXPRESSIONS // (...) | \(...\) depending on RE_NO_BK_PARENS else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) { boolean pure = false; boolean comment = false; boolean lookAhead = false; boolean lookBehind = false; boolean independent = false; boolean negativelh = false; boolean negativelb = false; if ((index+1 < pLength) && (pattern[index] == '?')) { switch (pattern[index+1]) { case '!': if (syntax.get(RESyntax.RE_LOOKAHEAD)) { pure = true; negativelh = true; lookAhead = true; index += 2; } break; case '=': if (syntax.get(RESyntax.RE_LOOKAHEAD)) { pure = true; lookAhead = true; index += 2; } break; case '<': // We assume that if the syntax supports look-ahead, // it also supports look-behind. if (syntax.get(RESyntax.RE_LOOKAHEAD)) { index++; switch (pattern[index +1]) { case '!': pure = true; negativelb = true; lookBehind = true; index += 2; break; case '=': pure = true; lookBehind = true; index += 2; } } break; case '>': // We assume that if the syntax supports look-ahead, // it also supports independent group. if (syntax.get(RESyntax.RE_LOOKAHEAD)) { pure = true; independent = true; index += 2; } break; case 'i': case 'd': case 'm': case 's': // case 'u': not supported // case 'x': not supported case '-': if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break; // Set or reset syntax flags. int flagIndex = index + 1; int endFlag = -1; RESyntax newSyntax = new RESyntax(syntax); int newCflags = cflags; boolean negate = false; while (flagIndex < pLength && endFlag < 0) { switch(pattern[flagIndex]) { case 'i': if (negate) newCflags &= ~REG_ICASE; else newCflags |= REG_ICASE; flagIndex++; break; case 'd': if (negate) newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR); else newSyntax.setLineSeparator("\n"); flagIndex++; break; case 'm': if (negate) newCflags &= ~REG_MULTILINE; else newCflags |= REG_MULTILINE; flagIndex++; break; case 's': if (negate) newCflags &= ~REG_DOT_NEWLINE; else newCflags |= REG_DOT_NEWLINE; flagIndex++; break; // case 'u': not supported // case 'x': not supported case '-': negate = true; flagIndex++; break; case ':': case ')': endFlag = pattern[flagIndex]; break; default: throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); } } if (endFlag == ')') { syntax = newSyntax; cflags = newCflags; insens = ((cflags & REG_ICASE) > 0); // This can be treated as though it were a comment. comment = true; index = flagIndex - 1; break; } if (endFlag == ':') { savedSyntax = syntax; savedCflags = cflags; flagsSaved = true; syntax = newSyntax; cflags = newCflags; insens = ((cflags & REG_ICASE) > 0); index = flagIndex -1; // Fall through to the next case. } else { throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); } case ':': if (syntax.get(RESyntax.RE_PURE_GROUPING)) { pure = true; index += 2; } break; case '#': if (syntax.get(RESyntax.RE_COMMENTS)) { comment = true; } break; default: throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); } } if (index >= pLength) { throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); } // find end of subexpression int endIndex = index; int nextIndex = index; int nested = 0; while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0) && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) { if ((endIndex = nextIndex) >= pLength) throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex); else if ((unit.ch == '[') && !(unit.bk || quot)) { // I hate to do something similar to the LIST OPERATOR matters // above, but ... int listIndex = nextIndex; if (listIndex < pLength && pattern[listIndex] == '^') listIndex++; if (listIndex < pLength && pattern[listIndex] == ']') listIndex++; int listEndIndex = -1; int listNest = 0; while (listIndex < pLength && listEndIndex < 0) { switch(pattern[listIndex++]) { case '\\': listIndex++; break; case '[': // Sun's API document says that regexp like "[a-d[m-p]]" // is legal. Even something like "[[[^]]]]" is accepted. listNest++; if (listIndex < pLength && pattern[listIndex] == '^') listIndex++; if (listIndex < pLength && pattern[listIndex] == ']') listIndex++; break; case ']': if (listNest == 0) listEndIndex = listIndex; listNest--; break; } } if (listEndIndex >= 0) { nextIndex = listEndIndex; if ((endIndex = nextIndex) >= pLength) throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex); else continue; } throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex); } else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) nested++; else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) nested--; } // endIndex is now position at a ')','\)' // nextIndex is end of string or position after ')' or '\)' if (comment) index = nextIndex; else { // not a comment // create RE subexpression as token. addToken(currentToken); if (!pure) { numSubs++; } int useIndex = (pure || lookAhead || lookBehind || independent) ? 0 : nextSub + numSubs; currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs); numSubs += ((RE) currentToken).getNumSubs(); if (lookAhead) { currentToken = new RETokenLookAhead(currentToken,negativelh); } else if (lookBehind) { currentToken = new RETokenLookBehind(currentToken,negativelb); } else if (independent) { currentToken = new RETokenIndependent(currentToken); } index = nextIndex; if (flagsSaved) { syntax = savedSyntax; cflags = savedCflags; insens = ((cflags & REG_ICASE) > 0); flagsSaved = false; } } // not a comment } // subexpression // UNMATCHED RIGHT PAREN // ) or \) throw exception if // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) { throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index); } // START OF LINE OPERATOR // ^ else if ((unit.ch == '^') && !(unit.bk || quot)) { addToken(currentToken); currentToken = null; addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); } // END OF LINE OPERATOR // $ else if ((unit.ch == '$') && !(unit.bk || quot)) { addToken(currentToken); currentToken = null; addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); } // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) // . else if ((unit.ch == '.') && !(unit.bk || quot)) { addToken(currentToken); currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL)); } // ZERO-OR-MORE REPEAT OPERATOR // * // // This method used to check "repeat.empty.token" to avoid such regexp // as "(a*)*", but now "repeat.empty.token" is allowed. else if ((unit.ch == '*') && !(unit.bk || quot)) { if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); if (currentToken instanceof RETokenRepeated) throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index); } // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR // + | \+ depending on RE_BK_PLUS_QM // not available if RE_LIMITED_OPS is set // // This method used to check "repeat.empty.token" to avoid such regexp // as "(a*)+", but now "repeat.empty.token" is allowed. else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) { if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); // Check for possessive matching on RETokenRepeated if (currentToken instanceof RETokenRepeated) { RETokenRepeated tokenRep = (RETokenRepeated)currentToken; if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy()) tokenRep.makePossessive(); else throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -