📄 token.java
字号:
if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) break; return ret; case UNION: if (this.size() == 0) return FC_CONTINUE; /* * a|b|c -> FC_TERMINAL * a|.|c -> FC_ANY * a|b| -> FC_CONTINUE */ int ret2 = FC_CONTINUE; boolean hasEmpty = false; for (int i = 0; i < this.size(); i ++) { ret2 = this.getChild(i).analyzeFirstCharacter(result, options); if (ret2 == FC_ANY) break; else if (ret2 == FC_CONTINUE) hasEmpty = true; } return hasEmpty ? FC_CONTINUE : ret2; case CONDITION: int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); if (this.size() == 1) return FC_CONTINUE; if (ret3 == FC_ANY) return ret3; int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); if (ret4 == FC_ANY) return ret4; return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; case CLOSURE: case NONGREEDYCLOSURE: this.getChild(0).analyzeFirstCharacter(result, options); return FC_CONTINUE; case EMPTY: case ANCHOR: return FC_CONTINUE; case CHAR: int ch = this.getChar(); result.addRange(ch, ch); if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { ch = Character.toUpperCase((char)ch); result.addRange(ch, ch); ch = Character.toLowerCase((char)ch); result.addRange(ch, ch); } return FC_TERMINAL; case DOT: // **** if (isSet(options, RegularExpression.SINGLE_LINE)) { return FC_CONTINUE; // **** We can not optimize. 
} else { return FC_CONTINUE; /* result.addRange(0, RegularExpression.LINE_FEED-1); result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1); result.addRange(RegularExpression.CARRIAGE_RETURN+1, RegularExpression.LINE_SEPARATOR-1); result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX); return 1; */ } case RANGE: if (isSet(options, RegularExpression.IGNORE_CASE)) { result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); } else { result.mergeRanges(this); } return FC_TERMINAL; case NRANGE: // **** if (isSet(options, RegularExpression.IGNORE_CASE)) { result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); } else { result.mergeRanges(Token.complementRanges(this)); } return FC_TERMINAL; case INDEPENDENT: case PAREN: return this.getChild(0).analyzeFirstCharacter(result, options); case MODIFIERGROUP: options |= ((ModifierToken)this).getOptions(); options &= ~((ModifierToken)this).getOptionsMask(); return this.getChild(0).analyzeFirstCharacter(result, options); case BACKREFERENCE: result.addRange(0, UTF16_MAX); // **** We can not optimize. 
return FC_ANY;
          case STRING:
            int cha = this.getString().charAt(0);
            int ch2;
            if (REUtil.isHighSurrogate(cha)
                && this.getString().length() >= 2
                && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
                cha = REUtil.composeFromSurrogates(cha, ch2);
            result.addRange(cha, cha);
            if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
                cha = Character.toUpperCase((char)cha);
                result.addRange(cha, cha);
                cha = Character.toLowerCase((char)cha);
                result.addRange(cha, cha);
            }
            return FC_TERMINAL;
          case LOOKAHEAD:
          case NEGATIVELOOKAHEAD:
          case LOOKBEHIND:
          case NEGATIVELOOKBEHIND:
            return FC_CONTINUE;
          default:
            throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
        }
    }

    /**
     * Tells whether the string held by this token is strictly shorter than
     * the string held by another token.
     *
     * Only STRING tokens are accepted. (A variant that also measured CHAR
     * tokens used to be kept here as commented-out code.)
     *
     * @param tok the token to compare against; may be null
     * @return false when tok is null; otherwise true if this token's string
     *         length is less than the length of tok's string
     * @throws RuntimeException if this token, or a non-null tok, is not of
     *         type STRING
     */
    private final boolean isShorterThan(Token tok) {
        if (tok == null) {
            return false;
        }
        if (this.type != STRING) {
            throw new RuntimeException("Internal Error: Illegal type: "+this.type);
        }
        final int ownLength = this.getString().length();
        if (tok.type != STRING) {
            throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
        }
        final int otherLength = tok.getString().length();
        return ownLength < otherLength;
    }

    /**
     * Mutable result holder filled in by findFixedString(): the STRING token
     * that was found (or null) and the option bits recorded with it.
     */
    static class FixedStringContainer {
        Token token;    // null until a STRING token has been recorded
        int options;    // option bits stored together with the token

        FixedStringContainer() {
        }
    }

    /**
     * Looks for a STRING token in the tree rooted at this token and stores
     * the outcome in the given container.
     *
     * For a CONCAT token every child is examined in order; a later candidate
     * replaces the current one only when the current one is strictly shorter,
     * so the first of several equally long strings is kept. STRING tokens
     * record themselves together with the options passed in. INDEPENDENT,
     * PAREN and MODIFIERGROUP tokens delegate to their first child (a
     * MODIFIERGROUP first adjusts the options). Every other known token type
     * sets container.token to null.
     *
     * @param container receives the token found (or null) and its options
     * @param options   option bits in effect for this token
     * @throws RuntimeException if this token has an unknown type
     */
    final void findFixedString(FixedStringContainer container, int options) {
        switch (this.type) {
          case CONCAT: {
            Token longest = null;
            int longestOptions = 0;
            for (int index = 0;  index < this.size();  index++) {
                this.getChild(index).findFixedString(container, options);
                final boolean replace =
                    longest == null || longest.isShorterThan(container.token);
                if (!replace) {
                    continue;
                }
                longest = container.token;
                longestOptions = container.options;
            }
            container.token = longest;
            container.options = longestOptions;
            break;
          }

          case UNION:
          case CLOSURE:
          case NONGREEDYCLOSURE:
          case EMPTY:
          case ANCHOR:
          case RANGE:
          case DOT:
          case NRANGE:
          case BACKREFERENCE:
          case LOOKAHEAD:
          case NEGATIVELOOKAHEAD:
          case LOOKBEHIND:
          case NEGATIVELOOKBEHIND:
          case CONDITION:
          case CHAR:                            // CHAR tokens are ignored as well.
            container.token = null;
            break;

          case STRING:
            container.token = this;
            container.options = options;
            break;

          case INDEPENDENT:
          case PAREN:
            this.getChild(0).findFixedString(container, options);
            break;

          case MODIFIERGROUP: {
            final ModifierToken modifier = (ModifierToken)this;
            // Switch on the bits the group adds, then clear the bits it masks out.
            int effective = options | modifier.getOptions();
            effective &= ~modifier.getOptionsMask();
            this.getChild(0).findFixedString(container, effective);
            break;
          }

          default:
            throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
        }
    }

    /**
     * Always throws in this implementation.
     *
     * @param ch the character to test (unused here)
     * @throws RuntimeException always, with this token's type in the message
     */
    boolean match(int ch) {
        final String message = "NFAArrow#match(): Internal error: "+this.type;
        throw new RuntimeException(message);
    }

    // ------------------------------------------------------
    private static final Hashtable categories = new Hashtable();
    private static final Hashtable categories2 = new Hashtable();

    /**
     * Category names by index. Slot 17 is null; the trailing comments give
     * the index of the last entry on each row.
     */
    private static final String[] categoryNames = {
        "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
        "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
        "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
        "Pi", "Pf",                                            // 29, 30
        "L", "M", "N", "Z", "C", "P", "S",                     // 31-37
    };

    // Schema Rec.
{Datatypes} - Punctuation static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote static final int CHAR_LETTER = 31; static final int CHAR_MARK = 32; static final int CHAR_NUMBER = 33; static final int CHAR_SEPARATOR = 34; static final int CHAR_OTHER = 35; static final int CHAR_PUNCTUATION = 36; static final int CHAR_SYMBOL = 37; //blockNames in UNICODE 3.1 that supported by XML Schema REC private static final String[] blockNames = { /*0000..007F;*/ "Basic Latin", /*0080..00FF;*/ "Latin-1 Supplement", /*0100..017F;*/ "Latin Extended-A", /*0180..024F;*/ "Latin Extended-B", /*0250..02AF;*/ "IPA Extensions", /*02B0..02FF;*/ "Spacing Modifier Letters", /*0300..036F;*/ "Combining Diacritical Marks", /*0370..03FF;*/ "Greek", /*0400..04FF;*/ "Cyrillic", /*0530..058F;*/ "Armenian", /*0590..05FF;*/ "Hebrew", /*0600..06FF;*/ "Arabic", /*0700..074F;*/ "Syriac", /*0780..07BF;*/ "Thaana", /*0900..097F;*/ "Devanagari", /*0980..09FF;*/ "Bengali", /*0A00..0A7F;*/ "Gurmukhi", /*0A80..0AFF;*/ "Gujarati", /*0B00..0B7F;*/ "Oriya", /*0B80..0BFF;*/ "Tamil", /*0C00..0C7F;*/ "Telugu", /*0C80..0CFF;*/ "Kannada", /*0D00..0D7F;*/ "Malayalam", /*0D80..0DFF;*/ "Sinhala", /*0E00..0E7F;*/ "Thai", /*0E80..0EFF;*/ "Lao", /*0F00..0FFF;*/ "Tibetan", /*1000..109F;*/ "Myanmar", /*10A0..10FF;*/ "Georgian", /*1100..11FF;*/ "Hangul Jamo", /*1200..137F;*/ "Ethiopic", /*13A0..13FF;*/ "Cherokee", /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics", /*1680..169F;*/ "Ogham", /*16A0..16FF;*/ "Runic", /*1780..17FF;*/ "Khmer", /*1800..18AF;*/ "Mongolian", /*1E00..1EFF;*/ "Latin Extended Additional", /*1F00..1FFF;*/ "Greek Extended", /*2000..206F;*/ "General Punctuation", /*2070..209F;*/ "Superscripts and Subscripts", /*20A0..20CF;*/ "Currency Symbols", /*20D0..20FF;*/ "Combining Marks for Symbols", /*2100..214F;*/ "Letterlike Symbols", /*2150..218F;*/ "Number Forms", /*2190..21FF;*/ "Arrows", /*2200..22FF;*/ "Mathematical Operators", 
/*2300..23FF;*/ "Miscellaneous Technical", /*2400..243F;*/ "Control Pictures", /*2440..245F;*/ "Optical Character Recognition", /*2460..24FF;*/ "Enclosed Alphanumerics", /*2500..257F;*/ "Box Drawing", /*2580..259F;*/ "Block Elements", /*25A0..25FF;*/ "Geometric Shapes", /*2600..26FF;*/ "Miscellaneous Symbols", /*2700..27BF;*/ "Dingbats", /*2800..28FF;*/ "Braille Patterns", /*2E80..2EFF;*/ "CJK Radicals Supplement", /*2F00..2FDF;*/ "Kangxi Radicals", /*2FF0..2FFF;*/ "Ideographic Description Characters", /*3000..303F;*/ "CJK Symbols and Punctuation", /*3040..309F;*/ "Hiragana", /*30A0..30FF;*/ "Katakana", /*3100..312F;*/ "Bopomofo", /*3130..318F;*/ "Hangul Compatibility Jamo", /*3190..319F;*/ "Kanbun", /*31A0..31BF;*/ "Bopomofo Extended", /*3200..32FF;*/ "Enclosed CJK Letters and Months", /*3300..33FF;*/ "CJK Compatibility", /*3400..4DB5;*/ "CJK Unified Ideographs Extension A", /*4E00..9FFF;*/ "CJK Unified Ideographs", /*A000..A48F;*/ "Yi Syllables", /*A490..A4CF;*/ "Yi Radicals", /*AC00..D7A3;*/ "Hangul Syllables", /*E000..F8FF;*/ "Private Use", /*F900..FAFF;*/ "CJK Compatibility Ideographs", /*FB00..FB4F;*/ "Alphabetic Presentation Forms", /*FB50..FDFF;*/ "Arabic Presentation Forms-A", /*FE20..FE2F;*/ "Combining Half Marks", /*FE30..FE4F;*/ "CJK Compatibility Forms", /*FE50..FE6F;*/ "Small Form Variants", /*FE70..FEFE;*/ "Arabic Presentation Forms-B", /*FEFF..FEFF;*/ "Specials", /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", //missing Specials add manually /*10300..1032F;*/ "Old Italic", // 84 /*10330..1034F;*/ "Gothic", /*10400..1044F;*/ "Deseret", /*1D000..1D0FF;*/ "Byzantine Musical Symbols", /*1D100..1D1FF;*/ "Musical Symbols", /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols", /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B", /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement", /*E0000..E007F;*/ "Tags", //missing 2 private use add manually }; //ADD THOSE MANUALLY //F0000..FFFFD; "Private Use", //100000..10FFFD; "Private Use" //FFF0..FFFD; 
"Specials", static final String blockRanges = "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; static final int[] nonBMPBlockRanges = { 0x10300, 0x1032F, // 84 0x10330, 0x1034F, 0x10400, 0x1044F, 0x1D000, 0x1D0FF, 0x1D100, 0x1D1FF, 0x1D400, 0x1D7FF, 0x20000, 0x2A6D6, 0x2F800, 0x2FA1F, 0xE0000, 0xE007F }; private static final int NONBMP_BLOCK_START = 84; static protected RangeToken getRange(String name, boolean positive) { if (Token.categories.size() == 0) { synchronized (Token.categories) { Token[] ranges = new Token[Token.categoryNames.length]; for (int i = 0; i < ranges.length; i ++) { ranges[i] = Token.createRange(); } int type; for (int i = 0; i < 0x10000; i ++) { type = Character.getType((char)i); if (type == Character.START_PUNCTUATION || type == Character.END_PUNCTUATION) { //build table of Pi values if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || i == 0x201F || i == 0x2039) { type = CHAR_INIT_QUOTE; } //build table of Pf values if (i == 0x00BB || i == 0x2019 
|| i == 0x201D || i == 0x203A ) { type = CHAR_FINAL_QUOTE; } } ranges[type].addRange(i, i); switch (type) { case Character.UPPERCASE_LETTER: case Character.LOWERCASE_LETTER:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -