📄 token.java
字号:
case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: type = CHAR_LETTER; break; case Character.NON_SPACING_MARK: case Character.COMBINING_SPACING_MARK: case Character.ENCLOSING_MARK: type = CHAR_MARK; break; case Character.DECIMAL_DIGIT_NUMBER: case Character.LETTER_NUMBER: case Character.OTHER_NUMBER: type = CHAR_NUMBER; break; case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: type = CHAR_SEPARATOR; break; case Character.CONTROL: case Character.FORMAT: case Character.SURROGATE: case Character.PRIVATE_USE: case Character.UNASSIGNED: type = CHAR_OTHER; break; case Character.CONNECTOR_PUNCTUATION: case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case CHAR_INIT_QUOTE: case CHAR_FINAL_QUOTE: case Character.OTHER_PUNCTUATION: type = CHAR_PUNCTUATION; break; case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: type = CHAR_SYMBOL; break; default: throw new RuntimeException("com.sun.org.apache.xerces.internal.utils.regex.Token#getRange(): Unknown Unicode category: "+type); } ranges[type].addRange(i, i); } // for all characters ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); for (int i = 0; i < ranges.length; i ++) { if (Token.categoryNames[i] != null) { if (i == Character.UNASSIGNED) { // Unassigned ranges[i].addRange(0x10000, Token.UTF16_MAX); } Token.categories.put(Token.categoryNames[i], ranges[i]); Token.categories2.put(Token.categoryNames[i], Token.complementRanges(ranges[i])); } } //REVISIT: do we really need to support block names as in Unicode 3.1 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? // StringBuffer buffer = new StringBuffer(50); for (int i = 0; i < Token.blockNames.length; i ++) { Token r1 = Token.createRange(); int location; if (i < NONBMP_BLOCK_START) { location = i*2; int rstart = Token.blockRanges.charAt(location); int rend = Token.blockRanges.charAt(location+1); //DEBUGING //System.out.println(n+" " +Integer.toHexString(rstart) // +"-"+ Integer.toHexString(rend)); r1.addRange(rstart, rend); } else { location = (i - NONBMP_BLOCK_START) * 2; r1.addRange(Token.nonBMPBlockRanges[location], Token.nonBMPBlockRanges[location + 1]); } String n = Token.blockNames[i]; if (n.equals("Specials")) r1.addRange(0xfff0, 0xfffd); if (n.equals("Private Use")) { r1.addRange(0xF0000,0xFFFFD); r1.addRange(0x100000,0x10FFFD); } Token.categories.put(n, r1); Token.categories2.put(n, Token.complementRanges(r1)); buffer.setLength(0); buffer.append("Is"); if (n.indexOf(' ') >= 0) { for (int ci = 0; ci < n.length(); ci ++) if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci)); } else { buffer.append(n); } Token.setAlias(buffer.toString(), n, true); } // TR#18 1.2 Token.setAlias("ASSIGNED", "Cn", false); Token.setAlias("UNASSIGNED", "Cn", true); Token all = Token.createRange(); all.addRange(0, Token.UTF16_MAX); Token.categories.put("ALL", all); Token.categories2.put("ALL", Token.complementRanges(all)); Token.registerNonXS("ASSIGNED"); Token.registerNonXS("UNASSIGNED"); Token.registerNonXS("ALL"); Token isalpha = Token.createRange(); isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo Token.categories.put("IsAlpha", isalpha); Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); Token.registerNonXS("IsAlpha"); Token isalnum = Token.createRange(); isalnum.mergeRanges(isalpha); // Lu Ll Lo isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd Token.categories.put("IsAlnum", isalnum); Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); Token.registerNonXS("IsAlnum"); Token isspace = Token.createRange(); isspace.mergeRanges(Token.token_spaces); isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z Token.categories.put("IsSpace", isspace); Token.categories2.put("IsSpace", Token.complementRanges(isspace)); Token.registerNonXS("IsSpace"); Token isword = Token.createRange(); isword.mergeRanges(isalnum); // Lu Ll Lo Nd isword.addRange('_', '_'); Token.categories.put("IsWord", isword); Token.categories2.put("IsWord", Token.complementRanges(isword)); Token.registerNonXS("IsWord"); Token isascii = Token.createRange(); isascii.addRange(0, 127); Token.categories.put("IsASCII", isascii); Token.categories2.put("IsASCII", Token.complementRanges(isascii)); Token.registerNonXS("IsASCII"); Token isnotgraph = Token.createRange(); isnotgraph.mergeRanges(ranges[CHAR_OTHER]); isnotgraph.addRange(' ', ' '); Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); Token.categories2.put("IsGraph", isnotgraph); Token.registerNonXS("IsGraph"); Token isxdigit = Token.createRange(); isxdigit.addRange('0', '9'); isxdigit.addRange('A', 'F'); isxdigit.addRange('a', 'f'); Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); Token.categories2.put("IsXDigit", isxdigit); Token.registerNonXS("IsXDigit"); Token.setAlias("IsDigit", "Nd", true); Token.setAlias("IsUpper", "Lu", true); Token.setAlias("IsLower", "Ll", true); Token.setAlias("IsCntrl", "C", true); Token.setAlias("IsPrint", "C", false); Token.setAlias("IsPunct", "P", true); Token.registerNonXS("IsDigit"); Token.registerNonXS("IsUpper"); Token.registerNonXS("IsLower"); Token.registerNonXS("IsCntrl"); Token.registerNonXS("IsPrint"); Token.registerNonXS("IsPunct"); Token.setAlias("alpha", "IsAlpha", true); Token.setAlias("alnum", "IsAlnum", true); Token.setAlias("ascii", "IsASCII", true); Token.setAlias("cntrl", "IsCntrl", true); Token.setAlias("digit", "IsDigit", true); Token.setAlias("graph", "IsGraph", true); Token.setAlias("lower", "IsLower", true); Token.setAlias("print", "IsPrint", true); Token.setAlias("punct", "IsPunct", true); Token.setAlias("space", "IsSpace", true); Token.setAlias("upper", "IsUpper", true); Token.setAlias("word", "IsWord", true); // Perl extension Token.setAlias("xdigit", "IsXDigit", true); Token.registerNonXS("alpha"); Token.registerNonXS("alnum"); Token.registerNonXS("ascii"); Token.registerNonXS("cntrl"); Token.registerNonXS("digit"); Token.registerNonXS("graph"); Token.registerNonXS("lower"); Token.registerNonXS("print"); Token.registerNonXS("punct"); Token.registerNonXS("space"); Token.registerNonXS("upper"); Token.registerNonXS("word"); Token.registerNonXS("xdigit"); } // synchronized } // if null RangeToken tok = positive ? (RangeToken)Token.categories.get(name) : (RangeToken)Token.categories2.get(name); //if (tok == null) System.out.println(name); return tok; } static protected RangeToken getRange(String name, boolean positive, boolean xs) { RangeToken range = Token.getRange(name, positive); if (xs && range != null && Token.isRegisterNonXS(name)) range = null; return range; } static Hashtable nonxs = null; /** * This method is called by only getRange(). * So this method need not MT-safe. */ static protected void registerNonXS(String name) { if (Token.nonxs == null) Token.nonxs = new Hashtable(); Token.nonxs.put(name, name); } static protected boolean isRegisterNonXS(String name) { if (Token.nonxs == null) return false; //DEBUG //System.err.println("isRegisterNonXS: "+name); return Token.nonxs.containsKey(name); } private static void setAlias(String newName, String name, boolean positive) { Token t1 = (Token)Token.categories.get(name); Token t2 = (Token)Token.categories2.get(name); if (positive) { Token.categories.put(newName, t1); Token.categories2.put(newName, t2); } else { Token.categories2.put(newName, t1); Token.categories.put(newName, t2); } } // ------------------------------------------------------ static final String viramaString = "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;; +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; static private Token token_grapheme = null; static synchronized Token getGraphemePattern() { if (Token.token_grapheme != null) return Token.token_grapheme; Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] base_char.mergeRanges(Token.getRange("ASSIGNED", true)); base_char.subtractRanges(Token.getRange("M", true)); base_char.subtractRanges(Token.getRange("C", true)); Token virama = Token.createRange(); for (int i = 0; i < Token.viramaString.length(); i ++) { int ch = viramaString.charAt(i); virama.addRange(i, i); } Token combiner_wo_virama = Token.createRange(); combiner_wo_virama.mergeRanges(Token.getRange("M", true)); combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras Token left = Token.createUnion(); // base_char? left.addChild(base_char); left.addChild(Token.token_empty); Token foo = Token.createUnion(); foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); foo.addChild(combiner_wo_virama); foo = Token.createClosure(foo); foo = Token.createConcat(left, foo); Token.token_grapheme = foo; return Token.token_grapheme; } /** * Combing Character Sequence in Perl 5.6. */ static private Token token_ccs = null; static synchronized Token getCombiningCharacterSequence() { if (Token.token_ccs != null) return Token.token_ccs; Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* Token.token_ccs = foo; return Token.token_ccs; } // ------------------------------------------------------ // ------------------------------------------------------ /** * This class represents a node in parse tree. */ static class StringToken extends Token implements java.io.Serializable { private static final long serialVersionUID = 3257288015452780086L; String string; int refNumber; StringToken(int type, String str, int n) { super(type); this.string = str; this.refNumber = n; } int getReferenceNumber() { // for STRING return this.refNumber; } String getString() { // for STRING return this.string; } public String toString(int options) { if (this.type == BACKREFERENCE) return "\\"+this.refNumber; else return REUtil.quoteMeta(this.string); } } /** * This class represents a node in parse tree. */ static class ConcatToken extends Token implements java.io.Serializable { private static final long serialVersionUID = 4050760502994940212L; Token child; Token child2; ConcatToken(Token t1, Token t2) { super(Token.CONCAT); this.child = t1; this.child2 = t2; } int size() { return 2; } Token getChild(int index) { return index == 0 ? this.child : this.child2; } public String toString(int options) { String ret; if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { ret = this.child.toString(options)+"+"; } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { ret = this.child.toString(options)+"+?"; } else ret = this.child.toString(options)+this.child2.toString(options); return ret; } } /** * This class represents a node in parse tree. */ static class CharToken extends Token implements java.io.Serializable { private static final long serialVersionUID = 3257284751277569842L; int chardata;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -