📄 charset.java
字号:
/* * @(#)CharSet.java 1.8 03/01/23 * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. *//* * @(#)CharSet.java 1.1 99/02/18 * * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved * * The original version of this source code and documentation * is copyrighted and owned by Taligent, Inc., a wholly-owned * subsidiary of IBM. These materials are provided under terms * of a License Agreement between Taligent and Sun. This technology * is protected by multiple US and International patents. * * This notice and attribution to Taligent may not be removed. * Taligent is a registered trademark of Taligent, Inc. */package java.text;import java.util.Hashtable;/** * An object representing a set of characters. (This is a "set" in the * mathematical sense: an unduplicated list of characters on which set * operations such as union and intersection can be performed.) The * set information is stored in compressed, optimized form: The object * contains a String with an even number of characters. Each pair of * characters represents a range of characters contained in the set * (a pair of the same character represents a single character). The * characters are sorted in increasing order. */class CharSet implements Cloneable { /** * The structure containing the set information. The characters * in this string are organized into pairs, each pair representing * a range of characters contained in the set */ private String chars; //========================================================================== // parseString() and associated routines //========================================================================== /** * A cache which is used to speed up parseString() whenever it is * used to parse a description that has been parsed before */ private static Hashtable expressionCache = null; /** * Builds a CharSet based on a textual description. For the syntax of * the description, see the documentation of RuleBasedBreakIterator. * @see java.text.RuleBasedBreakIterator */ public static CharSet parseString(String s) { CharSet result = null; // if "s" is in the expression cache, pull the result out // of the expresison cache if (expressionCache != null) { result = (CharSet)expressionCache.get(s); } // otherwise, use doParseString() to actually parse the string, // and then add a corresponding entry to the expression cache if (result == null) { result = doParseString(s); if (expressionCache == null) { expressionCache = new Hashtable(); } expressionCache.put(s, result); } result = (CharSet)(result.clone()); return result; } /** * This function is used by parseString() to actually parse the string */ private static CharSet doParseString(String s) { CharSet result = new CharSet(); int p = 0; boolean haveDash = false; boolean haveTilde = false; boolean wIsReal = false; char w = '\u0000'; // for each character in the description... while (p < s.length()) { char c = s.charAt(p); // if it's an opening bracket... if (c == '[') { // flush the single-character cache if (wIsReal) { result.internalUnion(new CharSet(w)); } // locate the matching closing bracket int bracketLevel = 1; int q = p + 1; while (bracketLevel != 0) { // if no matching bracket by end of string then... if (q >= s.length()) { throw new IllegalArgumentException("Parse error at position " + p + " in " + s); } switch (s.charAt(q)) { case '\\': // need to step over next character ++q; break; case '[': ++bracketLevel; break; case ']': --bracketLevel; break; } ++q; } --q; // call parseString() recursively to parse the text inside // the brackets, then either add or subtract the result from // our running result depending on whether or not the [] // expresison was preceded by a ^ if (!haveTilde) { result.internalUnion(CharSet.parseString(s.substring(p + 1, q))); } else { result.internalDifference(CharSet.parseString(s.substring(p + 1, q))); } haveTilde = false; haveDash = false; wIsReal = false; p = q + 1; } // if the character is a colon... else if (c == ':') { // flush the single-character cache if (wIsReal) { result.internalUnion(new CharSet(w)); } // locate the matching colon (and throw an error if there // isn't one) int q = s.indexOf(':', p + 1); if (q == -1) { throw new IllegalArgumentException("Parse error at position " + p + " in " + s); } // use charSetForCategory() to parse the text in the colons, // and either add or substract the result from our running // result depending on whether the :: expression was // preceded by a ^ if (!haveTilde) { result.internalUnion(charSetForCategory(s.substring(p + 1, q))); } else { result.internalDifference(charSetForCategory(s.substring(p + 1, q))); } // reset everything and advance to the next character haveTilde = false; haveDash = false; wIsReal = false; p = q + 1; } // if the character is a dash, set an appropriate flag else if (c == '-') { if (wIsReal) { haveDash = true; } ++p; } // if the character is a caret, flush the single-character // cache and set an appropriate flag. If the set is empty // (i.e., if the expression begins with ^), invert the set // (i.e., set it to include everything). The idea here is // that a set that includes nothing but ^ expressions // means "everything but these things". else if (c == '^') { if (wIsReal) { result.internalUnion(new CharSet(w)); wIsReal = false; } haveTilde = true; ++p; if (result.empty()) { result.internalComplement(); } } // throw an exception on an illegal character else if (c >= ' ' && c < '\u007f' && !Character.isLetter(c) && !Character.isDigit(c) && c != '\\') { throw new IllegalArgumentException("Parse error at position " + p + " in " + s); } // otherwise, we end up here... else { // on a backslash, advance to the next character if (c == '\\') { ++p; } // if the preceding character was a dash, this character // defines the end of a range. Add or subtract that range // from the running result depending on whether or not it // was preceded by a ^ if (haveDash) { if (s.charAt(p) < w) { throw new IllegalArgumentException("U+" + Integer.toHexString(s.charAt(p)) + " is less than U+" + Integer.toHexString(w) + ". Dash expressions " + "can't have their endpoints in reverse order."); } if (!haveTilde) { result.internalUnion(new CharSet(w, s.charAt(p++))); } else { result.internalDifference(new CharSet(w, s.charAt(p++))); } haveDash = false; haveTilde = false; wIsReal = false; } // if the preceding character was a caret, remove this character // from the running result else if (haveTilde) { result.internalDifference(new CharSet(s.charAt(p++))); haveTilde = false; wIsReal = false; } // otherwise, flush the single-character cache and then // put this character into the cache else if (wIsReal) { result.internalUnion(new CharSet(w)); w = s.charAt(p++); wIsReal = true; } else { w = s.charAt(p++); wIsReal = true; } } } // finally, flush the single-character cache one last time if (wIsReal) { result.internalUnion(new CharSet(w)); } return result; } /** * Creates a CharSet containing all the characters in a particular * Unicode category. The text is either a two-character code from * the Unicode database or a single character that begins one or more
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -