decompositioniterator.java
来自「《移动Agent技术》一书的所有章节源代码。」· Java 代码 · 共 1,062 行 · 第 1/5 页
JAVA
1,062 行
/*
* @(#)DecompositionIterator.java 1.18 97/12/05
*
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
*
* Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
*
* The original version of this source code and documentation is copyrighted
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
* materials are provided under terms of a License Agreement between Taligent
* and Sun. This technology is protected by multiple US and International
* patents. This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
*/
package java.text;
/**
* Contains statics for decomposing strings.
* Eventually ought to be in either Char or String. This class is only
* for internal use for now.
* <p>
* Unicode defines a set of non-spacing marks that can be combined with
* base characters to form a single "conceptual" character. Often these
* combinations have pre-composed equivalents elsewhere in Unicode. For
* operations such as locale-sensitive text comparison, it is necessary to
* be able to decompose a Unicode character into an equivalent string.
* The composition operation is also necessary. It is proposed that at
* least the decompose function be added as a static method on String, or
* StringBuffer or Character.
* @see Collator
* @see RuleBasedCollator
* @version 1.18 12/05/97
* @author Mark Davis, Helena Shih
*/
class DecompositionIterator {
/**
* Sentinel value returned by next() when the cursor has reached the end
* of the text being iterated over, i.e. there are no more characters
* to return.
*/
public final static char NULLORDER = 0xffff;
/**
* Creates a decomposing iterator over a range of a string.
* @param source the string to iterate over
* @param start index of the first character to be returned; the cursor
* is initially positioned here
* @param end index at which iteration stops (exclusive)
* @param mode the decomposition mode, one of the decomposition constants
* defined by Collator
* @see Collator
*/
public DecompositionIterator(String source, int start, int end, int mode) {
    // Text and the range of it that is being iterated over.
    str = source;
    getBeginIndex = start;
    getEndIndex = end;

    // Both cursors start at the beginning: sIndex walks the source
    // string, pIndex walks the buffer of already-decomposed characters.
    sIndex = start;
    pIndex = 0;

    // The buffer itself is only allocated once a character actually
    // needs to be decomposed or reordered.
    parsedStr = null;
    backedUp = false;

    // The limit is the cut-off applied to a character's start offset:
    // offsets below it are decomposed, offsets at or above it are not.
    decmpMode = mode;
    if (mode == Collator.CANONICAL_DECOMPOSITION) {
        decmpLimit = maximumCanonical;
    } else {
        decmpLimit = SHORT_MAX_VALUE;
    }
}
/**
* Creates a decomposing iterator over an entire string. Equivalent to
* calling the four-argument constructor with a range of 0 to
* source.length().
* @param source the string to iterate over; must not be null, since its
* length is read here
* @param mode the decomposition mode, passed through unchanged
*/
public DecompositionIterator(String source, int mode)
{
this(source, 0, source.length(), mode);
}
/**
* Set the decomposition mode.
* <p>
* The decomposition limit used by next() is recomputed from the new mode,
* exactly as the constructor does, so that the mode and the limit never
* disagree.
* @param mode the new decomposition mode.
* @see Collator
* @see RuleBasedCollator
*/
public void setDecomposition(int mode)
{
    decmpMode = mode;
    // next() decides whether a character is decomposed by comparing its
    // start offset against decmpLimit, so the limit has to follow the mode.
    decmpLimit = (mode == Collator.CANONICAL_DECOMPOSITION) ?
        maximumCanonical : SHORT_MAX_VALUE;
}
/**
* Sets the string to the new source string and rewinds the iterator so
* that the next call to next() starts at the first character of the new
* string. Any state left over from iterating the previous string
* (buffered decomposed characters, a pending backed-up character) is
* discarded.
* @param source the new source string; must not be null
*/
public void setText(String source)
{
    // Read the length first so that a null source fails before any of
    // the iterator's state has been modified.
    int length = source.length();

    str = source;
    getBeginIndex = 0;
    getEndIndex = length;
    sIndex = 0;
    pIndex = 0;

    // The buffer is allocated lazily by next(), so it is still null if
    // no character has needed decomposing yet.
    if (parsedStr != null) {
        parsedStr.setLength(0);
    }

    // The buffer is now empty, so next() must not try to drain it, and a
    // character saved from the old text must not be replayed.
    decomposing = false;
    backedUp = false;
}
/**
* Get the next character.
*/
public char next() {
char ch;
int index = 0; // index into contents table
byte type; // canonicalization type
char result;
if (backedUp) {
// Immediately after a call to previous(), just return the char it saved
result = prevChar;
backedUp = false;
}
else if (decmpMode == Collator.NO_DECOMPOSITION) {
// If we're not really doing decomposition, just return the current char
result = sIndex < getEndIndex ? str.charAt(sIndex++) : NULLORDER;
}
else if (decomposing) {
// push out previously decomposed characters
result = parsedStr.charAt(pIndex++);
if (pIndex >= parsedStr.length()) {
decomposing = false;
}
}
else if (sIndex >= getEndIndex) {
// Past the end of the string
result = NULLORDER;
}
else {
// We have to fetch the next character from the string and then
// try to decompose it
ch = str.charAt(sIndex++);
// First, retrieve this character's index. This is an index into
// the "contents" array, which contains a sequence of characters
// terminated by kSTerminator for each decomposing character.
// All of the sequences are packed into the one big "contents" array
// in order to save space.
index = startOffsets.elementAt(ch);
// The "canonicals" array stores the canonical ordering for each combining
// character in Unicode. These orderings are used to sort the diacritical
// marks after a base character into a canonical order. If a given Unicode
// character is *not* a combining character, its entry is BASE
if (index >= decmpLimit && // no decomposition
canonicals.elementAt(ch) == BASE) { // no canonicalization
result = ch;
}
else {
// We have to decompose, canonicalize or both.
// Do so to the shortest possible substring.
//
// The length of the parsed string is unbounded, since
// the input string could have an arbitrary number
// of adjacent non-BASE characters. In practice,
// however, a base character will be followed by very
// few non-base characters. For example, a base
// character may be followed by diacritical marks.
// These are limited in number, most frequently one,
// rarely more than two. Therefore the default
// StringBuffer size is adequate for all normal
// sequences but will expand as needed for
// pathological input.
//
// Refer to the following sections of The Unicode
// Standard, Version 2.0
//
// Section 2.5 Combining Characters ,
// Section 3.6 Decomposition
// Section 3.9 Canonical Ordering Behavior
// Section 4.2 Combining Classes
// Section 5.9 Normalization
// Section 5.15 Sorting and Searching
//
if (parsedStr == null) {
parsedStr = new StringBuffer();
} else {
parsedStr.setLength(0);
}
if (index < decmpLimit) { // Decompose
/* decompose */
while((ch = contents.charAt(index++)) != '\u0000') {
parsedStr.append(ch);
}
} else { // Combining character
parsedStr.append(ch);
}
// Any other combining characters that immediately follow the decomposed
// character must be included in the buffer too, because they're
// conceptually part of the same logical character.
while (sIndex < getEndIndex &&
canonicals.elementAt(ch = str.charAt(sIndex)) != BASE) {
parsedStr.append(ch);
sIndex++;
}
if (parsedStr.length() > 1) {
// If there is more than one combining character in the buffer,
// put them into the canonical order.
fixCanonical(parsedStr);
pIndex = 1;
decomposing = true;
} else {
decomposing = false;
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?