decompositioniterator.java

来自「《移动Agent技术》一书的所有章节源代码。」· Java 代码 · 共 1,062 行 · 第 1/5 页

JAVA
1,062
字号
/*
 * @(#)DecompositionIterator.java	1.18 97/12/05
 *
 * (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - All Rights Reserved
 *
 * Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
 *
 *   The original version of this source code and documentation is copyrighted
 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
 * materials are provided under terms of a License Agreement between Taligent
 * and Sun. This technology is protected by multiple US and International
 * patents. This notice and attribution to Taligent may not be removed.
 *   Taligent is a registered trademark of Taligent, Inc.
 *
 * Permission to use, copy, modify, and distribute this software
 * and its documentation for NON-COMMERCIAL purposes and without
 * fee is hereby granted provided that this copyright notice
 * appears in all copies. Please refer to the file "copyright.html"
 * for further important copyright and licensing information.
 *
 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
 * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
 * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
 * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
 * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 */

package java.text;

/**
  * Contains statics for decomposing strings.
  * Eventually ought to be in either Char or String.  This class is only
  * for internal use for now.
  * <p>
  * Unicode defines a set of non-spacing marks that can be combined with
  * base characters to form a single "conceptual" character.  Often these
  * combinations have pre-composed equivalents elsewhere in Unicode.  For
  * operations such as locale-sensitive text comparison, it is necessary to
  * be able to decompose a Unicode character into an equivalent string.
  * The composition operation is also necessary.  It is proposed that at
  * least the decompose function be added as a static method on String, or
  * StringBuffer or Character.
  * @see        Collator
  * @see        RuleBasedCollator
  * @version    1.18 12/05/97
  * @author     Mark Davis, Helena Shih
*/
class DecompositionIterator {
    /**
     * Sentinel value returned by the iterator once its cursor has moved
     * past the end of the string.
     */
    public static final char NULLORDER = '\uffff';
    /**
     * Creates an iterator over a range of a string.
     *
     * @param source the string to iterate over
     * @param start  index in {@code source} at which iteration begins
     * @param end    index in {@code source} at which iteration stops (exclusive)
     * @param mode   the decomposition mode; see the mode constants of
     *               {@link Collator}
     */
    public DecompositionIterator(String source, int start, int end, int mode) {
        // The text and the range being iterated; the cursor begins at the
        // start of that range.
        this.str = source;
        this.getBeginIndex = start;
        this.getEndIndex = end;
        this.sIndex = start;

        // Characters whose start offset lies below the limit get decomposed
        // by next(); the limit depends on the requested mode.
        this.decmpMode = mode;
        if (mode == Collator.CANONICAL_DECOMPOSITION) {
            this.decmpLimit = maximumCanonical;
        } else {
            this.decmpLimit = SHORT_MAX_VALUE;
        }

        // The decomposition buffer is allocated lazily by next().
        this.parsedStr = null;
        this.pIndex = 0;
        this.backedUp = false;
    }

    /**
     * Creates an iterator over the whole of a string.
     *
     * @param source the string to iterate over
     * @param mode   the decomposition mode
     */
    public DecompositionIterator(String source, int mode) {
        this(source, 0, source.length(), mode);
    }

    /**
     * Set the decomposition mode.
     * <p>
     * The decomposition limit consulted by {@code next()} is recomputed from
     * the new mode, using the same rule as the constructor, so that the
     * change actually affects which characters get decomposed.
     *
     * @param mode the new decomposition mode.
     * @see Collator
     * @see RuleBasedCollator
     */
    public void setDecomposition(int mode)
    {
        decmpMode = mode;
        // Keep the limit in step with the mode; otherwise next() would keep
        // using the threshold chosen for the mode given at construction time.
        decmpLimit = (mode == Collator.CANONICAL_DECOMPOSITION) ?
            maximumCanonical : SHORT_MAX_VALUE;
    }

    /**
     * Sets the string to the new source string and restarts iteration at
     * its beginning, covering the whole string.
     * @param source            the new source string
     */
    public void setText(String source)
    {
        str = source;
        // The buffer is created lazily by next(), so it may not exist yet
        // if nothing has been decomposed so far.
        if (parsedStr != null) {
            parsedStr.setLength(0);
        }
        sIndex = 0;
        pIndex = 0;
        getBeginIndex = 0;
        getEndIndex = source.length();
        // Discard state left over from the previous text: a pending
        // decomposition would make next() read from the now-empty buffer,
        // and a pending backed-up character belongs to the old string.
        decomposing = false;
        backedUp = false;
    }

    /**
     * Get the next character.
     */
    public char next() {
        char ch;
        int index = 0;     // index into contents table
        byte type;         // canonicalization type
        char result;

        if (backedUp) {
            // Immediately after a call to previous(), just return the char it saved
            result = prevChar;
            backedUp = false;
        }
        else if (decmpMode == Collator.NO_DECOMPOSITION) {
            // If we're not really doing decomposition, just return the current char
            result = sIndex < getEndIndex ? str.charAt(sIndex++) : NULLORDER;
        }
        else if (decomposing) {
            // push out previously decomposed characters
            result = parsedStr.charAt(pIndex++);
            if (pIndex >= parsedStr.length()) {
                decomposing = false;
            }
        }
        else if (sIndex >= getEndIndex) {
            // Past the end of the string
            result = NULLORDER;
        }
        else {
            // We have to fetch the next character from the string and then
            // try to decompose it
            ch = str.charAt(sIndex++);

            // First, retrieve this character's index.  This is an index into
            // the "contents" array, which contains a sequence of characters
            // terminated by kSTerminator for each decomposing character.
            // All of the sequences are packed into the one big "contents" array
            // in order to save space.
            index = startOffsets.elementAt(ch);

            // The "canonicals" array stores the canonical ordering for each combining
            // character in Unicode.  These orderings are used to sort the diacritical
            // marks after a base character into a canonical order.  If a given Unicode
            // character is *not* a combining character, its entry is BASE

            if (index >= decmpLimit &&                      // no decomposition
                      canonicals.elementAt(ch) == BASE) {   // no canonicalization
                result = ch;
            }
            else {
                // We have to decompose, canonicalize or both.
                // Do so to the shortest possible substring.
                //
                // The length of the parsed string is unbounded, since
                // the input string could have an arbitrary number
                // of adjacent non-BASE characters. In practice,
                // however, a base character will be followed by very
                // few non-base characters. For example, a base
                // character may be followed by diacritical marks.
                // These are limited in number, most frequently one,
                // rarely more than two. Therefore the default
                // StringBuffer size is adequate for all normal
                // sequences but will expand as needed for
                // pathological input.
                //
                // Refer to the following sections of The Unicode
                // Standard, Version 2.0
                //
                // Section 2.5 Combining Characters
                // Section 3.6 Decomposition
                // Section 3.9 Canonical Ordering Behavior
                // Section 4.2 Combining Classes
                // Section 5.9 Normalization
                // Section 5.15 Sorting and Searching
                //
                if (parsedStr == null) {
                    parsedStr = new StringBuffer();
                } else {
                    parsedStr.setLength(0);
                }

                if (index < decmpLimit) {       // Decompose
                          /* decompose */
                    while((ch = contents.charAt(index++)) != '\u0000') {
                        parsedStr.append(ch);
                    }
                } else {                        // Combining character
                    parsedStr.append(ch);
                }

                // Any other combining characters that immediately follow the decomposed
                // character must be included in the buffer too, because they're
                // conceptually part of the same logical character.
                while (sIndex < getEndIndex &&
                            canonicals.elementAt(ch = str.charAt(sIndex)) != BASE) {
                    parsedStr.append(ch);
                    sIndex++;
                }

                if (parsedStr.length() > 1) {
                    // If there is more than one combining character in the buffer,
                    // put them into the canonical order.
                    fixCanonical(parsedStr);
                    pIndex = 1;
                    decomposing = true;
                } else {
                    decomposing = false;
                }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?