📄 dictionarybasedbreakiterator.java

📁 java源代码请看看啊提点宝贵的意见
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * @(#)DictionaryBasedBreakIterator.java	1.10 03/01/23 * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. *//* * @(#)DictionaryBasedBreakIterator.java	1.3 99/05/03 * * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved * * The original version of this source code and documentation * is copyrighted and owned by Taligent, Inc., a wholly-owned * subsidiary of IBM. These materials are provided under terms * of a License Agreement between Taligent and Sun. This technology * is protected by multiple US and International patents. * * This notice and attribution to Taligent may not be removed. * Taligent is a registered trademark of Taligent, Inc. */package java.text;import java.util.Vector;import java.util.Stack;import java.util.Hashtable;import java.text.CharacterIterator;import java.io.InputStream;import java.io.IOException;/** * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary * to further subdivide ranges of text beyond what is possible using just the * state-table-based algorithm.  This is necessary, for example, to handle * word and line breaking in Thai, which doesn't use spaces between words.  The * state-table-based algorithm used by RuleBasedBreakIterator is used to divide * up text as far as possible, and then contiguous ranges of letters are * repeatedly compared against a list of known words (i.e., the dictionary) * to divide them up into words. * * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution * name is used to identify characters in words in the dictionary.  The idea is that * if the iterator passes over a chunk of text that includes two or more characters * in a row that are included in &lt;dictionary&gt;, it goes back through that range and * derives additional break positions (if possible) using the dictionary. * * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary * file.  It follows a prescribed search path to locate the dictionary (right now, * it looks for it in /com/ibm/text/resources in each directory in the classpath, * and won't find it in JAR files, but this location is likely to change).  The * dictionary file is in a serialized binary format.  We have a very primitive (and * slow) BuildDictionaryFile utility for creating dictionary files, but aren't * currently making it public.  Contact us for help. */class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {    /**     * a list of known words that is used to divide up contiguous ranges of letters,     * stored in a compressed, indexed, format that offers fast access     */    private BreakDictionary dictionary;    /**     * a list of flags indicating which character categories are contained in     * the dictionary file (this is used to determine which ranges of characters     * to apply the dictionary to)     */    private boolean[] categoryFlags;    /**     * a temporary hiding place for the number of dictionary characters in the     * last range passed over by next()     */    private int dictionaryCharCount;    /**     * when a range of characters is divided up using the dictionary, the break     * positions that are discovered are stored here, preventing us from having     * to use either the dictionary or the state table again until the iterator     * leaves this range of text     */    private int[] cachedBreakPositions;    /**     * if cachedBreakPositions is not null, this indicates which item in the     * cache the current iteration position refers to     */    private int positionInCache;    /**     * Constructs a DictionaryBasedBreakIterator.     * @param description Same as the description parameter on RuleBasedBreakIterator,     * except for the special meaning of "<dictionary>".  This parameter is just     * passed through to RuleBasedBreakIterator's constructor.     * @param dictionaryFilename The filename of the dictionary file to use     */    public DictionaryBasedBreakIterator(String description,                                        InputStream dictionaryStream) throws IOException {        super(description);        dictionary = new BreakDictionary(dictionaryStream);    }    /**     * Returns a Builder that is customized to build a DictionaryBasedBreakIterator.     * This is the same as RuleBasedBreakIterator.Builder, except for the extra code     * to handle the <dictionary> tag.     */    protected RuleBasedBreakIterator.Builder makeBuilder() {        return new Builder();    }    public void setText(CharacterIterator newText) {        super.setText(newText);        cachedBreakPositions = null;        dictionaryCharCount = 0;        positionInCache = 0;    }    /**     * Sets the current iteration position to the beginning of the text.     * (i.e., the CharacterIterator's starting offset).     * @return The offset of the beginning of the text.     */    public int first() {        cachedBreakPositions = null;        dictionaryCharCount = 0;        positionInCache = 0;        return super.first();    }    /**     * Sets the current iteration position to the end of the text.     * (i.e., the CharacterIterator's ending offset).     * @return The text's past-the-end offset.     */    public int last() {        cachedBreakPositions = null;        dictionaryCharCount = 0;        positionInCache = 0;        return super.last();    }    /**     * Advances the iterator one step backwards.     * @return The position of the last boundary position before the     * current iteration position     */    public int previous() {        CharacterIterator text = getText();        // if we have cached break positions and we're still in the range        // covered by them, just move one step backward in the cache        if (cachedBreakPositions != null && positionInCache > 0) {            --positionInCache;            text.setIndex(cachedBreakPositions[positionInCache]);            return cachedBreakPositions[positionInCache];        }        // otherwise, dump the cache and use the inherited previous() method to move        // backward.  This may fill up the cache with new break positions, in which        // case we have to mark our position in the cache        else {            cachedBreakPositions = null;            int result = super.previous();            if (cachedBreakPositions != null)                positionInCache = cachedBreakPositions.length - 2;            return result;        }    }    /**     * Sets the current iteration position to the last boundary position     * before the specified position.     * @param offset The position to begin searching from     * @return The position of the last boundary before "offset"     */    public int preceding(int offset) {        CharacterIterator text = getText();        checkOffset(offset, text);        // if we have no cached break positions, or "offset" is outside the        // range covered by the cache, we can just call the inherited routine        // (which will eventually call other routines in this class that may        // refresh the cache)        if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||                offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {            cachedBreakPositions = null;            return super.preceding(offset);        }        // on the other hand, if "offset" is within the range covered by the cache,        // then all we have to do is search the cache for the last break position        // before "offset"        else {            positionInCache = 0;            while (positionInCache < cachedBreakPositions.length                   && offset > cachedBreakPositions[positionInCache])                ++positionInCache;            --positionInCache;            text.setIndex(cachedBreakPositions[positionInCache]);            return text.getIndex();        }    }    /**     * Sets the current iteration position to the first boundary position after     * the specified position.     * @param offset The position to begin searching forward from     * @return The position of the first boundary after "offset"     */    public int following(int offset) {        CharacterIterator text = getText();        checkOffset(offset, text);        // if we have no cached break positions, or if "offset" is outside the        // range covered by the cache, then dump the cache and call our        // inherited following() method.  This will call other methods in this        // class that may refresh the cache.        if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||                offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {            cachedBreakPositions = null;            return super.following(offset);        }        // on the other hand, if "offset" is within the range covered by the        // cache, then just search the cache for the first break position        // after "offset"        else {            positionInCache = 0;            while (positionInCache < cachedBreakPositions.length                   && offset >= cachedBreakPositions[positionInCache])                ++positionInCache;            text.setIndex(cachedBreakPositions[positionInCache]);            return text.getIndex();        }    }    /**     * This is the implementation function for next().     */    protected int handleNext() {        CharacterIterator text = getText();        // if there are no cached break positions, or if we've just moved        // off the end of the range covered by the cache, we have to dump        // and possibly regenerate the cache        if (cachedBreakPositions == null || 	    positionInCache == cachedBreakPositions.length - 1) {            // start by using the inherited handleNext() to find a tentative return            // value.   dictionaryCharCount tells us how many dictionary characters            // we passed over on our way to the tentative return value            int startPos = text.getIndex();            dictionaryCharCount = 0;            int result = super.handleNext();            // if we passed over more than one dictionary character, then we use            // divideUpDictionaryRange() to regenerate the cached break positions            // for the new range            if (dictionaryCharCount > 1 && result - startPos > 1) {                divideUpDictionaryRange(startPos, result);            }            // otherwise, the value we got back from the inherited fuction            // is our return value, and we can dump the cache            else {                cachedBreakPositions = null;                return result;            }        }        // if the cache of break positions has been regenerated (or existed all        // along), then just advance to the next break position in the cache        // and return it        if (cachedBreakPositions != null) {            ++positionInCache;            text.setIndex(cachedBreakPositions[positionInCache]);
12 下一页
💿 文件大小 245 K
👤 上传用户 liu2000dz
📂 所属分类 Java编程
🏷️ 相关标签

#java #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -