📄 dictionarybasedbreakiterator.java
字号:
/*
 * @(#)DictionaryBasedBreakIterator.java 1.10 03/01/23
 *
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */

/*
 * @(#)DictionaryBasedBreakIterator.java 1.3 99/05/03
 *
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 */

package java.text;

import java.util.Vector;
import java.util.Stack;
import java.util.Hashtable;
import java.text.CharacterIterator;
import java.io.InputStream;
import java.io.IOException;

/**
 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
 * to further subdivide ranges of text beyond what is possible using just the
 * state-table-based algorithm.  This is necessary, for example, to handle
 * word and line breaking in Thai, which doesn't use spaces between words.  The
 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
 * up text as far as possible, and then contiguous ranges of letters are
 * repeatedly compared against a list of known words (i.e., the dictionary)
 * to divide them up into words.
 *
 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
 * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
 * name is used to identify characters in words in the dictionary.  The idea is that
 * if the iterator passes over a chunk of text that includes two or more characters
 * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
 * derives additional break positions (if possible) using the dictionary.
 *
 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
 * file.  It follows a prescribed search path to locate the dictionary (right now,
 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
 * and won't find it in JAR files, but this location is likely to change).  The
 * dictionary file is in a serialized binary format.  We have a very primitive (and
 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
 * currently making it public.  Contact us for help.
 *
 * NOTE(review): the constructor declared in this class takes an InputStream rather
 * than a filename, so the filename/search-path paragraph above may be out of
 * date -- confirm against the callers that open the dictionary.
 */
class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {

    /**
     * a list of known words that is used to divide up contiguous ranges of letters,
     * stored in a compressed, indexed, format that offers fast access
     */
    private BreakDictionary dictionary;

    /**
     * a list of flags indicating which character categories are contained in
     * the dictionary file (this is used to determine which ranges of characters
     * to apply the dictionary to)
     */
    private boolean[] categoryFlags;

    /**
     * a temporary hiding place for the number of dictionary characters in the
     * last range passed over by next().  handleNext() zeroes it before delegating
     * to the inherited handleNext() and afterwards applies the dictionary only if
     * the count is greater than one.
     */
    private int dictionaryCharCount;

    /**
     * when a range of characters is divided up using the dictionary, the break
     * positions that are discovered are stored here, preventing us from having
     * to use either the dictionary or the state table again until the iterator
     * leaves this range of text.  A null value means there is no cache.
     */
    private int[] cachedBreakPositions;

    /**
     * if cachedBreakPositions is not null, this indicates which item in the
     * cache the current iteration position refers to
     */
    private int positionInCache;

    /**
     * Constructs a DictionaryBasedBreakIterator.
* @param description Same as the description parameter on RuleBasedBreakIterator, * except for the special meaning of "<dictionary>". This parameter is just * passed through to RuleBasedBreakIterator's constructor. * @param dictionaryFilename The filename of the dictionary file to use */ public DictionaryBasedBreakIterator(String description, InputStream dictionaryStream) throws IOException { super(description); dictionary = new BreakDictionary(dictionaryStream); } /** * Returns a Builder that is customized to build a DictionaryBasedBreakIterator. * This is the same as RuleBasedBreakIterator.Builder, except for the extra code * to handle the <dictionary> tag. */ protected RuleBasedBreakIterator.Builder makeBuilder() { return new Builder(); } public void setText(CharacterIterator newText) { super.setText(newText); cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0; } /** * Sets the current iteration position to the beginning of the text. * (i.e., the CharacterIterator's starting offset). * @return The offset of the beginning of the text. */ public int first() { cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0; return super.first(); } /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). * @return The text's past-the-end offset. */ public int last() { cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0; return super.last(); } /** * Advances the iterator one step backwards. 
* @return The position of the last boundary position before the * current iteration position */ public int previous() { CharacterIterator text = getText(); // if we have cached break positions and we're still in the range // covered by them, just move one step backward in the cache if (cachedBreakPositions != null && positionInCache > 0) { --positionInCache; text.setIndex(cachedBreakPositions[positionInCache]); return cachedBreakPositions[positionInCache]; } // otherwise, dump the cache and use the inherited previous() method to move // backward. This may fill up the cache with new break positions, in which // case we have to mark our position in the cache else { cachedBreakPositions = null; int result = super.previous(); if (cachedBreakPositions != null) positionInCache = cachedBreakPositions.length - 2; return result; } } /** * Sets the current iteration position to the last boundary position * before the specified position. * @param offset The position to begin searching from * @return The position of the last boundary before "offset" */ public int preceding(int offset) { CharacterIterator text = getText(); checkOffset(offset, text); // if we have no cached break positions, or "offset" is outside the // range covered by the cache, we can just call the inherited routine // (which will eventually call other routines in this class that may // refresh the cache) if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] || offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { cachedBreakPositions = null; return super.preceding(offset); } // on the other hand, if "offset" is within the range covered by the cache, // then all we have to do is search the cache for the last break position // before "offset" else { positionInCache = 0; while (positionInCache < cachedBreakPositions.length && offset > cachedBreakPositions[positionInCache]) ++positionInCache; --positionInCache; text.setIndex(cachedBreakPositions[positionInCache]); return 
text.getIndex(); } } /** * Sets the current iteration position to the first boundary position after * the specified position. * @param offset The position to begin searching forward from * @return The position of the first boundary after "offset" */ public int following(int offset) { CharacterIterator text = getText(); checkOffset(offset, text); // if we have no cached break positions, or if "offset" is outside the // range covered by the cache, then dump the cache and call our // inherited following() method. This will call other methods in this // class that may refresh the cache. if (cachedBreakPositions == null || offset < cachedBreakPositions[0] || offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { cachedBreakPositions = null; return super.following(offset); } // on the other hand, if "offset" is within the range covered by the // cache, then just search the cache for the first break position // after "offset" else { positionInCache = 0; while (positionInCache < cachedBreakPositions.length && offset >= cachedBreakPositions[positionInCache]) ++positionInCache; text.setIndex(cachedBreakPositions[positionInCache]); return text.getIndex(); } } /** * This is the implementation function for next(). */ protected int handleNext() { CharacterIterator text = getText(); // if there are no cached break positions, or if we've just moved // off the end of the range covered by the cache, we have to dump // and possibly regenerate the cache if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) { // start by using the inherited handleNext() to find a tentative return // value. 
dictionaryCharCount tells us how many dictionary characters // we passed over on our way to the tentative return value int startPos = text.getIndex(); dictionaryCharCount = 0; int result = super.handleNext(); // if we passed over more than one dictionary character, then we use // divideUpDictionaryRange() to regenerate the cached break positions // for the new range if (dictionaryCharCount > 1 && result - startPos > 1) { divideUpDictionaryRange(startPos, result); } // otherwise, the value we got back from the inherited fuction // is our return value, and we can dump the cache else { cachedBreakPositions = null; return result; } } // if the cache of break positions has been regenerated (or existed all // along), then just advance to the next break position in the cache // and return it if (cachedBreakPositions != null) { ++positionInCache; text.setIndex(cachedBreakPositions[positionInCache]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -