📄 dictionarybasedbreakiterator.java

📁 java源代码请看看啊提点宝贵的意见
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
            return cachedBreakPositions[positionInCache];
        }
        return -9999;   // SHOULD NEVER GET HERE!
    }

    /**
     * Looks up a character category for a character.
     *
     * @param c the character to classify
     * @return the category reported by the inherited lookupCategory()
     */
    protected int lookupCategory(char c) {
        // this override of lookupCategory() exists only to keep track of whether we've
        // passed over any dictionary characters.  It calls the inherited lookupCategory()
        // to do the real work, and then checks whether its return value is one of the
        // categories represented in the dictionary.  If it is, bump the dictionary-
        // character count.
        int result = super.lookupCategory(c);
        if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
            ++dictionaryCharCount;
        }
        return result;
    }

    /**
     * This is the function that actually implements the dictionary-based
     * algorithm.  Given the endpoints of a range of text, it uses the
     * dictionary to determine the positions of any boundaries in this
     * range.  It stores all the boundary positions it discovers in
     * cachedBreakPositions so that we only have to do this work once
     * for each time we enter the range.
     *
     * On return, cachedBreakPositions[0] is startPos, the last entry is
     * endPos, and positionInCache has been reset to 0.
     *
     * @param startPos the text index at which the range begins
     * @param endPos the text index at which the range ends
     */
    private void divideUpDictionaryRange(int startPos, int endPos) {
        CharacterIterator text = getText();

        // the range we're dividing may begin or end with non-dictionary characters
        // (i.e., for line breaking, we may have leading or trailing punctuation
        // that needs to be kept with the word).  Seek from the beginning of the
        // range to the first dictionary character.  (Note that lookupCategory()
        // also bumps dictionaryCharCount when it sees a dictionary character.)
        text.setIndex(startPos);
        char c = text.current();
        int category = lookupCategory(c);
        while (category == IGNORE || !categoryFlags[category]) {
            c = text.next();
            category = lookupCategory(c);
        }

        // initialize.  We maintain two stacks: currentBreakPositions contains
        // the list of break positions that will be returned if we successfully
        // finish traversing the whole range now.  possibleBreakPositions lists
        // all other possible word ends we've passed along the way.  (Whenever
        // we reach an error [a sequence of characters that can't begin any word
        // in the dictionary], we back up, possibly delete some breaks from
        // currentBreakPositions, move a break from possibleBreakPositions
        // to currentBreakPositions, and start over from there.  This process
        // continues in this way until we either successfully make it all the way
        // across the range, or exhaust all of our combinations of break
        // positions.)
        Stack currentBreakPositions = new Stack();
        Stack possibleBreakPositions = new Stack();
        Vector wrongBreakPositions = new Vector();

        // the dictionary is implemented as a trie, which is treated as a state
        // machine.  -1 represents the end of a legal word.  Every word in the
        // dictionary is represented by a path from the root node to -1.  A path
        // that ends in state 0 is an illegal combination of characters.
        int state = 0;

        // these two variables are used for error handling.  We keep track of the
        // farthest we've gotten through the range being divided, and the combination
        // of breaks that got us that far.  If we use up all possible break
        // combinations, the text contains an error or a word that's not in the
        // dictionary.  In this case, we "bless" the break positions that got us the
        // farthest as real break positions, and then start over from scratch with
        // the character where the error occurred.
        int farthestEndPoint = text.getIndex();
        Stack bestBreakPositions = null;

        // initialize (we always exit the loop with a break statement)
        c = text.current();
        while (true) {

            // if we can transition to state "-1" from our current state, we're
            // on the last character of a legal word.  Push that position onto
            // the possible-break-positions stack
            if (dictionary.at(state, 0) == -1) {
                possibleBreakPositions.push(new Integer(text.getIndex()));
            }

            // look up the new state to transition to in the dictionary
            state = dictionary.at(state, c);

            // if the character we're sitting on causes us to transition to
            // the "end of word" state, then it was a non-dictionary character
            // and we've successfully traversed the whole range.  Drop out
            // of the loop.
            if (state == -1) {
                currentBreakPositions.push(new Integer(text.getIndex()));
                break;
            }

            // if the character we're sitting on causes us to transition to
            // the error state, or if we've gone off the end of the range
            // without transitioning to the "end of word" state, we've hit
            // an error...
            else if (state == 0 || text.getIndex() >= endPos) {

                // if this is the farthest we've gotten, take note of it in
                // case there's an error in the text
                if (text.getIndex() > farthestEndPoint) {
                    farthestEndPoint = text.getIndex();
                    bestBreakPositions = (Stack)(currentBreakPositions.clone());
                }

                // wrongBreakPositions is a list of all break positions
                // we've tried starting that didn't allow us to traverse
                // all the way through the text.  Every time we pop a
                // break position off of currentBreakPositions, we put it
                // into wrongBreakPositions to avoid trying it again later.
                // If we make it to this spot, we're either going to back
                // up to a break in possibleBreakPositions and try starting
                // over from there, or we've exhausted all possible break
                // positions and are going to do the fallback procedure.
                // This loop prevents us from messing with anything in
                // possibleBreakPositions that didn't work as a starting
                // point the last time we tried it (this is to prevent a bunch of
                // repetitive checks from slowing down some extreme cases)
                while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
                            possibleBreakPositions.peek())) {
                    possibleBreakPositions.pop();
                }

                // if we've used up all possible break-position combinations, there's
                // an error or an unknown word in the text.  In this case, we start
                // over, treating the farthest character we've reached as the beginning
                // of the range, and "blessing" the break positions that got us that
                // far as real break positions
                if (possibleBreakPositions.isEmpty()) {
                    if (bestBreakPositions != null) {
                        currentBreakPositions = bestBreakPositions;
                        if (farthestEndPoint < endPos) {
                            text.setIndex(farthestEndPoint + 1);
                        }
                        else {
                            break;
                        }
                    }
                    else {
                        if ((currentBreakPositions.size() == 0 ||
                             ((Integer)(currentBreakPositions.peek())).intValue() != text.getIndex())
                            && text.getIndex() != startPos) {
                            currentBreakPositions.push(new Integer(text.getIndex()));
                        }
                        text.next();
                        currentBreakPositions.push(new Integer(text.getIndex()));
                    }
                }

                // if we still have more break positions we can try, then promote the
                // last break in possibleBreakPositions into currentBreakPositions,
                // and get rid of all entries in currentBreakPositions that come after
                // it.  Then back up to that position and start over from there (i.e.,
                // treat that position as the beginning of a new word)
                else {
                    Integer temp = (Integer)possibleBreakPositions.pop();
                    while (!currentBreakPositions.isEmpty() && temp.intValue() <
                           ((Integer)currentBreakPositions.peek()).intValue()) {
                        // every discarded break is remembered so it is not retried
                        wrongBreakPositions.addElement(currentBreakPositions.pop());
                    }
                    currentBreakPositions.push(temp);
                    text.setIndex(((Integer)currentBreakPositions.peek()).intValue());
                }

                // re-sync "c" for the next go-round, and drop out of the loop if
                // we've made it off the end of the range
                c = text.current();
                if (text.getIndex() >= endPos) {
                    break;
                }
            }

            // if we didn't hit any exceptional conditions on this last iteration,
            // just advance to the next character and loop
            else {
                c = text.next();
            }
        }

        // dump the last break position in the list, and replace it with the actual
        // end of the range (which may be the same character, or may be further on
        // because the range actually ended with non-dictionary characters we want to
        // keep with the word)
        if (!currentBreakPositions.isEmpty()) {
            currentBreakPositions.pop();
        }
        currentBreakPositions.push(new Integer(endPos));

        // create a regular array to hold the break positions and copy
        // the break positions from the stack to the array (in addition,
        // our starting position goes into this array as a break position).
        // This array becomes the cache of break positions used by next()
        // and previous(), so this is where we actually refresh the cache.
        cachedBreakPositions = new int[currentBreakPositions.size() + 1];
        cachedBreakPositions[0] = startPos;

        for (int i = 0; i < currentBreakPositions.size(); i++) {
            cachedBreakPositions[i + 1] = ((Integer)currentBreakPositions.elementAt(i)).intValue();
        }
        positionInCache = 0;
    }

    /**
     * The Builder class for DictionaryBasedBreakIterator inherits almost all of
     * its functionality from the Builder class for RuleBasedBreakIterator, but
     * extends it with extra logic to handle the "&lt;dictionary&gt;" token
     */
    protected class Builder extends RuleBasedBreakIterator.Builder {

        /**
         * A CharSet that contains all the characters represented in the dictionary
         */
        private CharSet dictionaryChars = new CharSet();

        /**
         * The substitution text most recently supplied for "&lt;dictionary&gt;";
         * used as the key under which dictionaryChars is registered in
         * mungeExpressionList().
         */
        private String dictionaryExpression = "";

        /**
         * No special initialization
         */
        public Builder() {
            DictionaryBasedBreakIterator.this.super();
        }

        /**
         * We override handleSpecialSubstitution() to add logic to handle
         * the &lt;dictionary&gt; tag.  If we see a substitution named "&lt;dictionary&gt;",
         * parse the substitution expression and store the result in
         * dictionaryChars.
         *
         * @param replace the name of the substitution being defined
         * @param replaceWith the text the substitution expands to
         * @param startPos position passed through to error() and to the inherited handler
         * @param description text passed through to error() and to the inherited handler
         */
        protected void handleSpecialSubstitution(String replace, String replaceWith,
                                                 int startPos, String description) {
            super.handleSpecialSubstitution(replace, replaceWith, startPos, description);

            if (replace.equals("<dictionary>")) {
                if (replaceWith.charAt(0) == '(') {
                    error("Dictionary group can't be enclosed in (", startPos, description);
                }
                dictionaryExpression = replaceWith;
                dictionaryChars = CharSet.parseString(replaceWith);
            }
        }

        /**
         * The other half of the logic to handle the dictionary characters happens here.
         * After the inherited builder has derived the real character categories, we
         * set up the categoryFlags array in the iterator.  This array contains "true"
         * for every character category that includes a dictionary character.
         *
         * @param tempRuleList passed unchanged to the inherited buildCharCategories()
         */
        protected void buildCharCategories(Vector tempRuleList) {
            super.buildCharCategories(tempRuleList);

            categoryFlags = new boolean[categories.size()];
            for (int i = 0; i < categories.size(); i++) {
                CharSet cs = (CharSet)categories.elementAt(i);
                if (!(cs.intersection(dictionaryChars).empty())) {
                    categoryFlags[i] = true;
                }
            }
        }

        // This function is actually called by
        // RuleBasedBreakIterator.buildCharCategories(), which is called
        // by the function above.  This gives us a way to create a separate
        // character category for the dictionary characters even when
        // RuleBasedBreakIterator isn't making a distinction.
        protected void mungeExpressionList(Hashtable expressions) {
            expressions.put(dictionaryExpression, dictionaryChars);
        }
    }
}
上一页 12
💿 文件大小 245 K
👤 上传用户 liu2000dz
📂 所属分类 Java编程
🏷️ 相关标签

#java #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -