📄 rulebasedbreakiterator.java

📁 java源代码请看看啊提点宝贵的意见
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
            return BreakIterator.DONE;        }        // no matter what, we always advance at least one character forward        int result = text.getIndex() + 1;        int lookaheadResult = 0;        // begin in state 1        int state = START_STATE;        int category;        char c = text.current();        // loop until we reach the end of the text or transition to state 0        while (c != CharacterIterator.DONE && state != STOP_STATE) {            // look up the current character's character category (which tells us            // which column in the state table to look at)            category = lookupCategory(c);            // if the character isn't an ignore character, look up a state            // transition in the state table            if (category != IGNORE) {                state = lookupState(state, category);            }            // if the state we've just transitioned to is a lookahead state,            // (but not also an end state), save its position.  If it's            // both a lookahead state and an end state, update the break position            // to the last saved lookup-state position            if (lookaheadStates[state]) {                if (endStates[state]) {                    result = lookaheadResult;                }                else {                    lookaheadResult = text.getIndex() + 1;                }            }            // otherwise, if the state we've just transitioned to is an accepting            // state, update the break position to be the current iteration position            else {                if (endStates[state]) {                    result = text.getIndex() + 1;                }            }            c = text.next();        }        // if we've run off the end of the text, and the very last character took us into        // a lookahead state, advance the break position to the lookahead position        // (the theory here is that if there are no characters at all after the lookahead        // 
position, that always matches the lookahead criteria)        if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {            result = lookaheadResult;        }        text.setIndex(result);        return result;    }    /**     * This method backs the iterator back up to a "safe position" in the text.     * This is a position that we know, without any context, must be a break position.     * The various calling methods then iterate forward from this safe position to     * the appropriate position to return.  (For more information, see the description     * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)     */    protected int handlePrevious() {        CharacterIterator text = getText();        int state = START_STATE;        int category = 0;        int lastCategory = 0;        char c = text.current();        // loop until we reach the beginning of the text or transition to state 0        while (c != CharacterIterator.DONE && state != STOP_STATE) {            // save the last character's category and look up the current            // character's category            lastCategory = category;            category = lookupCategory(c);            // if the current character isn't an ignore character, look up a            // state transition in the backwards state table            if (category != IGNORE) {                state = lookupBackwardState(state, category);            }            // then advance one character backwards            c = text.previous();        }        // if we didn't march off the beginning of the text, we're either one or two        // positions away from the real break position.  (One because of the call to        // previous() at the end of the loop above, and another because the character        // that takes us into the stop state will always be the character BEFORE        // the break position.)        
if (c != CharacterIterator.DONE) {            if (lastCategory != IGNORE) {                text.setIndex(text.getIndex() + 2);            }            else {                text.next();            }        }        return text.getIndex();    }    /**     * Looks up a character's category (i.e., its category for breaking purposes,     * not its Unicode category)     */    protected int lookupCategory(char c) {        return charCategoryTable.elementAt(c);    }    /**     * Given a current state and a character category, looks up the     * next state to transition to in the state table.     */    protected int lookupState(int state, int category) {        return stateTable[state * numCategories + category];    }    /**     * Given a current state and a character category, looks up the     * next state to transition to in the backwards state table.     */    protected int lookupBackwardState(int state, int category) {        return backwardsStateTable[state * numCategories + category];    }    //=======================================================================    // RuleBasedBreakIterator.Builder    //=======================================================================    /**     * The Builder class has the job of constructing a RuleBasedBreakIterator from a     * textual description.  A Builder is constructed by RuleBasedBreakIterator's     * constructor, which uses it to construct the iterator itself and then throws it     * away.     * <p>The construction logic is separated out into its own class for two primary     * reasons:     * <ul><li>The construction logic is quite sophisticated and large.  Separating it     * out into its own class means the code must only be loaded into memory while a     * RuleBasedBreakIterator is being constructed, and can be purged after that.     * <li>There is a fair amount of state that must be maintained throughout the     * construction process that is not needed by the iterator after construction.     
* Separating this state out into another class prevents all of the functions that
     * construct the iterator from having to have really long parameter lists,
     * (hopefully) contributing to readability and maintainability.</ul>
     * <p>It'd be really nice if this could be an independent class rather than an
     * inner class, because that would shorten the source file considerably, but
     * making Builder an inner class of RuleBasedBreakIterator allows it direct access
     * to RuleBasedBreakIterator's private members, which saves us from having to
     * provide some kind of "back door" to the Builder class that could then also be
     * used by other classes.
     */
    protected class Builder {
        /**
         * A temporary holding place used for calculating the character categories.
         * This object contains CharSet objects.
         */
        protected Vector categories = null;

        /**
         * A table used to map parts of regexp text to lists of character categories,
         * rather than having to figure them out from scratch each time.
         */
        protected Hashtable expressions = null;

        /**
         * A temporary holding place for the list of ignore characters.
         */
        protected CharSet ignoreChars = null;

        /**
         * A temporary holding place where the forward state table is built.
         */
        protected Vector tempStateTable = null;

        /**
         * A list of all the states that have to be filled in with transitions to the
         * next state that is created.  Used when building the state table from the
         * regular expressions.
         */
        protected Vector decisionPointList = null;

        /**
         * A stack for holding decision point lists.  This is used to handle nested
         * parentheses and braces in regexps.
         */
        protected Stack decisionPointStack = null;

        /**
         * A list of states that loop back on themselves.  Used to handle .*?
         */
        protected Vector loopingStates = null;

        /**
         * Looping states actually have to be backfilled later in the process
         * than everything else.  This is where the list of states to backfill
         * is accumulated.  This is also used to handle .*?
         */
        protected Vector statesToBackfill = null;

        /**
         * A list mapping pairs of state numbers for states that are to be combined
         * to the state number of the state representing their combination.  Used
         * in the process of making the state table deterministic to prevent
         * infinite recursion.
         */
        protected Vector mergeList = null;

        /**
         * A flag that is used to indicate when the list of looping states can
         * be reset.
         */
        protected boolean clearLoopingStates = false;

        /**
         * A bit mask used to indicate a bit in the table's flags column that marks a
         * state as an accepting state.
         */
        protected static final int END_STATE_FLAG = 0x8000;

        /**
         * A bit mask used to indicate a bit in the table's flags column that marks a
         * state as one the builder shouldn't loop to any looping states.
         */
        protected static final int DONT_LOOP_FLAG = 0x4000;

        /**
         * A bit mask used to indicate a bit in the table's flags column that marks a
         * state as a lookahead state.
         */
        protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;

        /**
         * A bit mask representing the union of the mask values listed above.
         * Used for clearing or masking off the flag bits.
         */
        protected static final int ALL_FLAGS = END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
                | DONT_LOOP_FLAG;

        /**
         * No special construction is required for the Builder.
         */
        public Builder() {
        }

        /**
         * This is the main function for setting up the BreakIterator's tables.  It
         * just vectors different parts of the job off to other functions: the rule
         * list is built first from <code>description</code>, and that same list is
         * then handed, in order, to the functions that build the character
         * categories, the forward state table, and the backwards state table.
         */
        public void buildBreakIterator() {
            Vector tempRuleList = buildRuleList(description);
            buildCharCategories(tempRuleList);
            buildStateTable(tempRuleList);
            buildBackwardsStateTable(tempRuleList);
        }

        /**
         * This function has three main purposes:
         * <ul><li>Perform general syntax checking on the description, so the rest of the
         * build code can assume that it's parsing a legal description.
         * <li>Split the description into separate rules
         * <li>Perform variable-name substitutions (so that no one else sees variable names)
         * </ul>
         */
        private Vector buildRuleList(String description) {
            // invariants:
            // - parentheses must be balanced: ()[]{}<>
            // - nothing can be nested inside <>
            // - nothing can be nested inside [] except more []s
            // - pairs of ()[]{}<> must not be empty
            // - ; can only occur at the outer level
            // - | can only appear inside ()
            // - only one = or / can occur in a single rule
            // - = and / cannot both occur in the same rule
            // - <> can only occur on the left side of a = expression
            //   (because we'll perform substitutions to eliminate them other places)
            // - the left-hand side of a = expression can only be a single character
            //   (possibly with \) or text inside <>
            // - the right-hand side of a = expression must be enclosed in [] or ()
            // - * may not occur at the beginning of a rule, nor may it follow
            //   =, /, (, (, |, }, ;, or *
            // - ? may only follow *
            // - the rule list must contain at least one / rule
            // - no rule may be empty
            // - all printing characters in the ASCII range except letters and digits
            //   are reserved and must be preceded by \
            // - ! may only occur at the beginning of a rule
💿 文件大小 245 K
👤 上传用户 liu2000dz
📂 所属分类 Java编程
🏷️ 相关标签

#java #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -