📄 rulebasedbreakiterator.java
字号:
return BreakIterator.DONE; } // no matter what, we always advance at least one character forward int result = text.getIndex() + 1; int lookaheadResult = 0; // begin in state 1 int state = START_STATE; int category; char c = text.current(); // loop until we reach the end of the text or transition to state 0 while (c != CharacterIterator.DONE && state != STOP_STATE) { // look up the current character's character category (which tells us // which column in the state table to look at) category = lookupCategory(c); // if the character isn't an ignore character, look up a state // transition in the state table if (category != IGNORE) { state = lookupState(state, category); } // if the state we've just transitioned to is a lookahead state, // (but not also an end state), save its position. If it's // both a lookahead state and an end state, update the break position // to the last saved lookup-state position if (lookaheadStates[state]) { if (endStates[state]) { result = lookaheadResult; } else { lookaheadResult = text.getIndex() + 1; } } // otherwise, if the state we've just transitioned to is an accepting // state, update the break position to be the current iteration position else { if (endStates[state]) { result = text.getIndex() + 1; } } c = text.next(); } // if we've run off the end of the text, and the very last character took us into // a lookahead state, advance the break position to the lookahead position // (the theory here is that if there are no characters at all after the lookahead // position, that always matches the lookahead criteria) if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) { result = lookaheadResult; } text.setIndex(result); return result; } /** * This method backs the iterator back up to a "safe position" in the text. * This is a position that we know, without any context, must be a break position. * The various calling methods then iterate forward from this safe position to * the appropriate position to return. 
(For more information, see the description * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) */ protected int handlePrevious() { CharacterIterator text = getText(); int state = START_STATE; int category = 0; int lastCategory = 0; char c = text.current(); // loop until we reach the beginning of the text or transition to state 0 while (c != CharacterIterator.DONE && state != STOP_STATE) { // save the last character's category and look up the current // character's category lastCategory = category; category = lookupCategory(c); // if the current character isn't an ignore character, look up a // state transition in the backwards state table if (category != IGNORE) { state = lookupBackwardState(state, category); } // then advance one character backwards c = text.previous(); } // if we didn't march off the beginning of the text, we're either one or two // positions away from the real break position. (One because of the call to // previous() at the end of the loop above, and another because the character // that takes us into the stop state will always be the character BEFORE // the break position.) if (c != CharacterIterator.DONE) { if (lastCategory != IGNORE) { text.setIndex(text.getIndex() + 2); } else { text.next(); } } return text.getIndex(); } /** * Looks up a character's category (i.e., its category for breaking purposes, * not its Unicode category) */ protected int lookupCategory(char c) { return charCategoryTable.elementAt(c); } /** * Given a current state and a character category, looks up the * next state to transition to in the state table. */ protected int lookupState(int state, int category) { return stateTable[state * numCategories + category]; } /** * Given a current state and a character category, looks up the * next state to transition to in the backwards state table. 
*/ protected int lookupBackwardState(int state, int category) { return backwardsStateTable[state * numCategories + category]; } //======================================================================= // RuleBasedBreakIterator.Builder //======================================================================= /** * The Builder class has the job of constructing a RuleBasedBreakIterator from a * textual description. A Builder is constructed by RuleBasedBreakIterator's * constructor, which uses it to construct the iterator itself and then throws it * away. * <p>The construction logic is separated out into its own class for two primary * reasons: * <ul><li>The construction logic is quite sophisticated and large. Separating it * out into its own class means the code must only be loaded into memory while a * RuleBasedBreakIterator is being constructed, and can be purged after that. * <li>There is a fair amount of state that must be maintained throughout the * construction process that is not needed by the iterator after construction. * Separating this state out into another class prevents all of the functions that * construct the iterator from having to have really long parameter lists, * (hopefully) contributing to readability and maintainability.</ul> * <p>It'd be really nice if this could be an independent class rather than an * inner class, because that would shorten the source file considerably, but * making Builder an inner class of RuleBasedBreakIterator allows it direct access * to RuleBasedBreakIterator's private members, which saves us from having to * provide some kind of "back door" to the Builder class that could then also be * used by other classes. */ protected class Builder { /** * A temporary holding place used for calculating the character categories. * This object contains CharSet objects. 
*/
        protected Vector categories = null;

        /**
         * A table used to map parts of regexp text to lists of character categories,
         * rather than having to figure them out from scratch each time.
         * Starts out null.
         * NOTE(review): raw Hashtable; the key and value types are not visible in
         * this part of the file -- confirm before adding type parameters.
         */
        protected Hashtable expressions = null;

        /**
         * A temporary holding place for the list of ignore characters.
         * Starts out null.
         */
        protected CharSet ignoreChars = null;

        /**
         * A temporary holding place where the forward state table is built.
         * Starts out null.
         */
        protected Vector tempStateTable = null;

        /**
         * A list of all the states that have to be filled in with transitions to the
         * next state that is created.  Used when building the state table from the
         * regular expressions.  Starts out null.
         */
        protected Vector decisionPointList = null;

        /**
         * A stack for holding decision point lists.  This is used to handle nested
         * parentheses and braces in regexps.  Starts out null.
         */
        protected Stack decisionPointStack = null;

        /**
         * A list of states that loop back on themselves.  Used to handle .*?
         * Starts out null.
         */
        protected Vector loopingStates = null;

        /**
         * Looping states actually have to be backfilled later in the process
         * than everything else.  This is where the list of states to backfill
         * is accumulated.  This is also used to handle .*?
         * Starts out null.
         */
        protected Vector statesToBackfill = null;

        /**
         * A list mapping pairs of state numbers for states that are to be combined
         * to the state number of the state representing their combination.  Used
         * in the process of making the state table deterministic to prevent
         * infinite recursion.  Starts out null.
         */
        protected Vector mergeList = null;

        /**
         * A flag that is used to indicate when the list of looping states can
         * be reset.  Starts out false.
         */
        protected boolean clearLoopingStates = false;

        /**
         * A bit mask used to indicate a bit in the table's flags column that marks a
         * state as an accepting state.
*/
        protected static final int END_STATE_FLAG = 0x8000;

        /**
         * Flag bit (0x4000) in the table's flags column marking a state as one
         * the builder shouldn't loop to any looping states.
         */
        protected static final int DONT_LOOP_FLAG = 0x4000;

        /**
         * Flag bit (0x2000) in the table's flags column marking a state as a
         * lookahead state.
         */
        protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;

        /**
         * Union of the three flag bits (0xE000).  Used for clearing or masking
         * off the flag bits.
         */
        protected static final int ALL_FLAGS =
                END_STATE_FLAG | DONT_LOOP_FLAG | LOOKAHEAD_STATE_FLAG;

        /**
         * Creates a Builder.  There is nothing to set up at construction time.
         */
        public Builder() {
            // intentionally empty
        }

        /**
         * Entry point for setting up the BreakIterator's tables.  The enclosing
         * iterator's description is first turned into a rule list, which is then
         * handed, in order, to the character-category builder, the forward
         * state-table builder and the backwards state-table builder.
         */
        public void buildBreakIterator() {
            final Vector rules = buildRuleList(description);

            buildCharCategories(rules);
            buildStateTable(rules);
            buildBackwardsStateTable(rules);
        }

        /**
         * This function has three main purposes:
         * <ul><li>Perform general syntax checking on the description, so the rest of the
         * build code can assume that it's parsing a legal description.
* <li>Split the description into separate rules * <li>Perform variable-name substitutions (so that no one else sees variable names) * </ul> */ private Vector buildRuleList(String description) { // invariants: // - parentheses must be balanced: ()[]{}<> // - nothing can be nested inside <> // - nothing can be nested inside [] except more []s // - pairs of ()[]{}<> must not be empty // - ; can only occur at the outer level // - | can only appear inside () // - only one = or / can occur in a single rule // - = and / cannot both occur in the same rule // - <> can only occur on the left side of a = expression // (because we'll perform substitutions to eliminate them other places) // - the left-hand side of a = expression can only be a single character // (possibly with \) or text inside <> // - the right-hand side of a = expression must be enclosed in [] or () // - * may not occur at the beginning of a rule, nor may it follow // =, /, (, (, |, }, ;, or * // - ? may only follow * // - the rule list must contain at least one / rule // - no rule may be empty // - all printing characters in the ASCII range except letters and digits // are reserved and must be preceded by \ // - ! may only occur at the beginning of a rule
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -