📄 rulebasedbreakiterator.java
字号:
if (replaceWith.charAt(0) == '(') { error("Ignore group can't be enclosed in (", startPos, description); } ignoreChars = CharSet.parseString(replaceWith); } } /** * This function builds the character category table. On entry, * tempRuleList is a vector of break rules that has had variable names substituted. * On exit, the charCategoryTable data member has been initialized to hold the * character category table, and tempRuleList's rules have been munged to contain * character category numbers everywhere a literal character or a [] expression * originally occurred. */ protected void buildCharCategories(Vector tempRuleList) { int bracketLevel = 0; int p = 0; int lineNum = 0; // build hash table of every literal character or [] expression in the rule list // and use CharSet.parseString() to derive a CharSet object representing the // characters each refers to expressions = new Hashtable(); while (lineNum < tempRuleList.size()) { String line = (String)(tempRuleList.elementAt(lineNum)); p = 0; while (p < line.length()) { char c = line.charAt(p); switch (c) { // skip over all syntax characters except [ case '{': case '}': case '(': case ')': case '*': case '.': case '/': case '|': case ';': case '?': case '!': break; // for [, find the matching ] (taking nested [] pairs into account) // and add the whole expression to the expression list case '[': int q = p + 1; ++bracketLevel; while (q < line.length() && bracketLevel != 0) { c = line.charAt(q); switch (c) { case '\\': q++; break; case '[': ++bracketLevel; break; case ']': --bracketLevel; break; } ++q; } if (expressions.get(line.substring(p, q)) == null) { expressions.put(line.substring(p, q), CharSet.parseString(line. substring(p, q))); } p = q - 1; break; // for \ sequences, just move to the next character and treat // it as a single character case '\\': ++p; c = line.charAt(p); // DON'T break; fall through into "default" clause // for an isolated single character, add it to the expression list default: expressions.put(line.substring(p, p + 1), CharSet.parseString(line. substring(p, p + 1))); break; } ++p; } ++lineNum; } // dump CharSet's internal expression cache CharSet.releaseExpressionCache(); // create the temporary category table (which is a vector of CharSet objects) categories = new Vector(); if (ignoreChars != null) { categories.addElement(ignoreChars); } else { categories.addElement(new CharSet()); } ignoreChars = null; // this is a hook to allow subclasses to add categories on their own mungeExpressionList(expressions); // Derive the character categories. Go through the existing character categories // looking for overlap. Any time there's overlap, we create a new character // category for the characters that overlapped and remove them from their original // category. At the end, any characters that are left in the expression haven't // been mentioned in any category, so another new category is created for them. // For example, if the first expression is [abc], then a, b, and c will be placed // into a single character category. If the next expression is [bcd], we will first // remove b and c from their existing category (leaving a behind), create a new // category for b and c, and then create another new category for d (which hadn't // been mentioned in the previous expression). // At no time should a character ever occur in more than one character category. // for each expression in the expressions list, do... Enumeration iter = expressions.elements(); while (iter.hasMoreElements()) { // initialize the working char set to the chars in the current expression CharSet e = (CharSet)iter.nextElement(); // for each category in the category list, do... for (int j = categories.size() - 1; !e.empty() && j > 0; j--) { // if there's overlap between the current working set of chars // and the current category... CharSet that = (CharSet)(categories.elementAt(j)); if (!that.intersection(e).empty()) { // add a new category for the characters that were in the // current category but not in the working char set CharSet temp = that.difference(e); if (!temp.empty()) { categories.addElement(temp); } // remove those characters from the working char set and replace // the current category with the characters that it did // have in common with the current working char set temp = e.intersection(that); e = e.difference(that); if (!temp.equals(that)) { categories.setElementAt(temp, j); } } } // if there are still characters left in the working char set, // add a new category containing them if (!e.empty()) { categories.addElement(e); } } // we have the ignore characters stored in position 0. Make an extra pass through // the character category list and remove anything from the ignore list that shows // up in some other category CharSet allChars = new CharSet(); for (int i = 1; i < categories.size(); i++) allChars = allChars.union((CharSet)(categories.elementAt(i))); CharSet ignoreChars = (CharSet)(categories.elementAt(0)); ignoreChars = ignoreChars.difference(allChars); categories.setElementAt(ignoreChars, 0); // now that we've derived the character categories, go back through the expression // list and replace each CharSet object with a String that represents the // character categories that expression refers to. The String is encoded: each // character is a character category number (plus 0x100 to avoid confusing them // with syntax characters in the rule grammar) iter = expressions.keys(); while (iter.hasMoreElements()) { String key = (String)iter.nextElement(); CharSet cs = (CharSet)expressions.get(key); StringBuffer cats = new StringBuffer(); // for each category... for (int j = 0; j < categories.size(); j++) { // if the current expression contains characters in that category... CharSet temp = cs.intersection((CharSet)(categories.elementAt(j))); if (!temp.empty()) { // then add the encoded category number to the String for this // expression cats.append((char)(0x100 + j)); if (temp.equals(cs)) { break; } } } // once we've finished building the encoded String for this expression, // replace the CharSet object with it expressions.put(key, cats.toString()); } // and finally, we turn the temporary category table into a permanent category // table, which is a CompactByteArray. (we skip category 0, which by definition // refers to all characters not mentioned specifically in the rules) charCategoryTable = new CompactByteArray((byte)0); // for each category... for (int i = 0; i < categories.size(); i++) { CharSet chars = (CharSet)(categories.elementAt(i)); // go through the character ranges in the category one by one... Enumeration enum = chars.getChars(); while (enum.hasMoreElements()) { char[] range = (char[])(enum.nextElement()); // and set the corresponding elements in the CompactArray accordingly if (i != 0) { charCategoryTable.setElementAt(range[0], range[1], (byte)i); } // (category 0 is special-- it's the hiding place for the ignore // characters, whose real category number in the CompactArray is // -1 [this is because category 0 contains all characters not // specifically mentioned anywhere in the rules] ) else { charCategoryTable.setElementAt(range[0], range[1], IGNORE); } } } // once we've populated the CompactArray, compact it charCategoryTable.compact(); // initialize numCategories numCategories = categories.size(); } protected void mungeExpressionList(Hashtable expressions) { // empty in the parent class. This function provides a hook for subclasses // to mess with the character category table. } /** * This is the function that builds the forward state table. Most of the real * work is done in parseRule(), which is called once for each rule in the * description. */ private void buildStateTable(Vector tempRuleList) { // initialize our temporary state table, and fill it with two states: // state 0 is a dummy state that allows state 1 to be the starting state // and 0 to represent "stop". State 1 is added here to seed things // before we start parsing tempStateTable = new Vector(); tempStateTable.addElement(new short[numCategories + 1]); tempStateTable.addElement(new short[numCategories + 1]); // call parseRule() for every rule in the rule list (except those which // start with !, which are actually backwards-iteration rules) for (int i = 0; i < tempRuleList.size(); i++) { String rule = (String)tempRuleList.elementAt(i); if (rule.charAt(0) != '!') { parseRule(rule, true); } } // finally, use finishBuildingStateTable() to minimize the number of // states in the table and perform some other cleanup work finishBuildingStateTable(true); } /** * This is where most of the work really happens. This routine parses a single * rule in the rule description, adding and modifying states in the state * table according to the new expression. The state table is kept deterministic * throughout the whole operation, although some ugly postprocessing is needed * to handle the *? token. */ private void parseRule(String rule, boolean forward) { // algorithm notes: // - The basic idea here is to read successive character-category groups // from the input string. For each group, you create a state and point // the appropriate entries in the previous state to it. This produces a // straight line from the start state to the end state. The {}, *, and (|) // idioms produce branches in this straight line. These branches (states // that can transition to more than one other state) are called "decision // points." A list of decision points is kept. This contains a list of // all states that can transition to the next state to be created. For a // straight line progression, the only thing in the decision-point list is // the current state. But if there's a branch, the decision-point list // will contain all of the beginning points of the branch when the next // state to be created represents the end point of the branch. A stack is // used to save decision point lists in the presence of nested parentheses // and the like. For example, when a { is encountered, the current decision // point list is saved on the stack and restored when the corresponding } // is encountered. This way, after the } is read, the decision point list // will contain both the state right before the } _and_ the state before // the whole {} expression. Both of these states can transition to the next // state after the {} expression. // - one complication arises when we have to stamp a transition value into // an array cell that already contains one. The updateStateTable() and // mergeStates() functions handle this case. Their basic approach is to // create a new state that combines the two states that conflict and point // at it when necessary. This happens recursively, so if the merged states // also conflict, they're resolved in the same way, and so on. There are // a number of tests aimed at preventing infinite recursion. // - another complication arises with repeating characters. It's somewhat // ambiguous whether the user wants
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -