📄 tokentowords.java
字号:
} else { String pName = (String) tokenItem.findFeature("p.name"); String nName = (String) tokenItem.findFeature("n.name"); char p0 = pName.charAt(0); char n0 = nName.charAt(0); if (isUppercaseLetter(p0) && isLowercaseLetter(n0)) { wordRelation.addWord(street); } else if (NumberExpander.isDigit(p0) && isLowercaseLetter(n0)) { wordRelation.addWord(street); } else if (isLowercaseLetter(p0) && isUppercaseLetter(n0)) { wordRelation.addWord(saint); } else { String whitespace = (String) tokenItem.findFeature("n.whitespace"); if (whitespace.equals(" ")) { wordRelation.addWord(saint); } else { wordRelation.addWord(street); } } } if (punctuation != null && punctuation.equals(".")) { featureSet.setString("punc", ""); } } /** * Converts US money string into (word) Items in the WordRelation. * * @param tokenVal the US money string */ private void usMoneyToWords(String tokenVal) { int dotIndex = tokenVal.indexOf('.'); if (matches(illionPattern, (String) tokenItem.findFeature("n.name"))) { NumberExpander.expandReal(tokenVal.substring(1), wordRelation); } else if (dotIndex == -1) { String aaa = tokenVal.substring(1); tokenToWords(aaa); if (aaa.equals("1")) { wordRelation.addWord("dollar"); } else { wordRelation.addWord("dollars"); } } else if (dotIndex == (tokenVal.length() - 1) || (tokenVal.length() - dotIndex) > 3) { /* simply read as mumble point mumble */ NumberExpander.expandReal(tokenVal.substring(1), wordRelation); wordRelation.addWord("dollars"); } else { String aaa = tokenVal.substring(1, dotIndex); aaa = Utilities.deleteChar(aaa, ','); String bbb = tokenVal.substring(dotIndex+1); NumberExpander.expandNumber(aaa, wordRelation); if (aaa.equals("1")) { wordRelation.addWord("dollar"); } else { wordRelation.addWord("dollars"); } if (bbb.equals("00")) { // add nothing to the word list } else { NumberExpander.expandNumber(bbb, wordRelation); if (bbb.equals("01")) { wordRelation.addWord("cent"); } else { wordRelation.addWord("cents"); } } } } /** * Convert the given apostrophed word into (word) Items in the Word * Relation. * * @param tokenVal the apostrophed word string */ private void postropheToWords(String tokenVal) { int index = tokenVal.indexOf('\''); String bbb = tokenVal.substring(index).toLowerCase(); if (inStringArray(bbb, postrophes)) { String aaa = tokenVal.substring(0, index); tokenToWords(aaa); wordRelation.addWord(bbb); } else if (bbb.equals("'tve")) { String aaa = tokenVal.substring(0, index-2); tokenToWords(aaa); wordRelation.addWord("'ve"); } else { /* internal single quote deleted */ StringBuffer buffer = new StringBuffer(tokenVal); buffer.deleteCharAt(index); tokenToWords(buffer.toString()); } } /** * Convert the given digits/digits string into word (Items) in the * WordRelation. * * @param tokenVal the digits/digits string */ private void digitsSlashDigitsToWords(String tokenVal) { /* might be fraction, or not */ int index = tokenVal.indexOf('/'); String aaa = tokenVal.substring(0, index); String bbb = tokenVal.substring(index+1); int a, b; // if the previous token is a number, add an "and" if (matches(digitsPattern, (String) tokenItem.findFeature("p.name")) && tokenItem.getPrevious() != null) { wordRelation.addWord("and"); } if (aaa.equals("1") && bbb.equals("2")) { wordRelation.addWord("a"); wordRelation.addWord("half"); } else if ((a = Integer.parseInt(aaa)) < (b = Integer.parseInt(bbb))) { NumberExpander.expandNumber(aaa, wordRelation); NumberExpander.expandOrdinal(bbb, wordRelation); if (a > 1) { wordRelation.addWord("'s"); } } else { NumberExpander.expandNumber(aaa, wordRelation); wordRelation.addWord("slash"); NumberExpander.expandNumber(bbb, wordRelation); } } /** * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items * in the WordRelation. * * @param tokenVal the dashed string */ private void dashToWords(String tokenVal) { int index = tokenVal.indexOf('-'); String aaa = tokenVal.substring(0, index); String bbb = tokenVal.substring(index+1, tokenVal.length()); if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) { FeatureSet featureSet = tokenItem.getFeatures(); featureSet.setString("name", aaa); tokenToWords(aaa); wordRelation.addWord("to"); featureSet.setString("name", bbb); tokenToWords(bbb); featureSet.setString("name", ""); } else { tokenToWords(aaa); tokenToWords(bbb); } } /** * Convert the given string (which does not only consist of alphabet) * into (word) Items in the WordRelation. * * @param tokenVal the string */ private void notJustAlphasToWords(String tokenVal) { /* its not just alphas */ int index = 0; int tokenLength = tokenVal.length(); for (; index < tokenLength; index++) { if (isTextSplitable(tokenVal, index)) { break; } } String aaa = tokenVal.substring(0, index+1); String bbb = tokenVal.substring(index+1, tokenLength); FeatureSet featureSet = tokenItem.getFeatures(); featureSet.setString("nsw", "nide"); tokenToWords(aaa); tokenToWords(bbb); } /** * Returns true if the given word is pronounceable. * This method is originally called us_aswd() in Flite 1.1. * * @param word the word to test * * @return true if the word is pronounceable, false otherwise */ public boolean isPronounceable(String word) { String lowerCaseWord = word.toLowerCase(); return (prefixFSM.accept(lowerCaseWord) && suffixFSM.accept(lowerCaseWord)); } /** * Returns true if the given token is the name of a US state. * If it is, it will add the name of the state to (word) Items in the * WordRelation. * * @param tokenVal the token string */ private boolean isStateName(String tokenVal) { String[] state = (String[]) usStatesHash.get(tokenVal); if (state != null) { boolean expandState = false; // check to see if the state initials are ambiguous // in the English language if (state[1].equals("ambiguous")) { String previous = (String) tokenItem.findFeature("p.name"); String next = (String) tokenItem.findFeature("n.name"); // System.out.println("previous = " + previous); // System.out.println("next = " + next); int nextLength = next.length(); FeatureSet featureSet = tokenItem.getFeatures(); // check if the previous word starts with a capital letter, // is at least 3 letters long, is an alphabet sequence, // and has a comma. boolean previousIsCity = (isUppercaseLetter(previous.charAt(0)) && previous.length() > 2 && matches(alphabetPattern, previous) && tokenItem.findFeature("p.punc").equals(",")); // check if next token starts with a lower case, or // this is the end of sentence, or if next token // is a period (".") or a zip code (5 or 10 digits). boolean nextIsGood = (isLowercaseLetter(next.charAt(0)) || tokenItem.getNext() == null || featureSet.getString("punc").equals(".") || ((nextLength == 5 || nextLength == 10) && matches(digitsPattern, next))); if (previousIsCity && nextIsGood) { expandState = true; } else { expandState = false; } } else { expandState = true; } if (expandState) { for (int j = 2; j < state.length; j++) { if (state[j] != null) { wordRelation.addWord(state[j]); } } return true; } } return false; } /** * Determines if the given input matches the given Pattern. * * @param pattern the pattern to match * @param input the string to test * * @return <code>true</code> if the input string matches the given Pattern; * <code>false</code> otherwise */ private static boolean matches(Pattern pattern, String input) { Matcher m = pattern.matcher(input); return m.matches(); } /** * Determines if the character at the given position of the given * input text is splittable. A character is splittable if: * <p> * 1) the character and the following character are not letters * in the English alphabet (A-Z and a-z) * <p> * 2) the character and the following character are not digits (0-9) * <p> * @param text the text containing the character of interest * @param index the index of the character of interest * * @return true if the position of the given text is splittable * false otherwise */ private static boolean isTextSplitable(String text, int index) { char c0 = text.charAt(index); char c1 = text.charAt(index+1); if (isLetter(c0) && isLetter(c1)) { return false; } else if (NumberExpander.isDigit(c0) && NumberExpander.isDigit(c1)) { return false; } else { return true; } } /** * Returns true if the given character is a letter (a-z or A-Z). * * @param ch the character to test * * @return true or false */ private static boolean isLetter(char ch) { return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')); } /** * Returns true if the given character is an uppercase letter (A-Z). * * @param ch the character to test * * @return true or false */ private static boolean isUppercaseLetter(char ch) { return ('A' <= ch && ch <= 'Z'); } /** * Returns true if the given character is a lowercase letter (a-z). * * @param ch the character to test * * @return true or false */ private static boolean isLowercaseLetter(char ch) { return ('a' <= ch && ch <= 'z'); } /** * Converts this object to its String representation * * @return the string representation of this object */ public String toString() { return "TokenToWords"; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -