📄 tokentowords.java
字号:
/** * Portions Copyright 2001-2003 Sun Microsystems, Inc. * Portions Copyright 1999-2001 Language Technologies Institute, * Carnegie Mellon University. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. */package com.sun.speech.freetts.en.us;import java.io.*;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.LinkedList;import java.util.regex.Pattern;import java.util.regex.Matcher;import com.sun.speech.freetts.FeatureSet;import com.sun.speech.freetts.FeatureSetImpl;import com.sun.speech.freetts.Item;import com.sun.speech.freetts.PathExtractor;import com.sun.speech.freetts.PathExtractorImpl;import com.sun.speech.freetts.ProcessException;import com.sun.speech.freetts.Relation;import com.sun.speech.freetts.Utterance;import com.sun.speech.freetts.UtteranceProcessor;import com.sun.speech.freetts.cart.CART;import com.sun.speech.freetts.util.Utilities;/** * Converts the Tokens (in US English words) in an * Utterance into a list of words. It puts the produced list back * into the Utterance. Usually, the tokens that gets expanded are numbers * like "23" (to "twenty" "three"). 
* <p> * It translates the following code from flite: * <br> * <code> * lang/usenglish/us_text.c * </code> */public class TokenToWords implements UtteranceProcessor { /** Regular expression for something that has a vowel */ private static final String RX_HAS_VOWEL = ".*[aeiouAEIOU].*"; // Patterns for regular expression matching private static final Pattern alphabetPattern; private static final Pattern commaIntPattern; private static final Pattern digits2DashPattern; private static final Pattern digitsPattern; private static final Pattern digitsSlashDigitsPattern; private static final Pattern dottedAbbrevPattern; private static final Pattern doublePattern; private static final Pattern drStPattern; private static final Pattern fourDigitsPattern; private static final Pattern hasVowelPattern; private static final Pattern illionPattern; private static final Pattern numberTimePattern; private static final Pattern numessPattern; private static final Pattern ordinalPattern; private static final Pattern romanNumbersPattern; private static final Pattern sevenPhoneNumberPattern; private static final Pattern threeDigitsPattern; private static final Pattern usMoneyPattern; static { alphabetPattern = Pattern.compile(USEnglish.RX_ALPHABET); commaIntPattern = Pattern.compile(USEnglish.RX_COMMAINT); digits2DashPattern = Pattern.compile(USEnglish.RX_DIGITS2DASH); digitsPattern = Pattern.compile(USEnglish.RX_DIGITS); digitsSlashDigitsPattern = Pattern.compile(USEnglish.RX_DIGITSSLASHDIGITS); dottedAbbrevPattern = Pattern.compile(USEnglish.RX_DOTTED_ABBREV); doublePattern = Pattern.compile(USEnglish.RX_DOUBLE); drStPattern = Pattern.compile(USEnglish.RX_DRST); fourDigitsPattern = Pattern.compile(USEnglish.RX_FOUR_DIGIT); hasVowelPattern = Pattern.compile(USEnglish.RX_HAS_VOWEL); illionPattern = Pattern.compile(USEnglish.RX_ILLION); numberTimePattern = Pattern.compile(USEnglish.RX_NUMBER_TIME); numessPattern = Pattern.compile(USEnglish.RX_NUMESS); ordinalPattern = 
Pattern.compile(USEnglish.RX_ORDINAL_NUMBER); romanNumbersPattern = Pattern.compile(USEnglish.RX_ROMAN_NUMBER); sevenPhoneNumberPattern = Pattern.compile(USEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER); threeDigitsPattern = Pattern.compile(USEnglish.RX_THREE_DIGIT); usMoneyPattern = Pattern.compile(USEnglish.RX_US_MONEY); } // King-like words private static final String[] kingNames = { "louis", "henry", "charles", "philip", "george", "edward", "pius", "william", "richard", "ptolemy", "john", "paul", "peter", "nicholas", "frederick", "james", "alfonso", "ivan", "napoleon", "leo", "gregory", "catherine", "alexandria", "pierre", "elizabeth", "mary" }; private static final String[] kingTitles = { "king", "queen", "pope", "duke", "tsar", "emperor", "shah", "caesar", "duchess", "tsarina", "empress", "baron", "baroness", "sultan", "count", "countess" }; // Section-like words private static final String[] sectionTypes = { "section", "chapter", "part", "phrase", "verse", "scene", "act", "book", "volume", "chap", "war", "apollo", "trek", "fortran" }; /** * Here we use a hashtable for constant time matching, instead of using * if (A.equals(B) || A.equals(C) || ...) 
to match Strings */ private static Hashtable kingSectionLikeHash = new Hashtable(); private static final String KING_NAMES = "kingNames"; private static final String KING_TITLES = "kingTitles"; private static final String SECTION_TYPES = "sectionTypes"; // Hashtable initialization static { for (int i = 0; i < kingNames.length; i++) { kingSectionLikeHash.put(kingNames[i], KING_NAMES); } for (int i = 0; i < kingTitles.length; i++) { kingSectionLikeHash.put(kingTitles[i], KING_TITLES); } for (int i = 0; i < sectionTypes.length; i++) { kingSectionLikeHash.put(sectionTypes[i], SECTION_TYPES); } } private static final String[] postrophes = { "'s", "'ll", "'ve", "'d" }; // Finite state machines to check if a Token is pronounceable private PronounceableFSM prefixFSM = null; private PronounceableFSM suffixFSM = null; // List of US states abbreviations and their full names private static final String[][] usStates = { { "AL", "ambiguous", "alabama" }, { "Al", "ambiguous", "alabama" }, { "Ala", "", "alabama" }, { "AK", "", "alaska" }, { "Ak", "", "alaska" }, { "AZ", "", "arizona" }, { "Az", "", "arizona" }, { "CA", "", "california" }, { "Ca", "", "california" }, { "Cal", "ambiguous", "california" }, { "Calif", "", "california" }, { "CO", "ambiguous", "colorado" }, { "Co", "ambiguous", "colorado" }, { "Colo", "", "colorado" }, { "DC", "", "d" , "c" }, { "DE", "", "delaware" }, { "De", "ambiguous", "delaware" }, { "Del", "ambiguous", "delaware" }, { "FL", "", "florida" }, { "Fl", "ambiguous", "florida" }, { "Fla", "", "florida" }, { "GA", "", "georgia" }, { "Ga", "", "georgia" }, { "HI", "ambiguous", "hawaii" }, { "Hi", "ambiguous", "hawaii" }, { "IA", "", "iowa" }, { "Ia", "ambiguous", "iowa" }, { "IN", "ambiguous", "indiana" }, { "In", "ambiguous", "indiana" }, { "Ind", "ambiguous", "indiana" }, { "ID", "ambiguous", "idaho" }, { "IL", "ambiguous", "illinois" }, { "Il", "ambiguous", "illinois" }, { "ILL", "ambiguous", "illinois" }, { "KS", "", "kansas" }, { "Ks", "", "kansas" 
}, { "Kans", "", "kansas" }, { "KY", "ambiguous", "kentucky" }, { "Ky", "ambiguous", "kentucky" }, { "LA", "ambiguous", "louisiana" }, { "La", "ambiguous", "louisiana" }, { "Lou", "ambiguous", "louisiana" }, { "Lous", "ambiguous", "louisiana" }, { "MA", "ambiguous", "massachusetts" }, { "Mass", "ambiguous", "massachusetts" }, { "Ma", "ambiguous", "massachusetts" }, { "MD", "ambiguous", "maryland" }, { "Md", "ambiguous", "maryland" }, { "ME", "ambiguous", "maine" }, { "Me", "ambiguous", "maine" }, { "MI", "", "michigan" }, { "Mi", "ambiguous", "michigan" }, { "Mich", "ambiguous", "michigan" }, { "MN", "ambiguous", "minnestota" }, { "Minn", "ambiguous", "minnestota" }, { "MS", "ambiguous", "mississippi" }, { "Miss", "ambiguous", "mississippi" }, { "MT", "ambiguous", "montanna" }, { "Mt", "ambiguous", "montanna" }, { "MO", "ambiguous", "missouri" }, { "Mo", "ambiguous", "missouri" }, { "NC", "ambiguous", "north" , "carolina" }, { "ND", "ambiguous", "north" , "dakota" }, { "NE", "ambiguous", "nebraska" }, { "Ne", "ambiguous", "nebraska" }, { "Neb", "ambiguous", "nebraska" }, { "NH", "ambiguous", "new" , "hampshire" }, { "NV", "", "nevada" }, { "Nev", "", "nevada" }, { "NY", "", "new" , "york" }, { "OH", "ambiguous", "ohio" }, { "OK", "ambiguous", "oklahoma" }, { "Okla", "", "oklahoma" }, { "OR", "ambiguous", "oregon" }, { "Or", "ambiguous", "oregon" }, { "Ore", "ambiguous", "oregon" }, { "PA", "ambiguous", "pennsylvania" }, { "Pa", "ambiguous", "pennsylvania" }, { "Penn", "ambiguous", "pennsylvania" }, { "RI", "ambiguous", "rhode" , "island" }, { "SC", "ambiguous", "south" , "carlolina" }, { "SD", "ambiguous", "south" , "dakota" }, { "TN", "ambiguous", "tennesee" }, { "Tn", "ambiguous", "tennesee" }, { "Tenn", "ambiguous", "tennesee" }, { "TX", "ambiguous", "texas" }, { "Tx", "ambiguous", "texas" }, { "Tex", "ambiguous", "texas" }, { "UT", "ambiguous", "utah" }, { "VA", "ambiguous", "virginia" }, { "WA", "ambiguous", "washington" }, { "Wa", "ambiguous", "washington" }, 
{ "Wash", "ambiguous", "washington" }, { "WI", "ambiguous", "wisconsin" }, { "Wi", "ambiguous", "wisconsin" }, { "WV", "ambiguous", "west" , "virginia" }, { "WY", "ambiguous", "wyoming" }, { "Wy", "ambiguous", "wyoming" }, { "Wyo", "", "wyoming" }, { "PR", "ambiguous", "puerto" , "rico" } }; // Again hashtable for constant time searching private static Hashtable usStatesHash = new Hashtable(); // initialize the Hashtable for usStates static { for (int i = 0; i < usStates.length; i++) { usStatesHash.put(usStates[i][0], usStates[i]); } }; // class variables // the word relation that we are building private WordRelation wordRelation; // the current token Item private Item tokenItem; // a CART for classifying numbers private CART cart; /** * Constructs a default USTokenWordProcessor. It uses the USEnglish * regular expression set (USEngRegExp) by default. * * @param usNumbersCART the cart to use to classify numbers */ public TokenToWords(CART usNumbersCART, PronounceableFSM prefixFSM, PronounceableFSM suffixFSM) { this.cart = usNumbersCART; this.prefixFSM = prefixFSM; this.suffixFSM = suffixFSM; } /** * Returns the currently processing token Item. 
* * @return the current token Item; null if no item */ public Item getTokenItem() { return tokenItem; } /** * process the utterance * * @param utterance the utterance contain the tokens * * @throws ProcessException if an IOException is thrown during the * processing of the utterance */ public void processUtterance(Utterance utterance) throws ProcessException { Relation tokenRelation; if ((tokenRelation = utterance.getRelation(Relation.TOKEN)) == null) { throw new IllegalStateException ("TokenToWords: Token relation does not exist"); } Item wordItem; wordRelation = WordRelation.createWordRelation(utterance, this); for (tokenItem = tokenRelation.getHead(); tokenItem != null; tokenItem = tokenItem.getNext()) { FeatureSet featureSet = tokenItem.getFeatures(); String tokenVal = featureSet.getString("name"); // convert the token into a list of words tokenToWords(tokenVal); } } /** * Returns true if the given token matches part of a phone number * * @param tokenItem the token * @param tokenVal the string value of the token * * @return true or false */ private boolean matchesPartPhoneNumber(String tokenVal) { String n_name = (String) tokenItem.findFeature("n.name"); String n_n_name = (String) tokenItem.findFeature("n.n.name"); String p_name = (String) tokenItem.findFeature("p.name"); String p_p_name = (String) tokenItem.findFeature("p.p.name"); boolean matches3DigitsP_name = matches(threeDigitsPattern, p_name); return ((matches(threeDigitsPattern, tokenVal) && ((!matches(digitsPattern, p_name) && matches(threeDigitsPattern, n_name) && matches(fourDigitsPattern, n_n_name)) || (matches(sevenPhoneNumberPattern, n_name)) || (!matches(digitsPattern, p_p_name) && matches3DigitsP_name && matches(fourDigitsPattern, n_name)))) || (matches(fourDigitsPattern, tokenVal) && (!matches(digitsPattern, n_name) && matches3DigitsP_name && matches(threeDigitsPattern, p_p_name)))); } /** * Returns true if the given string is in the given string array. 
*
     * @param value the string to look for
     * @param stringArray the array to search, front to back
     *
     * @return true if some element of the array equals the string,
     *    false otherwise
     */
    private static boolean inStringArray(String value, String[] stringArray) {
        boolean found = false;
        // stop at the first element that equals the value
        for (int index = 0; !found && index < stringArray.length; index++) {
            found = stringArray[index].equals(value);
        }
        return found;
    }

    /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -