fulldictionary.java
来自「It is the Speech recognition software. 」· Java 代码 · 共 425 行
JAVA
425 行
/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */package edu.cmu.sphinx.linguist.dictionary;import java.io.IOException;import java.io.InputStream;import java.net.URL;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.LinkedList;import java.util.List;import java.util.Map;import java.util.Set;import java.util.SortedMap;import java.util.TreeMap;import java.util.logging.Logger;import edu.cmu.sphinx.linguist.acoustic.Context;import edu.cmu.sphinx.linguist.acoustic.Unit;import edu.cmu.sphinx.linguist.acoustic.UnitManager;import edu.cmu.sphinx.util.ExtendedStreamTokenizer;import edu.cmu.sphinx.util.StreamFactory;import edu.cmu.sphinx.util.Timer;import edu.cmu.sphinx.util.props.PropertyException;import edu.cmu.sphinx.util.props.PropertySheet;import edu.cmu.sphinx.util.props.PropertyType;import edu.cmu.sphinx.util.props.Registry;/** * Creates a dictionary by reading in an ASCII-based Sphinx-3 * format dictionary. Each line of the dictionary specifies the * word, followed by spaces or tab, followed by the pronuncation * (by way of the list of phones) of the word. Each word can have * more than one pronunciations. * For example, a digits dictionary will look like: * * <pre> * ONE HH W AH N * ONE(2) W AH N * TWO T UW * THREE TH R IY * FOUR F AO R * FIVE F AY V * SIX S IH K S * SEVEN S EH V AH N * EIGHT EY T * NINE N AY N * ZERO Z IH R OW * ZERO(2) Z IY R OW * OH OW * </pre> * <p> * In the above example, the words "one" and "zero" have two pronunciations * each. * <p> * This dictionary will read in all the words and its pronunciation(s) * at startup. Therefore, if the dictionary is big, it will take longer * to load and will consume more memory. */public class FullDictionary implements Dictionary { // ---------------------------------- // configuration variables // ---------------------------------- private String name; private Logger logger; private boolean addSilEndingPronunciation; private boolean allowMissingWords; private boolean createMissingWords; private String wordReplacement; private URL wordDictionaryFile; private URL fillerDictionaryFile; private boolean allocated = false; private UnitManager unitManager; private Map wordDictionary; private Map fillerDictionary; private Timer loadTimer; /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#register(java.lang.String, * edu.cmu.sphinx.util.props.Registry) */ public void register(String name, Registry registry) throws PropertyException { this.name = name; registry.register(PROP_DICTIONARY, PropertyType.RESOURCE); registry.register(PROP_FILLER_DICTIONARY, PropertyType.RESOURCE); registry.register(PROP_ADD_SIL_ENDING_PRONUNCIATION, PropertyType.BOOLEAN); registry.register(PROP_WORD_REPLACEMENT, PropertyType.STRING); registry.register(PROP_ALLOW_MISSING_WORDS, PropertyType.BOOLEAN); registry.register(PROP_CREATE_MISSING_WORDS, PropertyType.BOOLEAN); registry.register(PROP_UNIT_MANAGER, PropertyType.COMPONENT); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet) */ public void newProperties(PropertySheet ps) throws PropertyException { logger = ps.getLogger(); wordDictionaryFile = ps.getResource(PROP_DICTIONARY); fillerDictionaryFile = ps.getResource(PROP_FILLER_DICTIONARY); addSilEndingPronunciation = ps.getBoolean( PROP_ADD_SIL_ENDING_PRONUNCIATION, PROP_ADD_SIL_ENDING_PRONUNCIATION_DEFAULT); wordReplacement = ps.getString(Dictionary.PROP_WORD_REPLACEMENT, PROP_WORD_REPLACEMENT_DEFAULT); allowMissingWords = ps.getBoolean(Dictionary.PROP_ALLOW_MISSING_WORDS, PROP_ALLOW_MISSING_WORDS_DEFAULT); createMissingWords = ps.getBoolean(PROP_CREATE_MISSING_WORDS, PROP_CREATE_MISSING_WORDS_DEFAULT); unitManager = (UnitManager) ps.getComponent(PROP_UNIT_MANAGER, UnitManager.class); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#getName() */ public String getName() { return name; } /* (non-Javadoc) * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#allocate() */ public void allocate() throws IOException { if (!allocated) { loadTimer = Timer.getTimer("DictionaryLoad"); loadTimer.start(); // NOTE: "location" can be null here, in which case the // "wordDictionaryFile" and "fillerDictionaryFile" should // contain the full path to the Dictionaries. logger.info("Loading dictionary from: " + wordDictionaryFile); wordDictionary = loadDictionary(wordDictionaryFile.openStream(), false); logger.info("Loading filler dictionary from: " + fillerDictionaryFile); fillerDictionary = loadDictionary(fillerDictionaryFile.openStream(), true); loadTimer.stop(); allocated = true; } } /* (non-Javadoc) * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#deallocate() */ public void deallocate() { if (allocated) { fillerDictionary = null; wordDictionary = null; loadTimer = null; allocated = false; } } /** * Loads the given sphinx3 style simple dictionary from the given * InputStream. The InputStream is assumed to contain ASCII data. * * @param inputStream * the InputStream of the dictionary * @param isFillerDict * true if this is a filler dictionary, false otherwise * * @throws java.io.IOException * if there is an error reading the dictionary */ private Map loadDictionary(InputStream inputStream, boolean isFillerDict) throws IOException { Map dictionary = new HashMap(); ExtendedStreamTokenizer est = new ExtendedStreamTokenizer(inputStream, true); String word; while ((word = est.getString()) != null) { word = removeParensFromWord(word); word = word.toLowerCase(); List units = new ArrayList(20); String unitText; while ((unitText = est.getString()) != null) { units.add(getCIUnit(unitText, isFillerDict)); } Unit[] unitsArray = (Unit[]) units.toArray(new Unit[units.size()]); List pronunciations = (List) dictionary.get(word); if (pronunciations == null) { pronunciations = new LinkedList(); } Pronunciation pronunciation = new Pronunciation(unitsArray, null, null, 1.0f); pronunciations.add(pronunciation); // if we are adding a SIL ending duplicate if (!isFillerDict && addSilEndingPronunciation) { units.add(UnitManager.SILENCE); Unit[] unitsArray2 = (Unit[]) units.toArray(new Unit[units .size()]); Pronunciation pronunciation2 = new Pronunciation(unitsArray2, null, null, 1.0f); pronunciations.add(pronunciation2); } dictionary.put(word, pronunciations); } inputStream.close(); est.close(); createWords(dictionary, isFillerDict); return dictionary; } /** * Converts the spelling/Pronunciations mappings in the dictionary into * spelling/Word mappings. * * @param isFillerDict * if true this is a filler dictionary * */ private void createWords(Map dictionary, boolean isFillerDict) { Set spellings = dictionary.keySet(); for (Iterator s = spellings.iterator(); s.hasNext();) { String spelling = (String) s.next(); List pronunciations = (List) dictionary.get(spelling); Pronunciation[] pros = new Pronunciation[pronunciations.size()]; for (int i = 0; i < pros.length; i++) { pros[i] = (Pronunciation) pronunciations.get(i); } Word word = new Word(spelling, pros, isFillerDict); for (int i = 0; i < pros.length; i++) { pros[i].setWord(word); } dictionary.put(spelling, word); } } /** * Gets a context independent unit. There should only be one instance of * any CI unit * * @param name * the name of the unit * @param isFiller * if true, the unit is a filler unit * * @return the unit * */ private Unit getCIUnit(String name, boolean isFiller) { return unitManager.getUnit(name, isFiller, Context.EMPTY_CONTEXT); } /** * Returns a new string that is the given word but with the ending * parenthesis removed. * <p> * Example: * * <pre> * "LEAD(2)" returns "LEAD" * "LEAD" returns "LEAD" * @param word * the word to be stripped * * @return the given word but with all characters from the first * open parentheses removed */ private String removeParensFromWord(String word) { if (word.charAt(word.length() - 1) == ')') { int index = word.lastIndexOf('('); if (index > 0) { word = word.substring(0, index); } } return word; } /** * Returns a Word object based on the spelling and its classification. The * behavior of this method is affected by the properties wordReplacement, * allowMissingWords, and createMissingWords. * * @param text * the spelling of the word of interest. * * @return a Word object * * @see edu.cmu.sphinx.linguist.dictionary.Word */ public Word getWord(String text) { text = text.toLowerCase(); Word word = lookupWord(text); if (word == null) { logger.warning("Missing word: " + text); if (wordReplacement != null) { word = lookupWord(wordReplacement); logger.warning("Replacing " + text + " with " + wordReplacement); if (word == null) { logger.severe("Replacement word " + wordReplacement + " not found!"); } } else if (allowMissingWords) { if (createMissingWords) { word = new Word(text, null, false); wordDictionary.put(text, word); } return null; } } return word; } /** * Lookups up a word * * @param spelling * the spellling of the word * * @return the word or null */ private Word lookupWord(String spelling) { Word word = (Word) wordDictionary.get(spelling); if (word == null) { word = (Word) fillerDictionary.get(spelling); } return word; } /** * Returns the sentence start word. * * @return the sentence start word */ public Word getSentenceStartWord() { return getWord(SENTENCE_START_SPELLING); } /** * Returns the sentence end word. * * @return the sentence end word */ public Word getSentenceEndWord() { return getWord(SENTENCE_END_SPELLING); } /** * Returns the silence word. * * @return the silence word */ public Word getSilenceWord() { return getWord(SILENCE_SPELLING); } /** * Returns the set of all possible word classifications for this * dictionary. * * @return the set of all possible word classifications */ public WordClassification[] getPossibleWordClassifications() { return null; } /** * Returns a string representation of this FullDictionary in alphabetical * order. * * @return a string representation of this FullDictionary */ public String toString() { SortedMap sorted = new TreeMap(wordDictionary); String result = ""; sorted.putAll(fillerDictionary); for (Iterator i = sorted.keySet().iterator(); i.hasNext();) { String text = (String) i.next(); Word word = getWord(text); Pronunciation[] pronunciations = word.getPronunciations(null); result += (word + "\n"); for (int p = 0; p < pronunciations.length; p++) { result += (" " + pronunciations[p].toString() + "\n"); } } return result; } /** * Gets the set of all filler words in the dictionary * * @return an array (possibly empty) of all filler words */ public Word[] getFillerWords() { return (Word[]) fillerDictionary.values().toArray( new Word[fillerDictionary.values().size()]); } /** * Dumps this FullDictionary to System.out. */ public void dump() { System.out.println(wordDictionary.size() + " words"); System.out.print(toString()); }}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?