📄 lextreelinguist.java
字号:
/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */package edu.cmu.sphinx.linguist.lextree;import java.io.IOException;import java.util.ArrayList;import java.util.Collection;import java.util.Iterator;import java.util.List;import java.util.LinkedHashMap;import java.util.Map;import java.util.logging.Logger;import edu.cmu.sphinx.linguist.HMMSearchState;import edu.cmu.sphinx.linguist.Linguist;import edu.cmu.sphinx.linguist.SearchGraph;import edu.cmu.sphinx.linguist.SearchState;import edu.cmu.sphinx.linguist.SearchStateArc;import edu.cmu.sphinx.linguist.UnitSearchState;import edu.cmu.sphinx.linguist.WordSearchState;import edu.cmu.sphinx.linguist.WordSequence;import edu.cmu.sphinx.linguist.acoustic.AcousticModel;import edu.cmu.sphinx.linguist.acoustic.HMM;import edu.cmu.sphinx.linguist.acoustic.HMMState;import edu.cmu.sphinx.linguist.acoustic.HMMStateArc;import edu.cmu.sphinx.linguist.acoustic.Unit;import edu.cmu.sphinx.linguist.acoustic.UnitManager;import edu.cmu.sphinx.linguist.dictionary.Dictionary;import edu.cmu.sphinx.linguist.dictionary.Pronunciation;import edu.cmu.sphinx.linguist.dictionary.Word;import edu.cmu.sphinx.linguist.language.ngram.LanguageModel;import edu.cmu.sphinx.linguist.util.HMMPool;import edu.cmu.sphinx.util.LogMath;import edu.cmu.sphinx.util.Timer;import edu.cmu.sphinx.util.props.PropertyException;import edu.cmu.sphinx.util.props.PropertySheet;import edu.cmu.sphinx.util.props.PropertyType;import edu.cmu.sphinx.util.props.Registry;/** * A linguist that can represent large vocabularies efficiently. This class * implements the Linguist interface. The main role of any linguist is to * represent the search space for the decoder. 
The initial state in the search * space can be retrieved by a SearchManager via a call to <code>getInitialSearchState</code>. * This method returns a SearchState. Successor states can be retrieved via * calls to <code>SearchState.getSuccessors()</code>. There are a number of * search state subinterfaces that are used to indicate different types of * states in the search space: * * <ul> * <li><b>WordSearchState </b>- represents a word in the search space. * <li><b>UnitSearchState </b>- represents a unit in the search space. * <li><b>HMMSearchState </b>- represents an HMM state in the search space. * </ul> * * A linguist has a great deal of latitude about the order in which it returns * states. For instance, a 'flat' linguist may return a WordState at the * beginning of a word, while a 'tree' linguist may return WordStates at the * ending of a word. Likewise, a linguist may omit certain state types * completely (such as a unit state). Some Search Managers may want to know a * priori the order in which states will be generated by the linguist. The * method <code>getSearchStateOrder</code> can be used to retrieve the order * of states returned by the linguist. * * <p> * Depending on the vocabulary size and topology, the search space represented * by the linguist may include a very large number of states. Some linguists * will generate the search states dynamically, that is, the object * representing a particular state in the search space is not created until it * is needed by the SearchManager. SearchManagers often need to be able to * determine if a particular state has been entered before by comparing states. * Because SearchStates may be generated dynamically, the <code>SearchState.equals()</code> * call (as opposed to the reference equals '==' method) should be used to * determine if states are equal. The states returned by the linguist will * generally provide very efficient implementations of <code>equals</code> * and <code>hashCode</code>. 
This will allow a SearchManager to maintain * collections of states in HashMaps efficiently. * * * <p> * <b>LexTreeLinguist Characteristics </b> * * Some characteristics of this linguist: * <ul> * <li><b>Dynamic </b>- the linguist generates search states on the fly, * greatly reducing the required memory footprint. * <li><b>tree topology </b>- this linguist represents the search space as an * inverted tree. Units near the roots of words are shared among many different * words. This reduces the number of states that need to be considered during * the search. * <li><b>HMM sharing </b>- because of state tying in the acoustic models, it * is often the case that triphone units that differ in the right context * actually are represented by the same HMM. This linguist recognizes this case * and will use a single state to represent the HMM instead of two states. This * can greatly reduce the number of states generated by the linguist. * <li><b>Small-footprint </b>- this linguist uses a few other techniques to * reduce the overall footprint of the search space. One technique that is * particularly helpful is to share the end word units (where the largest * fanout of states occurs) across all of the words. For a 60K word vocabulary, * this can result in a reduction in tree nodes from about 2 million to around * 3,000. * <li><b>Quick loading </b>- this linguist can compile the search space very * quickly. A 60K word vocabulary can be made ready in less than 10 seconds. * </ul> * * This linguist is not a general purpose linguist. It does impose some * constraints: * * <ul> * <li><b>unit size </b>- this linguist will use units that are no larger than * triphones. * <li><b>n-gram grammars </b>- this linguist will generate the search space * directly from the N-Gram language model. The vocabulary supported is the * intersection of the words found in the language model and the words that * exist in the Dictionary. It is assumed that all sequences of words in the * vocabulary are valid. 
This linguist doesn't support arbitrary grammars. * </ul> * * <p> * <b>Design Notes </b> The following are some notes describing the design of * this linguist. They may be helpful to those who want to understand how this * linguist works but are not necessary if you are only interested in using * this linguist. * * * <p> * <b>Search Space Representation </b> It has been shown that representing the * search space as a tree can greatly reduce the number of active states in a * search since the units at the beginnings of words can be shared across * multiple words. For example, with a large vocabulary (60K words), at the end * of a word, with a flat representation, we have to provide transitions to the * initial state of each possible word. That is 60K transitions. In a tree * based system we only need to provide transitions to each initial phone * (within its context). That is about 1600 transitions. This is a substantial * reduction. Conceptually, this tree consists of a node for each possible * initial unit. Each node can have an arbitrary number of children which can * be either unit nodes or word nodes. * * <p> * This linguist uses the HMMTree class to build and represent the tree. The * HMMTree is given the dictionary and language model and builds the lex tree. * Instead of representing the nodes in the tree as phonemes and words as is * typically done, the HMMTree represents the tree as HMMs and words. The HMM * is essentially a unit within its context. This is typically a triphone * (although for some units, such as SIL, it is a simple phone). Representing * the nodes as HMMs instead of phonemes yields a much larger tree, but also has * some advantages: * * <ul> * <li>Because of state-tying in the acoustic models, many distinct triphones * actually share an HMM. Representing the nodes as HMMs allows these shared * HMMs to be represented in the tree only once instead of many times if we * represented states as phones or triphones. 
This leads to a reduction in the * actual number of states that are considered during a search. Experiments * have shown that this can reduce the required beam by a factor of 2 or 3. * <li>By representing the nodes as HMMs, we avoid having to look up the HMM for * a particular triphone during the search. This is a modest savings. * </ul> * * There are some disadvantages in representing the tree with HMMs: * * <ul> * <li><b>size </b>- since HMMs represent units in their context, we have many * more copies of each node. For instance, instead of having a single unit * representing the initial 'd' in the word 'dog' we would have about 40 HMMs, * one for each possible left context. * <li><b>speed </b>- building the much larger HMM tree can take much more * time, since many more nodes are needed to represent the tree. * <li><b>complexity </b>- representing the tree with HMMs is more complex. * There are multiple entry points for each word/unit that have to be dealt * with. * </ul> * * Luckily the size and speed issues can be mitigated (by adding a bit more * complexity of course). The bulk of the nodes in the HMM tree are the word * ending nodes. There is a word ending node for each possible right context. * To reduce space, all of the word ending nodes are replaced by a single * EndNode. During the search, the actual HMM nodes for a particular EndNode * are generated on request. These sets of HMM nodes can be shared among * different word endings, and therefore are cached. The effect of using this * EndNode optimization is to reduce the space required by the tree by about * 300 MB and the time required to generate the tree from about 60 seconds to * about 6 seconds. 
* * */public class LexTreeLinguist implements Linguist { /** * A sphinx property used to define the grammar to use when building the * search graph */ public final static String PROP_GRAMMAR = "grammar"; /** * A sphinx property used to define the acoustic model to use when building * the search graph */ public final static String PROP_ACOUSTIC_MODEL = "acousticModel"; /** * A sphinx property used to define the unit manager to use * when building the search graph */ public final static String PROP_UNIT_MANAGER = "unitManager"; /** * Sphinx property that defines the name of the logmath to be used by this * search manager. */ public final static String PROP_LOG_MATH = "logMath"; /** * Sphinx property used to determine whether or not the gstates are dumped. * * A sphinx property that determines whether or not full word histories are * used to determine when two states are equal. */ public final static String PROP_FULL_WORD_HISTORIES = "fullWordHistories"; /** * The default value for PROP_FULL_WORD_HISTORIES */ public final static boolean PROP_FULL_WORD_HISTORIES_DEFAULT = true; /** * A sphinx property for the language model to be used by this grammar */ public final static String PROP_LANGUAGE_MODEL = "languageModel"; /** * Property that defines the dictionary to use for this grammar */ public final static String PROP_DICTIONARY = "dictionary"; /** * A sphinx property that defines the size of the arc cache (zero * to disable the cache). 
*/ public final static String PROP_CACHE_SIZE = "cacheSize"; /** * Property that defines the dictionary to use for this grammar */ public final static int PROP_CACHE_SIZE_DEFAULT = 0; // just for detailed debugging private final static boolean tracing = false; private final static SearchStateArc[] EMPTY_ARC = new SearchStateArc[0]; // ---------------------------------- // Subcomponents that are configured // by the property sheet // ----------------------------------- private LanguageModel languageModel; private AcousticModel acousticModel; private LogMath logMath; private Dictionary dictionary; private UnitManager unitManager; // ------------------------------------ // Data that is configured by the // property sheet // ------------------------------------ private String name; private Logger logger; private boolean fullWordHistories = true; private boolean addFillerWords = false; private boolean generateUnitStates = false; private boolean wantUnigramSmear = true; private float unigramSmearWeight = 1.0f; private float unigramSmearOffset = .0f; private boolean cacheEnabled = false; private int maxArcCacheSize = 0; private float languageWeight; private float logWordInsertionProbability; private float logUnitInsertionProbability; private float logFillerInsertionProbability; private float logSilenceInsertionProbability; private float logOne; // ------------------------------------ // Data used for building and maintaining // the search graph // ------------------------------------- private Word sentenceEndWord; private Word[] sentenceStartWordArray; private SearchGraph searchGraph; private HMMPool hmmPool; private HMMTree hmmTree; private ArcCache arcCache = new ArcCache(); private int cacheTrys; private int cacheHits; /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#register(java.lang.String, * edu.cmu.sphinx.util.props.Registry) */ public void register(String name, Registry registry) throws PropertyException { this.name = name; 
registry.register(PROP_ACOUSTIC_MODEL, PropertyType.COMPONENT); registry.register(PROP_LOG_MATH, PropertyType.COMPONENT); registry.register(PROP_LANGUAGE_MODEL, PropertyType.COMPONENT); registry.register(PROP_DICTIONARY, PropertyType.COMPONENT); registry.register(PROP_FULL_WORD_HISTORIES, PropertyType.BOOLEAN); registry.register(PROP_WANT_UNIGRAM_SMEAR, PropertyType.BOOLEAN); registry.register(PROP_WORD_INSERTION_PROBABILITY, PropertyType.DOUBLE); registry.register(PROP_SILENCE_INSERTION_PROBABILITY, PropertyType.DOUBLE); registry.register(PROP_FILLER_INSERTION_PROBABILITY, PropertyType.DOUBLE); registry.register(PROP_UNIT_INSERTION_PROBABILITY, PropertyType.DOUBLE); registry.register(PROP_LANGUAGE_WEIGHT, PropertyType.FLOAT); registry.register(PROP_ADD_FILLER_WORDS, PropertyType.BOOLEAN); registry.register(PROP_GENERATE_UNIT_STATES, PropertyType.BOOLEAN); registry.register(PROP_UNIGRAM_SMEAR_WEIGHT, PropertyType.FLOAT); registry.register(PROP_CACHE_SIZE, PropertyType.INT); registry.register(PROP_UNIT_MANAGER, PropertyType.COMPONENT); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet) */ public void newProperties(PropertySheet ps) throws PropertyException { logger = ps.getLogger(); acousticModel = (AcousticModel) ps.getComponent(PROP_ACOUSTIC_MODEL, AcousticModel.class); logMath = (LogMath) ps.getComponent(PROP_LOG_MATH, LogMath.class); unitManager = (UnitManager) ps.getComponent(PROP_UNIT_MANAGER, UnitManager.class); languageModel = (LanguageModel) ps.getComponent(PROP_LANGUAGE_MODEL, LanguageModel.class); dictionary = (Dictionary) ps.getComponent(PROP_DICTIONARY, Dictionary.class); fullWordHistories = ps.getBoolean(PROP_FULL_WORD_HISTORIES, PROP_FULL_WORD_HISTORIES_DEFAULT); wantUnigramSmear = ps.getBoolean(PROP_WANT_UNIGRAM_SMEAR, PROP_WANT_UNIGRAM_SMEAR_DEFAULT); logWordInsertionProbability = logMath.linearToLog(ps.getDouble( PROP_WORD_INSERTION_PROBABILITY, 
PROP_WORD_INSERTION_PROBABILITY_DEFAULT)); logSilenceInsertionProbability = logMath.linearToLog(ps.getDouble( PROP_SILENCE_INSERTION_PROBABILITY, PROP_SILENCE_INSERTION_PROBABILITY_DEFAULT)); logFillerInsertionProbability = logMath.linearToLog(ps.getDouble( PROP_FILLER_INSERTION_PROBABILITY, PROP_FILLER_INSERTION_PROBABILITY_DEFAULT)); logUnitInsertionProbability = logMath.linearToLog(ps.getDouble( PROP_UNIT_INSERTION_PROBABILITY, PROP_UNIT_INSERTION_PROBABILITY_DEFAULT)); languageWeight = ps.getFloat(PROP_LANGUAGE_WEIGHT, PROP_LANGUAGE_WEIGHT_DEFAULT); addFillerWords = (ps.getBoolean(PROP_ADD_FILLER_WORDS, PROP_ADD_FILLER_WORDS_DEFAULT)); generateUnitStates = (ps.getBoolean(PROP_GENERATE_UNIT_STATES, PROP_GENERATE_UNIT_STATES_DEFAULT)); unigramSmearWeight = ps.getFloat(PROP_UNIGRAM_SMEAR_WEIGHT, PROP_UNIGRAM_SMEAR_WEIGHT_DEFAULT); int newMaxArcCacheSize = ps.getInt(PROP_CACHE_SIZE, PROP_CACHE_SIZE_DEFAULT); // if the new size of the arc cache is less than before // just clear out the cache, since we can easily grow it // but not easily shrink it. if (newMaxArcCacheSize < maxArcCacheSize) { arcCache = new ArcCache(); } maxArcCacheSize = newMaxArcCacheSize; cacheEnabled = maxArcCacheSize > 0; } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#getName() */ public String getName() { return name; } /* * (non-Javadoc) * * @see edu.cmu.sphinx.linguist.Linguist#allocate() */ public void allocate() throws IOException { dictionary.allocate(); acousticModel.allocate(); languageModel.allocate(); compileGrammar(); acousticModel = null;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -