fastdictionary.java

来自「It is the Speech recognition software. 」· Java 代码 · 共 455 行

JAVA
455
字号
/* * Copyright 1999-2002 Carnegie Mellon University.   * Portions Copyright 2002 Sun Microsystems, Inc.   * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved.  Use is subject to license terms. *  * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL  * WARRANTIES. * */package edu.cmu.sphinx.linguist.dictionary;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.URL;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import java.util.LinkedList;import java.util.List;import java.util.Map;import java.util.Set;import java.util.SortedMap;import java.util.StringTokenizer;import java.util.TreeMap;import java.util.logging.Logger;import edu.cmu.sphinx.linguist.acoustic.Context;import edu.cmu.sphinx.linguist.acoustic.Unit;import edu.cmu.sphinx.linguist.acoustic.UnitManager;import edu.cmu.sphinx.util.StreamFactory;import edu.cmu.sphinx.util.Timer;import edu.cmu.sphinx.util.props.PropertyException;import edu.cmu.sphinx.util.props.PropertySheet;import edu.cmu.sphinx.util.props.PropertyType;import edu.cmu.sphinx.util.props.Registry;/** * Creates a dictionary by quickly reading in an ASCII-based Sphinx-3 format * dictionary. It is called the FastDictionary because the loading is fast. * When loaded the dictionary just loads each line of the dictionary into the * hash table, assuming that most words are not going to be used. Only when a * word is actually used is its pronunciations massaged into an array of * pronunciations. * <p> * The format of the ASCII dictionary that it explains is the same as the * {@link FullDictionary FullDictionary}, i.e., the word, followed by spaces * or tab, followed by the pronunciation(s). For example, a digits dictionary * will look like: *  * <pre> *  ONE HH W AH N *  ONE(2) W AH N *  TWO T UW *  THREE TH R IY *  FOUR F AO R *  FIVE F AY V *  SIX S IH K S *  SEVEN S EH V AH N *  EIGHT EY T *  NINE N AY N *  ZERO Z IH R OW *  ZERO(2) Z IY R OW *  OH OW * </pre> *  * <p> * In the above example, the words "one" and "zero" have two pronunciations * each. */public class FastDictionary implements Dictionary {    // -------------------------------    // Configuration data    // --------------------------------    private String name;    private Logger logger;    private boolean addSilEndingPronunciation;    private boolean allowMissingWords;    private boolean createMissingWords;    private String wordReplacement;    private URL wordDictionaryFile;    private URL fillerDictionaryFile;    private UnitManager unitManager;    // -------------------------------    // working data    // -------------------------------    private Map dictionary;    private final static String FILLER_TAG = "-F-";    private Set fillerWords;    private boolean allocated;    /*     * (non-Javadoc)     *      * @see edu.cmu.sphinx.util.props.Configurable#register(java.lang.String,     *      edu.cmu.sphinx.util.props.Registry)     */    public void register(String name, Registry registry)        throws PropertyException {        this.name = name;        registry.register(PROP_DICTIONARY, PropertyType.RESOURCE);        registry.register(PROP_FILLER_DICTIONARY, PropertyType.RESOURCE);        registry.register(PROP_ADD_SIL_ENDING_PRONUNCIATION,                          PropertyType.BOOLEAN);        registry.register(PROP_WORD_REPLACEMENT, PropertyType.STRING);        registry.register(PROP_ALLOW_MISSING_WORDS, PropertyType.BOOLEAN);        registry.register(PROP_CREATE_MISSING_WORDS, PropertyType.BOOLEAN);        registry.register(PROP_UNIT_MANAGER, PropertyType.COMPONENT);    }    /*     * (non-Javadoc)     *      * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet)     */    public void newProperties(PropertySheet ps) throws PropertyException {        logger = ps.getLogger();                wordDictionaryFile = ps.getResource(PROP_DICTIONARY);        fillerDictionaryFile = ps.getResource(PROP_FILLER_DICTIONARY);        addSilEndingPronunciation = ps.getBoolean(                PROP_ADD_SIL_ENDING_PRONUNCIATION,                PROP_ADD_SIL_ENDING_PRONUNCIATION_DEFAULT);        wordReplacement = ps.getString(Dictionary.PROP_WORD_REPLACEMENT,                PROP_WORD_REPLACEMENT_DEFAULT);        allowMissingWords = ps.getBoolean(Dictionary.PROP_ALLOW_MISSING_WORDS,                PROP_ALLOW_MISSING_WORDS_DEFAULT);        createMissingWords = ps.getBoolean(PROP_CREATE_MISSING_WORDS,                PROP_CREATE_MISSING_WORDS_DEFAULT);        unitManager = (UnitManager) ps.getComponent(PROP_UNIT_MANAGER,                UnitManager.class);    }    /*     * (non-Javadoc)     *      * @see edu.cmu.sphinx.util.props.Configurable#getName()     */    public String getName() {        return name;    }    /*     * (non-Javadoc)     *      * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#allocate()     */    public void allocate() throws IOException {        if (!allocated) {            dictionary = new HashMap();            Timer loadTimer = Timer.getTimer("DictionaryLoad");            fillerWords = new HashSet();            loadTimer.start();            logger.info("Loading dictionary from: " + wordDictionaryFile);            loadDictionary(wordDictionaryFile.openStream(), false);            logger.info("Loading filler dictionary from: " +                        fillerDictionaryFile);            loadDictionary(fillerDictionaryFile.openStream(), true);            loadTimer.stop();        }    }    /*     * (non-Javadoc)     *      * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#deallocate()     */public void deallocate() {        if (allocated) {            dictionary = null;            allocated = false;        }    }    /**     * Loads the given sphinx3 style simple dictionary from the given     * InputStream. The InputStream is assumed to contain ASCII data.     *      * @param inputStream     *                the InputStream of the dictionary     * @param isFillerDict     *                true if this is a filler dictionary, false otherwise     *      * @throws java.io.IOException     *                 if there is an error reading the dictionary     */    private void loadDictionary(InputStream inputStream, boolean isFillerDict)            throws IOException {        InputStreamReader isr = new InputStreamReader(inputStream);        BufferedReader br = new BufferedReader(isr);        String line;        while ((line = br.readLine()) != null) {            if (line.length() > 0) {                int spaceIndex = line.indexOf(' ');                int spaceIndexTab = line.indexOf('\t');                if (spaceIndex == -1) {                    // Case where there's no blank character                    spaceIndex = spaceIndexTab;                } else if ((spaceIndexTab >= 0) && (spaceIndexTab < spaceIndex)) {                    // Case where there's a blank and a tab, but the tab                    // precedes the blank                    spaceIndex = spaceIndexTab;                }                // TODO: throw an exception if spaceIndex == -1 ?                if (spaceIndex == -1) {                    throw new Error("Error loading word: " + line);                }                String word = line.substring(0, spaceIndex);                word = word.toLowerCase();                if (isFillerDict) {                    dictionary.put(word, (FILLER_TAG + line));                    fillerWords.add(word);                } else {                    dictionary.put(word, line);                }            }        }        br.close();        isr.close();        inputStream.close();    }    /**     * Gets a context independent unit. There should only be one instance of     * any CI unit     *      * @param name     *                the name of the unit     * @param isFiller     *                if true, the unit is a filler unit     *      * @return the unit     *       */    private Unit getCIUnit(String name, boolean isFiller) {        return unitManager.getUnit(name, isFiller, Context.EMPTY_CONTEXT);    }    /**     * Returns the sentence start word.     *      * @return the sentence start word     */    public Word getSentenceStartWord() {        return getWord(SENTENCE_START_SPELLING);    }    /**     * Returns the sentence end word.     *      * @return the sentence end word     */    public Word getSentenceEndWord() {        return getWord(SENTENCE_END_SPELLING);    }    /**     * Returns the silence word.     *      * @return the silence word     */    public Word getSilenceWord() {        return getWord(SILENCE_SPELLING);    }    /**     * Returns a Word object based on the spelling and its classification. The     * behavior of this method is also affected by the properties     * wordReplacement, allowMissingWords, and createMissingWords.     *      * @param text     *                the spelling of the word of interest.     *      * @return a Word object     *      * @see edu.cmu.sphinx.linguist.dictionary.Word     */    public Word getWord(String text) {        Word word = null;        text = text.toLowerCase();        Object object = dictionary.get(text);        if (object == null) { // deal with 'not found' case            logger.warning("Missing word: " + text);            if (wordReplacement != null) {                word = getWord(wordReplacement);            } else if (allowMissingWords) {                if (createMissingWords) {                    word = createWord(text, null, false);                }            }                        } else if (object instanceof String) { // first lookup for this string            word = processEntry(text);        } else if (object instanceof Word) {            word = (Word) object;        }        return word;    }    /**     * Create a Word object with the given spelling and pronunciations, and     * insert it into the dictionary.     *      * @param text     *                the spelling of the word     * @param pronunciation     *                the pronunciation of the word     * @param isFiller     *                if <code>true</code> this is a filler word     *      * @return the word     */    private Word createWord(String text, Pronunciation[] pronunciation,            boolean isFiller) {        Word word = new Word(text, pronunciation, isFiller);        dictionary.put(text, word);        return word;    }    /**     * Processes a dictionary entry. When loaded the dictionary just loads each     * line of the dictionary into the hash table, assuming that most words are     * not going to be used. Only when a word is actually used is its     * pronunciations massaged into an array of pronunciations.     */    private Word processEntry(String word) {        List pList = new LinkedList();        String line = null;        int count = 0;        boolean isFiller = false;        do {            count++;            String lookupWord = word;            if (count > 1) {                lookupWord = lookupWord + "(" + count + ")";            }            line = (String) dictionary.get(lookupWord);            if (line != null) {                StringTokenizer st = new StringTokenizer(line);                String tag = st.nextToken();                isFiller = tag.startsWith(FILLER_TAG);                int unitCount = st.countTokens();                dictionary.remove(lookupWord);                Unit[] units = new Unit[unitCount];                for (int i = 0; i < units.length; i++) {                    String unitName = st.nextToken();                    units[i] = getCIUnit(unitName, isFiller);                }                if (!isFiller && addSilEndingPronunciation) {                    Unit[] silUnits = new Unit[unitCount + 1];                    System.arraycopy(units, 0, silUnits, 0, unitCount);                    silUnits[unitCount] = UnitManager.SILENCE;                    units = silUnits;                }                pList.add(new Pronunciation(units, null, null, 1.f));            }        } while (line != null);        Pronunciation[] pronunciations = new Pronunciation[pList.size()];        pList.toArray(pronunciations);        Word wordObject = createWord(word, pronunciations, isFiller);        for (int i = 0; i < pronunciations.length; i++) {            pronunciations[i].setWord(wordObject);        }        return wordObject;    }    /**     * Returns the set of all possible word classifications for this     * dictionary.     *      * @return the set of all possible word classifications     */    public WordClassification[] getPossibleWordClassifications() {        return null;    }    /**     * Returns a string representation of this FastDictionary in alphabetical     * order.     *      * @return a string representation of this FastDictionary     */    public String toString() {        SortedMap sorted = new TreeMap(dictionary);        String result = "";        for (Iterator i = sorted.keySet().iterator(); i.hasNext();) {            String word = (String) i.next();            List pronunciations = (List) sorted.get(word);            result += (word + "\n");            for (Iterator p = pronunciations.iterator(); p.hasNext();) {                Pronunciation pronunciation = (Pronunciation) p.next();                result += ("   " + pronunciation.toString() + "\n");            }        }        return result;    }    /**     * Gets the set of all filler words in the dictionary     *      * @return an array (possibly empty) of all filler words     */    public Word[] getFillerWords() {        Word[] fillerWordArray = new Word[fillerWords.size()];        int index = 0;        for (Iterator i = fillerWords.iterator(); i.hasNext();) {            String spelling = (String) i.next();            fillerWordArray[index++] = getWord(spelling);        }        return fillerWordArray;    }    /**     * Dumps this FastDictionary to System.out.     */    public void dump() {        System.out.print(toString());    }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?