largetrigrammodel.java

来自「It is the Speech recognition software. 」· Java 代码 · 共 1,208 行 · 第 1/3 页

JAVA
1,208
字号
                }            }        }        // loadedBigramBuffers = new BigramBuffer[unigrams.length];        loadedTrigramBuffer = new HashMap();        logger.info("LM Cache: 3-g " + trigramCache.size() + " 2-g "                + bigramCache.size());        if (clearCacheAfterUtterance) {            trigramCache = new LRUCache(maxTrigramCacheSize);            bigramCache = new LRUCache(maxBigramCacheSize);        }    }    /**     * Gets the ngram probability of the word sequence represented by the word     * list     *      * @param wordSequence     *                the word sequence     *      * @return the probability of the word sequence. Probability is in logMath     *         log base     *       */    public float getProbability(WordSequence wordSequence) {        if (logFile != null) {            logFile.println(wordSequence.toText());        }        int numberWords = wordSequence.size();        if (numberWords <= maxDepth) {            if (numberWords == 3) {                return getTrigramProbability(wordSequence);            } else if (numberWords == 2) {                return getBigramProbability(wordSequence);            } else if (numberWords == 1) {                return getUnigramProbability(wordSequence);            }        }        throw new Error("Unsupported N-gram: " + wordSequence.size());    }    /**     * Returns the unigram probability of the given unigram.     *      * @param wordSequence     *                the unigram word sequence     *      * @return the unigram probability     */    private float getUnigramProbability(WordSequence wordSequence) {        Word unigram = wordSequence.getWord(0);        UnigramProbability unigramProb = getUnigram(unigram);        if (unigramProb == null) {            throw new Error("Unigram not in LM: " + unigram);        }        return unigramProb.getLogProbability();    }    /**     * Returns its UnigramProbability if this language model has the given     * unigram.     
*      * @param unigram     *                the unigram to find     *      * @return the UnigramProbability, or null if this language model does not     *         have the unigram     */    private UnigramProbability getUnigram(Word unigram) {        return (UnigramProbability) unigramIDMap.get(unigram);    }    /**     * Returns true if this language model has the given unigram.     *      * @param unigram     *                the unigram to find     *      * @return true if this LM has this unigram, false otherwise     */    private boolean hasUnigram(Word unigram) {        return (unigramIDMap.get(unigram) != null);    }    /**     * Returns the ID of the given word.     *      * @param word     *                the word to find the ID     *      * @return the ID of the word     */    public final int getWordID(Word word) {        UnigramProbability probability = getUnigram(word);        if (probability == null) {            throw new IllegalArgumentException("No word ID: " + word);        } else {            return probability.getWordID();        }    }    /**     * Gets the smear term for the given wordSequence     *      * @param wordSequence     *                the word sequence     * @return the smear term associated with this word sequence     */    public float getSmearOld(WordSequence wordSequence) {        float smearTerm = 0.0f;        if (fullSmear) {            int length = wordSequence.size();            if (length > 0) {                int wordID = getWordID(wordSequence.getWord(length - 1));                smearTerm = (float) unigramSmearTerm[wordID];            }        }        if (fullSmear && logger.isLoggable(Level.FINE)) {            logger.fine("SmearTerm: " + smearTerm);        }        return smearTerm;    }    int smearCount;    int smearBigramHit;    public float getSmear(WordSequence wordSequence) {        float smearTerm = 1.0f;        if (fullSmear) {            smearCount++;            int length = wordSequence.size();            
if (length == 1) {                int wordID = getWordID(wordSequence.getWord(0));                smearTerm = (float) unigramSmearTerm[wordID];            } else if (length >= 2) {                int size = wordSequence.size();                int wordID1 = getWordID(wordSequence.getWord(size - 2));                int wordID2 = getWordID(wordSequence.getWord(size - 1));                Float st = getSmearTerm(wordID1, wordID2);                if (st == null) {                    smearTerm = (float) unigramSmearTerm[wordID2];                } else {                    smearTerm = st.floatValue();                    smearBigramHit++;                }            }            if (smearCount % 100000 == 0) {                System.out.println("Smear hit: " + smearBigramHit +                         " tot: " + smearCount);            }        }        if (fullSmear && logger.isLoggable(Level.FINE)) {            logger.fine("SmearTerm: " + smearTerm);        }        return smearTerm;    }    /**     * Returns the unigram probability of the given unigram.     
*      * @param wordSequence     *                the unigram word sequence     *      * @return the unigram probability     */    private float getBigramProbability(WordSequence wordSequence) {        Word firstWord = wordSequence.getWord(0);        if (loader.getNumberBigrams() <= 0 || !hasUnigram(firstWord)) {            return getUnigramProbability(wordSequence.getNewest());        }        BigramProbability bigramProbability = findBigram(wordSequence);        if (bigramProbability != null) {            return bigramProbTable[bigramProbability.getProbabilityID()];        } else {            Word secondWord = wordSequence.getWord(1);            if (getUnigram(secondWord) == null) {                throw new Error("Bad word2: " + secondWord);            }            // System.out.println("Didn't find bigram");            int firstWordID = getWordID(firstWord);            int secondWordID = getWordID(secondWord);            bigramMisses++;            return (unigrams[firstWordID].getLogBackoff() + unigrams[secondWordID]                    .getLogProbability());        }    }    /**     * Finds the BigramProbability for a particular bigram     *      * @param ws     *                the word sequence     *      * @return the BigramProbability of the bigram, or null if the given first     *         word has no bigrams     */    private BigramProbability findBigram(WordSequence ws) {        BigramProbability bigramProbability = (BigramProbability) bigramCache                .get(ws);        if (bigramProbability == null) {            int firstWordID = getWordID(ws.getWord(0));            int secondWordID = getWordID(ws.getWord(1));            BigramBuffer bigrams = getBigramBuffer(firstWordID);            if (bigrams != null) {                bigrams.setUsed(true);                bigramProbability = bigrams.findBigram(secondWordID);                if (bigramProbability != null) {                    bigramCache.put(ws, bigramProbability);                }            }  
      }        return bigramProbability;    }    /**     * Returns the bigrams of the given word     *      * @param firstWordID     *                the ID of the word     *      * @return the bigrams of the word     */    private BigramBuffer getBigramBuffer(int firstWordID) {        BigramBuffer bigramBuffer = loadedBigramBuffers[firstWordID];        if (bigramBuffer == null) {            int numberBigrams = getNumberBigramFollowers(firstWordID);            if (numberBigrams > 0) {                bigramBuffer = loadBigramBuffer(firstWordID, numberBigrams);                if (bigramBuffer != null) {                    loadedBigramBuffers[firstWordID] = bigramBuffer;                }            }        }        return bigramBuffer;    }    /**     * Loads the bigram followers of the given first word in a bigram from disk     * to memory. It actually loads (numberFollowers + 1) bigrams, since we     * need the first bigram of the next word to determine the number of     * trigrams of the last bigram.     
*      * @param firstWordID     *                ID of the first word     * @param numberFollowers     *                the number of bigram followers this word has     *      * @return the bigram followers of the given word     */    private BigramBuffer loadBigramBuffer(int firstWordID, int numberFollowers) {        BigramBuffer followers = null;        int firstBigramEntry = unigrams[firstWordID].getFirstBigramEntry();        int size = (numberFollowers + 1) * BYTES_PER_BIGRAM;        long position = (long) (loader.getBigramOffset() + (firstBigramEntry * BYTES_PER_BIGRAM));        try {            byte[] buffer = loader.loadBuffer(position, size);            followers = new BigramBuffer(buffer, numberFollowers + 1, loader                    .getBigEndian());        } catch (IOException ioe) {            ioe.printStackTrace();            throw new Error("Error loading bigram followers");        }        return followers;    }    /**     * Returns the number of bigram followers of a word.     *      * @param wordID     *                the ID of the word     *      * @return the number of bigram followers     */    private int getNumberBigramFollowers(int wordID) {        if (wordID == unigrams.length - 1) {            return 0;        } else {            return unigrams[wordID + 1].getFirstBigramEntry()                    - unigrams[wordID].getFirstBigramEntry();        }    }    /**     * Returns the language probability of the given trigram.     
*      * @param wordSequence     *                the trigram word sequence     *      * @return the trigram probability     */    private float getTrigramProbability(WordSequence wordSequence) {        Word firstWord = wordSequence.getWord(0);        if (loader.getNumberTrigrams() == 0 || !hasUnigram(firstWord)) {            return getBigramProbability(wordSequence.getNewest());        }        Float probability = (Float) trigramCache.get(wordSequence);        if (probability == null) {            float score = 0.0f;            int trigramProbID = findTrigram(wordSequence);            if (trigramProbID != -1) {                trigramHit++;                score = trigramProbTable[trigramProbID];            } else {                trigramMisses++;                BigramProbability bigram = findBigram(wordSequence.getOldest());                if (bigram != null) {                    score = trigramBackoffTable[bigram.getBackoffID()]                            + getBigramProbability(wordSequence.getNewest());                } else {                    score = getBigramProbability(wordSequence.getNewest());                }            }            probability = new Float(score);            trigramCache.put(wordSequence, probability);        }        return probability.floatValue();    }    /**     * Finds or loads the trigram probability of the given trigram.     
*
     * @param wordSequence
     *                the trigram to load
     *
     * @return the probability ID that the trigram buffer of the first two
     *         words reports for the third word, or -1 if no trigram buffer
     *         could be loaded for them
     */
    private int findTrigram(WordSequence wordSequence) {
        WordSequence oldest = wordSequence.getOldest();
        TrigramBuffer trigramBuffer = (TrigramBuffer) loadedTrigramBuffer
                .get(oldest);

        if (trigramBuffer == null) {
            // Not loaded yet: fetch the followers of the first two words.
            int firstWordID = getWordID(wordSequence.getWord(0));
            int secondWordID = getWordID(wordSequence.getWord(1));
            trigramBuffer = loadTrigramBuffer(firstWordID, secondWordID);
            if (trigramBuffer == null) {
                return -1;
            }
            loadedTrigramBuffer.put(oldest, trigramBuffer);
        }

        int thirdWordID = getWordID(wordSequence.getWord(2));
        return trigramBuffer.findProbabilityID(thirdWordID);
    }

    /**
     * Loads into a buffer all the trigram followers of the given bigram.
     *
     * @param firstWordID
     *                the ID of the first word
     * @param secondWordID
     *                the ID of the second word
     *
     * @return a TrigramBuffer of all the trigram followers of the given two
     *         words
     */
    private TrigramBuffer loadTrigramBuffer(int firstWordID, int secondWordID) {
        TrigramBuffer trigramBuffer = null;
        BigramBuffer bigramBuffer = getBigramBuffer(firstWordID);
        if (bigramBuffer != null) {
            BigramProbability bigram = bigramBuffer.findBigram(secondWordID);
            if (bigram != null) {
                BigramProbability nextBigram = bigramBuffer
                        .getBigramProbability(bigram.getWhichFollower() + 1);
                int firstBigramEntry = unigrams[firstWordID]
                        .getFirstBigramEntry();
                int firstTrigramEntry = getFirstTrigramEntry(bigram,
                        firstBigramEntry);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?