largetrigrammodel.java

来自「It is the Speech recognition software. 」· Java 代码 · 共 1,208 行 · 第 1/3 页

JAVA
1,208
字号
                int numberTrigrams = getFirstTrigramEntry(nextBigram,                        firstBigramEntry)                        - firstTrigramEntry;                int size = numberTrigrams * BYTES_PER_TRIGRAM;                long position = (loader.getTrigramOffset() + (long) (firstTrigramEntry * BYTES_PER_TRIGRAM));                try {                    // System.out.println("Loading TrigramBuffer from disk");                    byte[] buffer = loader.loadBuffer(position, size);                    trigramBuffer = new TrigramBuffer(buffer, numberTrigrams,                            loader.getBigEndian());                } catch (IOException ioe) {                    ioe.printStackTrace();                    throw new Error("Error loading trigrams.");                }            }        }        return trigramBuffer;    }    /**     * Returns the index of the first trigram entry of the given bigram     *      * @param bigram     *                the bigram which first trigram entry we're looking for     * @param firstBigramEntry     *                the index of the first bigram entry of the bigram in     *                question     *      * @return the index of the first trigram entry of the given bigram     */    private int getFirstTrigramEntry(BigramProbability bigram,            int firstBigramEntry) {        int firstTrigramEntry = trigramSegmentTable[(firstBigramEntry + bigram                .getWhichFollower()) >> loader.getLogBigramSegmentSize()]                + bigram.getFirstTrigramEntry();        return firstTrigramEntry;    }    /**     * Returns the backoff probability for the give sequence of words     *      * @param wordSequence     *                the sequence of words     *      * @return the backoff probability in LogMath log base     */    public float getBackoff(WordSequence wordSequence) {        float logBackoff = 0.0f; // log of 1.0        UnigramProbability prob = null; //getProb(wordSequence);        if (prob != null) {            logBackoff = prob.getLogBackoff();        }        return logBackoff;    }    /**     * Returns the maximum depth of the language model     *      * @return the maximum depth of the language model     */    public int getMaxDepth() {        return maxDepth;    }    /**     * Returns the set of words in the lanaguage model. The set is     * unmodifiable.     *      * @return the unmodifiable set of words     */    public Set getVocabulary() {        Set vocabulary = new HashSet();        vocabulary.addAll(Arrays.asList(loader.getWords()));        return Collections.unmodifiableSet(vocabulary);    }    /**     * Returns the number of times when a bigram is queried, but there is no     * bigram in the LM (in which case it uses the backoff probabilities).     *      * @return the number of bigram misses     */    public int getBigramMisses() {        return bigramMisses;    }    /**     * Returns the number of times when a trigram is queried, but there is no     * trigram in the LM (in which case it uses the backoff probabilities).     *      * @return the number of trigram misses     */    public int getTrigramMisses() {        return trigramMisses;    }    /**     * Returns the number of trigram hits.     *      * @return the number of trigram hits     */    public int getTrigramHits() {        return trigramHit;    }    private void buildSmearInfo() throws IOException {        int offset = 0;        double S0 = 0;        double R0 = 0;        bigramSmearMap = new HashMap();        double[] ugNumerator = new double[unigrams.length];        double[] ugDenominator = new double[unigrams.length];        double[] ugAvgLogProb = new double[unigrams.length];        unigramSmearTerm = new float[unigrams.length];        for (int i = 0; i < unigrams.length; i++) {            float logp = unigrams[i].getLogProbability();            double p = logMath.logToLinear(logp);            S0 += p * logp;            R0 += p * logp * logp;        }        System.out.println("R0 S0 " + R0 + " " + S0);        for (int i = 0; i < loadedBigramBuffers.length; i++) {            BigramBuffer bigram = getBigramBuffer(i);            if (bigram == null) {                unigramSmearTerm[i] = logMath.getLogOne();                continue;            }            ugNumerator[i] = 0.0;            ugDenominator[i] = 0.0;            ugAvgLogProb[i] = 0.0;            float logugbackoff = unigrams[i].getLogBackoff();            double ugbackoff = logMath.logToLinear(logugbackoff);                        for (int j = 0; j < bigram.getNumberNGrams(); j++) {                int wordID = bigram.getWordID(j);                BigramProbability bgProb = bigram.getBigramProbability(j);                float logugprob = unigrams[wordID].getLogProbability();                float logbgprob = bigramProbTable[bgProb.getProbabilityID()];                double ugprob = logMath.logToLinear(logugprob);                double bgprob = logMath.logToLinear(logbgprob);                double backoffbgprob = ugbackoff * ugprob;                double logbackoffbgprob = logMath.linearToLog(backoffbgprob);                ugNumerator[i] += (bgprob * logbgprob                     - backoffbgprob * logbackoffbgprob) * logugprob;                ugDenominator[i] += (bgprob - backoffbgprob) * logugprob;                if (false) {                    System.out.println("ubo " + ugprob + " " + bgprob + " " +                            backoffbgprob);                    System.out.println("logubo " + logugprob                             + " " + logbgprob + " " + logbackoffbgprob);                    System.out.println("n/d " + j + " "                             + ugNumerator[i] + " " + ugDenominator[i]);                }                if (false) {                    System.out.print( ugprob + " " + bgprob + " "                             + backoffbgprob);                    System.out.print(" " + logugprob + " "                             + logbgprob + " " + logbackoffbgprob);                    System.out.println("  "  + ugNumerator[i]                             + " " + ugDenominator[i]);                }            }            ugNumerator[i] += ugbackoff * (logugbackoff * S0 + R0);            ugAvgLogProb[i] = ugDenominator[i] + ugbackoff * S0;            ugDenominator[i] += ugbackoff * R0;            if (false) {                System.out.println("n/d " + ugNumerator[i] + " " +                        ugDenominator[i]);            }            unigramSmearTerm[i] = (float) (ugNumerator[i] / ugDenominator[i]);            /// unigramSmearTerm[i] =             //   logMath.linearToLog(ugNumerator[i] / ugDenominator[i]);            // System.out.println("ugs " + unigramSmearTerm[i]);        }        for (int i = 0; i < loadedBigramBuffers.length; i++) {            System.out.println("Processed " + i                     + " of " + loadedBigramBuffers.length);            BigramBuffer bigram = getBigramBuffer(i);            if (bigram == null) {                continue;            }            for (int j = 0; j < bigram.getNumberNGrams(); j++) {                float smearTerm;                BigramProbability bgProb = bigram.getBigramProbability(j);                float logbgbackoff = trigramBackoffTable[bgProb.getBackoffID()];                double bgbackoff = logMath.logToLinear(logbgbackoff);                int k = bigram.getWordID(j);                TrigramBuffer trigram = loadTrigramBuffer(i, k);                            if (trigram == null) {                    smearTerm = unigramSmearTerm[k];                } else {                    double bg_numerator = 0;                    double bg_denominator = 0;                    for (int l = 0; l < trigram.getNumberNGrams(); l++) {                        int m = trigram.getWordID(l);                        float logtgprob                             = trigramProbTable[trigram.getProbabilityID(l)];                        double tgprob = logMath.logToLinear(logtgprob);                        float logbgprob = getBigramProb(k, m);                        double bgprob = logMath.logToLinear(logbgprob);                        float logugprob = unigrams[m].getLogProbability();                        double ugprob = logMath.logToLinear(logugprob);                        double backofftgprob = bgbackoff * bgprob;                        double logbackofftgprob                             = logMath.linearToLog(backofftgprob);                        bg_numerator += (tgprob * logtgprob                             - backofftgprob * logbackofftgprob) * logugprob;                        bg_denominator += (tgprob - backofftgprob)                            * logugprob * logugprob;                    }                    bg_numerator += bgbackoff *  (logbgbackoff *                                ugAvgLogProb[k] - ugNumerator[k]);                    bg_denominator += bgbackoff * ugDenominator[k];                    // bigram.ugsmear = bg_numerator / bg_denominator;                    smearTerm = (float) (bg_numerator / bg_denominator);                    smearTermCount++;                }                putSmearTerm(i, k, smearTerm);            }        }        System.out.println("Smear count is " + smearTermCount);    }    /**     * Writes the smear info to the given file     *     * @param filename the file to write the smear info to     *     * @throws IOException if an error occurs on write     */    private void writeSmearInfo(String filename) throws IOException {        DataOutputStream out             = new DataOutputStream(new FileOutputStream(filename));        out.writeInt(SMEAR_MAGIC);            System.out.println("writing " + unigrams.length);        out.writeInt(unigrams.length);        for (int i = 0; i < unigrams.length; i++) {            out.writeFloat(unigramSmearTerm[i]);        }        for (int i = 0; i < unigrams.length; i++) {            System.out.println("Writing " + i                     + " of " + unigrams.length);            BigramBuffer bigram = getBigramBuffer(i);            if (bigram == null) {                out.writeInt(0);                continue;            }            out.writeInt(bigram.getNumberNGrams());            for (int j = 0; j < bigram.getNumberNGrams(); j++) {                int k = bigram.getWordID(j);                Float smearTerm = getSmearTerm(i, k);                out.writeInt(k);                out.writeFloat(smearTerm.floatValue());            }        }        out.close();    }    /**     * Reads the smear info from the given file     *     * @param filename where to read the smear info from     * @throws IOException if an inconsistent file is found or on any     * general I/O error     */    private void readSmearInfo(String filename) throws IOException {        DataInputStream in             = new DataInputStream(new FileInputStream(filename));        if (in.readInt() != SMEAR_MAGIC) {            throw new IOException("Bad smear format for " + filename);        }        if (in.readInt() != unigrams.length) {            throw new IOException("Bad unigram length in " + filename);        }        bigramSmearMap = new HashMap();        unigramSmearTerm = new float[unigrams.length];            System.out.println("Reading " + unigrams.length);        for (int i = 0; i < unigrams.length; i++) {            unigramSmearTerm[i] = in.readFloat();        }        for (int i = 0; i < unigrams.length; i++) {            System.out.println("Processed " + i                     + " of " + loadedBigramBuffers.length);            int numBigrams = in.readInt();            BigramBuffer bigram = getBigramBuffer(i);            if (bigram.getNumberNGrams() != numBigrams) {                throw new IOException("Bad ngrams for unigram " + i                 + " Found " + numBigrams + " expected " +                     bigram.getNumberNGrams()                 );            }            for (int j = 0; j < numBigrams; j++) {                int k = bigram.getWordID(j);                putSmearTerm(i, k, in.readFloat());            }        }        in.close();    }    /**     * Puts the smear term for the two words     *     * @param word1 the first word     * @param word2 the second word     * @param smearTerm the smear term     */    private void putSmearTerm(int word1, int word2, float smearTerm) {        long bigramID = (((long) word1) << 32) | word2;        bigramSmearMap.put(new Long(bigramID), new Float(smearTerm));    }    /**     * Retrieves the smear term for the two words     *     * @param word1 the first word     * @param word2 the second word     *     * @return the smear term     */    private Float  getSmearTerm(int word1, int word2) {        long bigramID = (((long) word1) << 32) | word2;        return (Float) bigramSmearMap.get(new Long(bigramID));    }    /**     * Retrieves the bigram probability for the two given words     *     * @param word1 the first word of the bigram     * @param word2 the second word of the bigram     *     * @return the log probability     */    private float getBigramProb(int word1, int word2) {       BigramBuffer bigram = getBigramBuffer(word1);        BigramProbability bigramProbability  = bigram.findBigram(word2);        return bigramProbTable[bigramProbability.getProbabilityID()];    }}/** * An LRU cache */class LRUCache extends LinkedHashMap {    int maxSize;    /**     * Creates an LRU cache with the given maximum size     *      * @param maxSize     *                the maximum size of the cache     */    LRUCache(int maxSize) {        this.maxSize = maxSize;    }    /**     * Determines if the eldest entry in the map should be removed.     *      * @param eldest     *                the eldest entry     *      * @return true if the eldest entry should be removed     */    protected boolean removeEldestEntry(Map.Entry eldest) {        return size() > maxSize;    }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?