largetrigrammodel.java
来自「It is the Speech recognition software. 」· Java 代码 · 共 1,208 行 · 第 1/3 页
JAVA
1,208 行
int numberTrigrams = getFirstTrigramEntry(nextBigram, firstBigramEntry) - firstTrigramEntry; int size = numberTrigrams * BYTES_PER_TRIGRAM; long position = (loader.getTrigramOffset() + (long) (firstTrigramEntry * BYTES_PER_TRIGRAM)); try { // System.out.println("Loading TrigramBuffer from disk"); byte[] buffer = loader.loadBuffer(position, size); trigramBuffer = new TrigramBuffer(buffer, numberTrigrams, loader.getBigEndian()); } catch (IOException ioe) { ioe.printStackTrace(); throw new Error("Error loading trigrams."); } } } return trigramBuffer; } /** * Returns the index of the first trigram entry of the given bigram * * @param bigram * the bigram which first trigram entry we're looking for * @param firstBigramEntry * the index of the first bigram entry of the bigram in * question * * @return the index of the first trigram entry of the given bigram */ private int getFirstTrigramEntry(BigramProbability bigram, int firstBigramEntry) { int firstTrigramEntry = trigramSegmentTable[(firstBigramEntry + bigram .getWhichFollower()) >> loader.getLogBigramSegmentSize()] + bigram.getFirstTrigramEntry(); return firstTrigramEntry; } /** * Returns the backoff probability for the give sequence of words * * @param wordSequence * the sequence of words * * @return the backoff probability in LogMath log base */ public float getBackoff(WordSequence wordSequence) { float logBackoff = 0.0f; // log of 1.0 UnigramProbability prob = null; //getProb(wordSequence); if (prob != null) { logBackoff = prob.getLogBackoff(); } return logBackoff; } /** * Returns the maximum depth of the language model * * @return the maximum depth of the language model */ public int getMaxDepth() { return maxDepth; } /** * Returns the set of words in the lanaguage model. The set is * unmodifiable. * * @return the unmodifiable set of words */ public Set getVocabulary() { Set vocabulary = new HashSet(); vocabulary.addAll(Arrays.asList(loader.getWords())); return Collections.unmodifiableSet(vocabulary); } /** * Returns the number of times when a bigram is queried, but there is no * bigram in the LM (in which case it uses the backoff probabilities). * * @return the number of bigram misses */ public int getBigramMisses() { return bigramMisses; } /** * Returns the number of times when a trigram is queried, but there is no * trigram in the LM (in which case it uses the backoff probabilities). * * @return the number of trigram misses */ public int getTrigramMisses() { return trigramMisses; } /** * Returns the number of trigram hits. * * @return the number of trigram hits */ public int getTrigramHits() { return trigramHit; } private void buildSmearInfo() throws IOException { int offset = 0; double S0 = 0; double R0 = 0; bigramSmearMap = new HashMap(); double[] ugNumerator = new double[unigrams.length]; double[] ugDenominator = new double[unigrams.length]; double[] ugAvgLogProb = new double[unigrams.length]; unigramSmearTerm = new float[unigrams.length]; for (int i = 0; i < unigrams.length; i++) { float logp = unigrams[i].getLogProbability(); double p = logMath.logToLinear(logp); S0 += p * logp; R0 += p * logp * logp; } System.out.println("R0 S0 " + R0 + " " + S0); for (int i = 0; i < loadedBigramBuffers.length; i++) { BigramBuffer bigram = getBigramBuffer(i); if (bigram == null) { unigramSmearTerm[i] = logMath.getLogOne(); continue; } ugNumerator[i] = 0.0; ugDenominator[i] = 0.0; ugAvgLogProb[i] = 0.0; float logugbackoff = unigrams[i].getLogBackoff(); double ugbackoff = logMath.logToLinear(logugbackoff); for (int j = 0; j < bigram.getNumberNGrams(); j++) { int wordID = bigram.getWordID(j); BigramProbability bgProb = bigram.getBigramProbability(j); float logugprob = unigrams[wordID].getLogProbability(); float logbgprob = bigramProbTable[bgProb.getProbabilityID()]; double ugprob = logMath.logToLinear(logugprob); double bgprob = logMath.logToLinear(logbgprob); double backoffbgprob = ugbackoff * ugprob; double logbackoffbgprob = logMath.linearToLog(backoffbgprob); ugNumerator[i] += (bgprob * logbgprob - backoffbgprob * logbackoffbgprob) * logugprob; ugDenominator[i] += (bgprob - backoffbgprob) * logugprob; if (false) { System.out.println("ubo " + ugprob + " " + bgprob + " " + backoffbgprob); System.out.println("logubo " + logugprob + " " + logbgprob + " " + logbackoffbgprob); System.out.println("n/d " + j + " " + ugNumerator[i] + " " + ugDenominator[i]); } if (false) { System.out.print( ugprob + " " + bgprob + " " + backoffbgprob); System.out.print(" " + logugprob + " " + logbgprob + " " + logbackoffbgprob); System.out.println(" " + ugNumerator[i] + " " + ugDenominator[i]); } } ugNumerator[i] += ugbackoff * (logugbackoff * S0 + R0); ugAvgLogProb[i] = ugDenominator[i] + ugbackoff * S0; ugDenominator[i] += ugbackoff * R0; if (false) { System.out.println("n/d " + ugNumerator[i] + " " + ugDenominator[i]); } unigramSmearTerm[i] = (float) (ugNumerator[i] / ugDenominator[i]); /// unigramSmearTerm[i] = // logMath.linearToLog(ugNumerator[i] / ugDenominator[i]); // System.out.println("ugs " + unigramSmearTerm[i]); } for (int i = 0; i < loadedBigramBuffers.length; i++) { System.out.println("Processed " + i + " of " + loadedBigramBuffers.length); BigramBuffer bigram = getBigramBuffer(i); if (bigram == null) { continue; } for (int j = 0; j < bigram.getNumberNGrams(); j++) { float smearTerm; BigramProbability bgProb = bigram.getBigramProbability(j); float logbgbackoff = trigramBackoffTable[bgProb.getBackoffID()]; double bgbackoff = logMath.logToLinear(logbgbackoff); int k = bigram.getWordID(j); TrigramBuffer trigram = loadTrigramBuffer(i, k); if (trigram == null) { smearTerm = unigramSmearTerm[k]; } else { double bg_numerator = 0; double bg_denominator = 0; for (int l = 0; l < trigram.getNumberNGrams(); l++) { int m = trigram.getWordID(l); float logtgprob = trigramProbTable[trigram.getProbabilityID(l)]; double tgprob = logMath.logToLinear(logtgprob); float logbgprob = getBigramProb(k, m); double bgprob = logMath.logToLinear(logbgprob); float logugprob = unigrams[m].getLogProbability(); double ugprob = logMath.logToLinear(logugprob); double backofftgprob = bgbackoff * bgprob; double logbackofftgprob = logMath.linearToLog(backofftgprob); bg_numerator += (tgprob * logtgprob - backofftgprob * logbackofftgprob) * logugprob; bg_denominator += (tgprob - backofftgprob) * logugprob * logugprob; } bg_numerator += bgbackoff * (logbgbackoff * ugAvgLogProb[k] - ugNumerator[k]); bg_denominator += bgbackoff * ugDenominator[k]; // bigram.ugsmear = bg_numerator / bg_denominator; smearTerm = (float) (bg_numerator / bg_denominator); smearTermCount++; } putSmearTerm(i, k, smearTerm); } } System.out.println("Smear count is " + smearTermCount); } /** * Writes the smear info to the given file * * @param filename the file to write the smear info to * * @throws IOException if an error occurs on write */ private void writeSmearInfo(String filename) throws IOException { DataOutputStream out = new DataOutputStream(new FileOutputStream(filename)); out.writeInt(SMEAR_MAGIC); System.out.println("writing " + unigrams.length); out.writeInt(unigrams.length); for (int i = 0; i < unigrams.length; i++) { out.writeFloat(unigramSmearTerm[i]); } for (int i = 0; i < unigrams.length; i++) { System.out.println("Writing " + i + " of " + unigrams.length); BigramBuffer bigram = getBigramBuffer(i); if (bigram == null) { out.writeInt(0); continue; } out.writeInt(bigram.getNumberNGrams()); for (int j = 0; j < bigram.getNumberNGrams(); j++) { int k = bigram.getWordID(j); Float smearTerm = getSmearTerm(i, k); out.writeInt(k); out.writeFloat(smearTerm.floatValue()); } } out.close(); } /** * Reads the smear info from the given file * * @param filename where to read the smear info from * @throws IOException if an inconsistent file is found or on any * general I/O error */ private void readSmearInfo(String filename) throws IOException { DataInputStream in = new DataInputStream(new FileInputStream(filename)); if (in.readInt() != SMEAR_MAGIC) { throw new IOException("Bad smear format for " + filename); } if (in.readInt() != unigrams.length) { throw new IOException("Bad unigram length in " + filename); } bigramSmearMap = new HashMap(); unigramSmearTerm = new float[unigrams.length]; System.out.println("Reading " + unigrams.length); for (int i = 0; i < unigrams.length; i++) { unigramSmearTerm[i] = in.readFloat(); } for (int i = 0; i < unigrams.length; i++) { System.out.println("Processed " + i + " of " + loadedBigramBuffers.length); int numBigrams = in.readInt(); BigramBuffer bigram = getBigramBuffer(i); if (bigram.getNumberNGrams() != numBigrams) { throw new IOException("Bad ngrams for unigram " + i + " Found " + numBigrams + " expected " + bigram.getNumberNGrams() ); } for (int j = 0; j < numBigrams; j++) { int k = bigram.getWordID(j); putSmearTerm(i, k, in.readFloat()); } } in.close(); } /** * Puts the smear term for the two words * * @param word1 the first word * @param word2 the second word * @param smearTerm the smear term */ private void putSmearTerm(int word1, int word2, float smearTerm) { long bigramID = (((long) word1) << 32) | word2; bigramSmearMap.put(new Long(bigramID), new Float(smearTerm)); } /** * Retrieves the smear term for the two words * * @param word1 the first word * @param word2 the second word * * @return the smear term */ private Float getSmearTerm(int word1, int word2) { long bigramID = (((long) word1) << 32) | word2; return (Float) bigramSmearMap.get(new Long(bigramID)); } /** * Retrieves the bigram probability for the two given words * * @param word1 the first word of the bigram * @param word2 the second word of the bigram * * @return the log probability */ private float getBigramProb(int word1, int word2) { BigramBuffer bigram = getBigramBuffer(word1); BigramProbability bigramProbability = bigram.findBigram(word2); return bigramProbTable[bigramProbability.getProbabilityID()]; }}/** * An LRU cache */class LRUCache extends LinkedHashMap { int maxSize; /** * Creates an LRU cache with the given maximum size * * @param maxSize * the maximum size of the cache */ LRUCache(int maxSize) { this.maxSize = maxSize; } /** * Determines if the eldest entry in the map should be removed. * * @param eldest * the eldest entry * * @return true if the eldest entry should be removed */ protected boolean removeEldestEntry(Map.Entry eldest) { return size() > maxSize; }}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?