largetrigrammodel.java
来自「It is the Speech recognition software. 」· Java 代码 · 共 1,208 行 · 第 1/3 页
JAVA
1,208 行
} } } // loadedBigramBuffers = new BigramBuffer[unigrams.length]; loadedTrigramBuffer = new HashMap(); logger.info("LM Cache: 3-g " + trigramCache.size() + " 2-g " + bigramCache.size()); if (clearCacheAfterUtterance) { trigramCache = new LRUCache(maxTrigramCacheSize); bigramCache = new LRUCache(maxBigramCacheSize); } } /** * Gets the ngram probability of the word sequence represented by the word * list * * @param wordSequence * the word sequence * * @return the probability of the word sequence. Probability is in logMath * log base * */ public float getProbability(WordSequence wordSequence) { if (logFile != null) { logFile.println(wordSequence.toText()); } int numberWords = wordSequence.size(); if (numberWords <= maxDepth) { if (numberWords == 3) { return getTrigramProbability(wordSequence); } else if (numberWords == 2) { return getBigramProbability(wordSequence); } else if (numberWords == 1) { return getUnigramProbability(wordSequence); } } throw new Error("Unsupported N-gram: " + wordSequence.size()); } /** * Returns the unigram probability of the given unigram. * * @param wordSequence * the unigram word sequence * * @return the unigram probability */ private float getUnigramProbability(WordSequence wordSequence) { Word unigram = wordSequence.getWord(0); UnigramProbability unigramProb = getUnigram(unigram); if (unigramProb == null) { throw new Error("Unigram not in LM: " + unigram); } return unigramProb.getLogProbability(); } /** * Returns its UnigramProbability if this language model has the given * unigram. * * @param unigram * the unigram to find * * @return the UnigramProbability, or null if this language model does not * have the unigram */ private UnigramProbability getUnigram(Word unigram) { return (UnigramProbability) unigramIDMap.get(unigram); } /** * Returns true if this language model has the given unigram. * * @param unigram * the unigram to find * * @return true if this LM has this unigram, false otherwise */ private boolean hasUnigram(Word unigram) { return (unigramIDMap.get(unigram) != null); } /** * Returns the ID of the given word. * * @param word * the word to find the ID * * @return the ID of the word */ public final int getWordID(Word word) { UnigramProbability probability = getUnigram(word); if (probability == null) { throw new IllegalArgumentException("No word ID: " + word); } else { return probability.getWordID(); } } /** * Gets the smear term for the given wordSequence * * @param wordSequence * the word sequence * @return the smear term associated with this word sequence */ public float getSmearOld(WordSequence wordSequence) { float smearTerm = 0.0f; if (fullSmear) { int length = wordSequence.size(); if (length > 0) { int wordID = getWordID(wordSequence.getWord(length - 1)); smearTerm = (float) unigramSmearTerm[wordID]; } } if (fullSmear && logger.isLoggable(Level.FINE)) { logger.fine("SmearTerm: " + smearTerm); } return smearTerm; } int smearCount; int smearBigramHit; public float getSmear(WordSequence wordSequence) { float smearTerm = 1.0f; if (fullSmear) { smearCount++; int length = wordSequence.size(); if (length == 1) { int wordID = getWordID(wordSequence.getWord(0)); smearTerm = (float) unigramSmearTerm[wordID]; } else if (length >= 2) { int size = wordSequence.size(); int wordID1 = getWordID(wordSequence.getWord(size - 2)); int wordID2 = getWordID(wordSequence.getWord(size - 1)); Float st = getSmearTerm(wordID1, wordID2); if (st == null) { smearTerm = (float) unigramSmearTerm[wordID2]; } else { smearTerm = st.floatValue(); smearBigramHit++; } } if (smearCount % 100000 == 0) { System.out.println("Smear hit: " + smearBigramHit + " tot: " + smearCount); } } if (fullSmear && logger.isLoggable(Level.FINE)) { logger.fine("SmearTerm: " + smearTerm); } return smearTerm; } /** * Returns the unigram probability of the given unigram. * * @param wordSequence * the unigram word sequence * * @return the unigram probability */ private float getBigramProbability(WordSequence wordSequence) { Word firstWord = wordSequence.getWord(0); if (loader.getNumberBigrams() <= 0 || !hasUnigram(firstWord)) { return getUnigramProbability(wordSequence.getNewest()); } BigramProbability bigramProbability = findBigram(wordSequence); if (bigramProbability != null) { return bigramProbTable[bigramProbability.getProbabilityID()]; } else { Word secondWord = wordSequence.getWord(1); if (getUnigram(secondWord) == null) { throw new Error("Bad word2: " + secondWord); } // System.out.println("Didn't find bigram"); int firstWordID = getWordID(firstWord); int secondWordID = getWordID(secondWord); bigramMisses++; return (unigrams[firstWordID].getLogBackoff() + unigrams[secondWordID] .getLogProbability()); } } /** * Finds the BigramProbability for a particular bigram * * @param ws * the word sequence * * @return the BigramProbability of the bigram, or null if the given first * word has no bigrams */ private BigramProbability findBigram(WordSequence ws) { BigramProbability bigramProbability = (BigramProbability) bigramCache .get(ws); if (bigramProbability == null) { int firstWordID = getWordID(ws.getWord(0)); int secondWordID = getWordID(ws.getWord(1)); BigramBuffer bigrams = getBigramBuffer(firstWordID); if (bigrams != null) { bigrams.setUsed(true); bigramProbability = bigrams.findBigram(secondWordID); if (bigramProbability != null) { bigramCache.put(ws, bigramProbability); } } } return bigramProbability; } /** * Returns the bigrams of the given word * * @param firstWordID * the ID of the word * * @return the bigrams of the word */ private BigramBuffer getBigramBuffer(int firstWordID) { BigramBuffer bigramBuffer = loadedBigramBuffers[firstWordID]; if (bigramBuffer == null) { int numberBigrams = getNumberBigramFollowers(firstWordID); if (numberBigrams > 0) { bigramBuffer = loadBigramBuffer(firstWordID, numberBigrams); if (bigramBuffer != null) { loadedBigramBuffers[firstWordID] = bigramBuffer; } } } return bigramBuffer; } /** * Loads the bigram followers of the given first word in a bigram from disk * to memory. It actually loads (numberFollowers + 1) bigrams, since we * need the first bigram of the next word to determine the number of * trigrams of the last bigram. * * @param firstWordID * ID of the first word * @param numberFollowers * the number of bigram followers this word has * * @return the bigram followers of the given word */ private BigramBuffer loadBigramBuffer(int firstWordID, int numberFollowers) { BigramBuffer followers = null; int firstBigramEntry = unigrams[firstWordID].getFirstBigramEntry(); int size = (numberFollowers + 1) * BYTES_PER_BIGRAM; long position = (long) (loader.getBigramOffset() + (firstBigramEntry * BYTES_PER_BIGRAM)); try { byte[] buffer = loader.loadBuffer(position, size); followers = new BigramBuffer(buffer, numberFollowers + 1, loader .getBigEndian()); } catch (IOException ioe) { ioe.printStackTrace(); throw new Error("Error loading bigram followers"); } return followers; } /** * Returns the number of bigram followers of a word. * * @param wordID * the ID of the word * * @return the number of bigram followers */ private int getNumberBigramFollowers(int wordID) { if (wordID == unigrams.length - 1) { return 0; } else { return unigrams[wordID + 1].getFirstBigramEntry() - unigrams[wordID].getFirstBigramEntry(); } } /** * Returns the language probability of the given trigram. * * @param wordSequence * the trigram word sequence * * @return the trigram probability */ private float getTrigramProbability(WordSequence wordSequence) { Word firstWord = wordSequence.getWord(0); if (loader.getNumberTrigrams() == 0 || !hasUnigram(firstWord)) { return getBigramProbability(wordSequence.getNewest()); } Float probability = (Float) trigramCache.get(wordSequence); if (probability == null) { float score = 0.0f; int trigramProbID = findTrigram(wordSequence); if (trigramProbID != -1) { trigramHit++; score = trigramProbTable[trigramProbID]; } else { trigramMisses++; BigramProbability bigram = findBigram(wordSequence.getOldest()); if (bigram != null) { score = trigramBackoffTable[bigram.getBackoffID()] + getBigramProbability(wordSequence.getNewest()); } else { score = getBigramProbability(wordSequence.getNewest()); } } probability = new Float(score); trigramCache.put(wordSequence, probability); } return probability.floatValue(); } /** * Finds or loads the trigram probability of the given trigram. * * @param wordSequence * the trigram to load * * @return a TrigramProbability of the given trigram */ private int findTrigram(WordSequence wordSequence) { int trigram = -1; WordSequence oldest = wordSequence.getOldest(); TrigramBuffer trigramBuffer = (TrigramBuffer) loadedTrigramBuffer .get(oldest); if (trigramBuffer == null) { int firstWordID = getWordID(wordSequence.getWord(0)); int secondWordID = getWordID(wordSequence.getWord(1)); trigramBuffer = loadTrigramBuffer(firstWordID, secondWordID); if (trigramBuffer != null) { loadedTrigramBuffer.put(oldest, trigramBuffer); } } if (trigramBuffer != null) { int thirdWordID = getWordID(wordSequence.getWord(2)); trigram = trigramBuffer.findProbabilityID(thirdWordID); } return trigram; } /** * Loads into a buffer all the trigram followers of the given bigram. * * @param firstWordID * the ID of the first word * @param secondWordID * the ID of the second word * * @return a TrigramBuffer of all the trigram followers of the given two * words */ private TrigramBuffer loadTrigramBuffer(int firstWordID, int secondWordID) { TrigramBuffer trigramBuffer = null; BigramBuffer bigramBuffer = getBigramBuffer(firstWordID); if (bigramBuffer != null) { BigramProbability bigram = bigramBuffer.findBigram(secondWordID); if (bigram != null) { BigramProbability nextBigram = bigramBuffer .getBigramProbability(bigram.getWhichFollower() + 1); int firstBigramEntry = unigrams[firstWordID] .getFirstBigramEntry(); int firstTrigramEntry = getFirstTrigramEntry(bigram, firstBigramEntry);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?