binaryloader.java
来自「It is the Speech recognition software. 」· Java 代码 · 共 696 行 · 第 1/2 页
JAVA
696 行
/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */package edu.cmu.sphinx.linguist.language.ngram.large;import java.io.BufferedInputStream;import java.io.DataInputStream;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.RandomAccessFile;import edu.cmu.sphinx.linguist.dictionary.Dictionary;import edu.cmu.sphinx.util.LogMath;import edu.cmu.sphinx.util.Utilities;/** * Reads a binary language model file generated by the CMU-Cambridge * Statistical Language Modelling Toolkit. * * Note that all probabilities in the grammar are stored in LogMath log base * format. Language probabilities in the language model file are stored in log * 10 base. They are converted to the LogMath logbase. 
*/
class BinaryLoader {

    private static final String DARPA_LM_HEADER = "Darpa Trigram LM";
    private static final int LOG2_BIGRAM_SEGMENT_SIZE_DEFAULT = 9;
    private static final float MIN_PROBABILITY = -99.0f;
    private static final int MAX_PROB_TABLE_SIZE = 65536;

    // Settings handed to the constructor.
    private LogMath logMath;
    private int maxNGram;
    private float unigramWeight;
    private float languageWeight;
    private double wip;
    private boolean bigEndian = true;
    private boolean applyLanguageWeightAndWip;
    private int bytesRead = 0;

    // Model contents and bookkeeping.
    private UnigramProbability[] unigrams;
    private String[] words;
    private int bigramOffset;
    private int trigramOffset;
    private int numberUnigrams;
    private int numberBigrams;
    private int numberTrigrams;
    private int logBigramSegmentSize;
    private int startWordID;
    private int endWordID;
    private int[] trigramSegmentTable;
    private float[] bigramProbTable;
    private float[] trigramBackoffTable;
    private float[] trigramProbTable;

    // Handle on the model file used for positioned reads.
    private RandomAccessFile file;

    /**
     * Creates a loader and immediately loads the model stored at the given
     * location.
     *
     * @param format the file format (not referenced by this constructor)
     * @param location the location of the model
     * @param applyLanguageWeightAndWip if true, the language weight and the
     *        word insertion probability are applied while loading
     * @param logMath the LogMath instance to use
     * @param languageWeight the language weight
     * @param wip the word insertion probability
     * @param unigramWeight the unigram weight
     * @throws IOException if an I/O error occurs
     */
    public BinaryLoader(String format, File location,
                        boolean applyLanguageWeightAndWip, LogMath logMath,
                        float languageWeight, double wip, float unigramWeight)
            throws IOException {
        // Store the configuration first; loading depends on all of it.
        this.logMath = logMath;
        this.applyLanguageWeightAndWip = applyLanguageWeightAndWip;
        this.languageWeight = languageWeight;
        this.unigramWeight = unigramWeight;
        this.wip = wip;

        // -1 marks "no such word id".
        this.startWordID = -1;
        this.endWordID = -1;

        loadBinary(location);
    }

    /**
     * Returns the number of unigrams.
     *
     * @return the number of unigrams
     */
    public int getNumberUnigrams() {
        return this.numberUnigrams;
    }

    /**
     * Returns the number of bigrams.
     *
     * @return the number of bigrams
     */
    public int getNumberBigrams() {
        return this.numberBigrams;
    }
/**
     * Returns the number of trigrams.
     *
     * @return the number of trigrams
     */
    public int getNumberTrigrams() {
        return this.numberTrigrams;
    }

    /**
     * Returns all the unigrams. The loader's own array is handed back, not a
     * copy.
     *
     * @return all the unigrams
     */
    public UnigramProbability[] getUnigrams() {
        return this.unigrams;
    }

    /**
     * Returns the bigram probability table. The loader's own array is handed
     * back, not a copy.
     *
     * @return all the bigram probabilities
     */
    public float[] getBigramProbabilities() {
        return this.bigramProbTable;
    }

    /**
     * Returns the trigram probability table. The loader's own array is handed
     * back, not a copy.
     *
     * @return all the trigram probabilities
     */
    public float[] getTrigramProbabilities() {
        return this.trigramProbTable;
    }

    /**
     * Returns the trigram backoff weight table. The loader's own array is
     * handed back, not a copy.
     *
     * @return all the trigram backoff weights
     */
    public float[] getTrigramBackoffWeights() {
        return this.trigramBackoffTable;
    }

    /**
     * Returns the trigram segment table. The loader's own array is handed
     * back, not a copy.
     *
     * @return the trigram segment table
     */
    public int[] getTrigramSegments() {
        return this.trigramSegmentTable;
    }

    /**
     * Returns the log of the bigram segment size.
     *
     * @return the log of the bigram segment size
     */
    public int getLogBigramSegmentSize() {
        return this.logBigramSegmentSize;
    }

    /**
     * Returns all the words. The loader's own array is handed back, not a
     * copy.
     *
     * @return all the words
     */
    public String[] getWords() {
        return this.words;
    }

    /**
     * Returns the location (or offset) into the file where bigrams start.
     *
     * @return the location of the bigrams
     */
    public int getBigramOffset() {
        return this.bigramOffset;
    }

    /**
     * Returns the location (or offset) into the file where trigrams start.
     *
     * @return the location of the trigrams
     */
    public int getTrigramOffset() {
        return this.trigramOffset;
    }

    /**
     * Returns the maximum depth of the language model.
     *
     * @return the maximum depth of the language model
     */
    public int getMaxDepth() {
        return this.maxNGram;
    }

    /**
     * Returns true if the loaded file is in big-endian.
* * @return true if the loaded file is big-endian */ public boolean getBigEndian() { return bigEndian; } /** * Loads the contents of the memory-mapped file starting at the given * position and for the given size, into a byte buffer. This method is * implemented because MappedByteBuffer.load() does not work properly. * * @param position * the starting position in the file * @param size * the number of bytes to load * * @return the loaded ByteBuffer */ public byte[] loadBuffer(long position, int size) throws IOException { // assert ((position + size) <= fileChannel.size()); file.seek(position); byte[] bytes = new byte[size]; if (file.read(bytes) != size) { throw new IOException("Incorrect number of bytes read."); } return bytes; } /** * Loads the language model from the given file. * * @param location * the file containing the language model */ private void loadBinary(File location) throws IOException { DataInputStream stream = new DataInputStream (new BufferedInputStream(new FileInputStream(location))); // read standard header string-size; set bigEndian flag readHeader(stream); // +1 is the sentinel unigram at the end unigrams = readUnigrams(stream, numberUnigrams + 1, bigEndian); skipBigramsTrigrams(stream); // read the bigram probabilities table if (numberBigrams > 0) { this.bigramProbTable = readFloatTable(stream, bigEndian); } // read the trigram backoff weight table and trigram prob table if (numberTrigrams > 0) { trigramBackoffTable = readFloatTable(stream, bigEndian); trigramProbTable = readFloatTable(stream, bigEndian); int bigramSegmentSize = 1 << logBigramSegmentSize; int trigramSegTableSize = ((numberBigrams + 1) / bigramSegmentSize) + 1; trigramSegmentTable = readIntTable(stream, bigEndian, trigramSegTableSize); } // read word string names int wordsStringLength = readInt(stream, bigEndian); if (wordsStringLength <= 0) { throw new Error("Bad word string size: " + wordsStringLength); } // read the string of all words this.words = readWords(stream, 
wordsStringLength, numberUnigrams); if (startWordID > -1) { UnigramProbability unigram = unigrams[startWordID]; unigram.setLogProbability(MIN_PROBABILITY); } if (endWordID > -1) { UnigramProbability unigram = unigrams[endWordID]; unigram.setLogBackoff(MIN_PROBABILITY); } applyUnigramWeight(); if (applyLanguageWeightAndWip) { applyLanguageWeight(bigramProbTable, languageWeight); applyWip(bigramProbTable, wip); applyLanguageWeight(trigramProbTable, languageWeight); applyWip(trigramProbTable, wip); applyLanguageWeight(trigramBackoffTable, languageWeight); } stream.close(); file = new RandomAccessFile(location, "r"); } /** * Reads the LM file header * * @param stream * the data stream of the LM file */ private void readHeader(DataInputStream stream) throws IOException { int headerLength = readInt(stream, bigEndian); if (headerLength != (DARPA_LM_HEADER.length() + 1)) { // not big-endian headerLength = Utilities.swapInteger(headerLength); if (headerLength == (DARPA_LM_HEADER.length() + 1)) { bigEndian = false; // System.out.println("Little-endian"); } else { throw new Error("Bad binary LM file magic number: " + headerLength + ", not an LM dumpfile?"); } } else { // System.out.println("Big-endian"); } // read and verify standard header string String header = readString(stream, headerLength - 1); readByte(stream); // read the '\0' if (!header.equals(DARPA_LM_HEADER)) { throw new Error("Bad binary LM file header: " + header); } // read LM filename string size and string
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?