/* ngramlm.cc */
/*
* NgramLM.cc --
* N-gram backoff language models
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/NgramLM.cc,v 1.92 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <new>
#include <iostream>
using namespace std;
#include <stdlib.h>
#include <math.h>
#include "Ngram.h"
#include "File.h"
#include "Array.cc"
#ifdef USE_SARRAY
#include "SArray.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_SARRAY(VocabIndex,LogP);
#endif
#else /* ! USE_SARRAY */
#include "LHash.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_LHASH(VocabIndex,LogP);
#endif
#endif /* USE_SARRAY */
#include "Trie.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_TRIE(VocabIndex,BOnode);
#endif
/*
* Debug levels used
*/
#define DEBUG_ESTIMATE_WARNINGS 1
#define DEBUG_FIXUP_WARNINGS 3
#define DEBUG_PRINT_GTPARAMS 2
#define DEBUG_READ_STATS 1
#define DEBUG_WRITE_STATS 1
#define DEBUG_NGRAM_HITS 2
#define DEBUG_ESTIMATES 4
/* these are the same as in LM.cc */
#define DEBUG_PRINT_SENT_PROBS 1
#define DEBUG_PRINT_WORD_PROBS 2
#define DEBUG_PRINT_PROB_SUMS 3
const LogP LogP_PseudoZero = -99.0;	/* finite stand-in for log(0) = -inf,
					 * the conventional value written to
					 * ARPA-format model files */
/*
* Low level methods to access context (BOW) nodes and probs
*/
/*
 * Account for the memory used by this model: the object itself, the
 * context trie, and every per-context probability table.
 */
void
Ngram::memStats(MemStats &stats)
{
    stats.total += sizeof(*this) - sizeof(contexts);
    contexts.memStats(stats);

    /*
     * The per-context probs tables are not covered by the trie's own
     * memStats() call above, so tally them explicitly here, for every
     * context of every order.
     */
    makeArray(VocabIndex, context, order + 1);

    for (unsigned lvl = 1; lvl <= order; lvl++) {
	NgramBOsIter iter(*this, context, lvl - 1);
	BOnode *node;

	while ((node = iter.next())) {
	    /* avoid double-counting the embedded probs object itself */
	    stats.total -= sizeof(node->probs);
	    node->probs.memStats(stats);
	}
    }
}
/*
 * Construct an N-gram model of the given order over the given vocabulary.
 * The order is clamped to a minimum of 1 (a unigram model).
 */
Ngram::Ngram(Vocab &vocab, unsigned neworder)
    : LM(vocab), order(neworder), _skipOOVs(false), _trustTotals(false)
{
    if (order == 0) {
	order = 1;	/* enforce minimum model order */
    }
}
/*
 * Destructor: release all contexts and their probability tables.
 */
Ngram::~Ngram()
{
    clear();
}
unsigned
Ngram::setorder(unsigned neworder)
{
unsigned oldorder = order;
if (neworder > 0) {
order = neworder;
}
return oldorder;
}
/*
* Locate a BOW entry in the n-gram trie
*/
/*
 * Look up the backoff weight stored for a context.
 * Returns a pointer to the BOW, or 0 if the context is not in the trie.
 */
LogP *
Ngram::findBOW(const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);
    return bonode ? &bonode->bow : 0;
}
/*
* Locate a prob entry in the n-gram trie
*/
/*
 * Look up the probability of a word in a given context.
 * Returns a pointer to the log prob, or 0 if either the context or the
 * word entry is absent.
 */
LogP *
Ngram::findProb(VocabIndex word, const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);
    return bonode ? bonode->probs.find(word) : 0;
}
/*
* Locate or create a BOW entry in the n-gram trie
*/
/*
 * Locate or create the BOW entry for a context.
 * A newly created node gets its probability table constructed in place.
 */
LogP *
Ngram::insertBOW(const VocabIndex *context)
{
    Boolean created;
    BOnode *bonode = contexts.insert(context, created);

    if (!created) {
	/*
	 * Fresh BOnode: placement-construct the word->prob index
	 * (the trie allocates raw storage without running this ctor).
	 */
	new (&bonode->probs) PROB_INDEX_T<VocabIndex,LogP>(0);
    }
    return &bonode->bow;
}
/*
* Locate or create a prob entry in the n-gram trie
*/
/*
 * Locate or create the probability entry for a word in a context.
 * Creates the context node (and its probability table) on demand.
 */
LogP *
Ngram::insertProb(VocabIndex word, const VocabIndex *context)
{
    Boolean created;
    BOnode *bonode = contexts.insert(context, created);

    if (!created) {
	/*
	 * Fresh BOnode: placement-construct the word->prob index
	 * (the trie allocates raw storage without running this ctor).
	 */
	new (&bonode->probs) PROB_INDEX_T<VocabIndex,LogP>(0);
    }
    return bonode->probs.insert(word);
}
/*
* Remove a BOW node (context) from the n-gram trie
*/
/*
 * Remove a BOW node (context) from the n-gram trie, together with the
 * sub-trie rooted at it.
 */
void
Ngram::removeBOW(const VocabIndex *context)
{
    contexts.removeTrie(context);
}
/*
* Remove a prob entry from the n-gram trie
*/
/*
 * Remove the probability entry for a word under a context.
 * A no-op if the context does not exist.
 */
void
Ngram::removeProb(VocabIndex word, const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);

    if (bonode != 0) {
	bonode->probs.remove(word);
    }
}
/*
* Remove all probabilities and contexts from n-gram trie
*/
/*
 * Remove all probabilities and contexts from the n-gram trie.
 *
 * Fixes: the inner "BOnode *node" shadowed the declaration at function
 * scope, and the assignment-in-condition loops now carry disambiguating
 * parentheses.
 */
void
Ngram::clear()
{
    makeArray(VocabIndex, context, order);
    BOnode *node;

    /*
     * Remove all ngram probabilities, one order at a time
     */
    for (unsigned i = order; i > 0; i--) {
	NgramBOsIter iter(*this, context, i - 1);

	while ((node = iter.next())) {
	    node->probs.clear(0);
	}
    }

    /*
     * Remove all unigram context tries
     * (since it won't work to delete at the root)
     */
    NgramBOsIter iter(*this, context, 1);
    while ((node = iter.next())) {
	removeBOW(context);
    }
}
/*
* Higher level methods, implemented low-level for efficiency
*/
/*
* Find the longest BO node that matches a prefix of context.
* If word != Vocab_None, return longest context defining prob for word.
* Returns its address as the ID.
*/
/*
 * Find the longest BO node that matches a prefix of context.
 * If word != Vocab_None, only descend while the next node still defines
 * a probability for word. Returns the node's address as the context ID
 * and the number of context words used in `length`.
 */
void *
Ngram::contextID(VocabIndex word, const VocabIndex *context, unsigned &length)
{
    BOtrie *node = &contexts;
    unsigned used = 0;

    while (used < order - 1 && context[used] != Vocab_None) {
	BOtrie *child = node->findTrie(context[used]);

	/* stop at a missing child ... */
	if (child == 0)
	    break;
	/* ... or where the longer context no longer defines prob(word) */
	if (word != Vocab_None && child->value().probs.find(word) == 0)
	    break;

	node = child;
	used++;
    }

    length = used;
    return (void *)node;
}
/*
* Compute backoff weight corresponding to truncating context at given length
*/
/*
 * Compute the total backoff weight incurred by truncating the context
 * at the given length: the sum of BOWs of all matched context nodes
 * deeper than `length`.
 */
LogP
Ngram::contextBOW(const VocabIndex *context, unsigned length)
{
    BOtrie *node = &contexts;
    LogP totalBOW = LogP_One;
    unsigned depth = 0;

    while (depth < order - 1 && context[depth] != Vocab_None) {
	BOtrie *child = node->findTrie(context[depth]);

	if (child == 0)
	    break;

	node = child;
	depth++;

	/* only levels beyond the truncation point contribute their BOW */
	if (depth > length) {
	    totalBOW += child->value().bow;
	}
    }
    return totalBOW;
}
/*
* This method implements the backoff algorithm in a way that descends into
* the context trie only once, finding the maximal ngram and accumulating
* backoff weights along the way.
*/
/*
 * Backoff word probability: descends the context trie once, tracking the
 * most specific ngram prob found and the backoff weights accumulated past
 * it, so that the final result is logp(longest match) + sum of BOWs of
 * the skipped (longer) contexts.
 */
LogP
Ngram::wordProbBO(VocabIndex word, const VocabIndex *context, unsigned int clen)
{
    LogP logp = LogP_Zero;	/* most specific prob found so far */
    LogP bow = LogP_One;	/* BOWs accumulated since that prob */
    unsigned found = 0;		/* order of the ngram supplying logp */

    BOtrie *trieNode = &contexts;
    int i = 0;

    do {
	LogP *prob = trieNode->value().probs.find(word);

	if (prob) {
	    /*
	     * If a probability is found at this level record it as the
	     * most specific one found so far and reset the backoff weight.
	     */
	    logp = *prob;
	    bow = LogP_One;
	    found = i + 1;
	}

	/* stop at the requested context length or the end of context */
	if (i >= (int)clen || context[i] == Vocab_None) break;

	BOtrie *next = trieNode->findTrie(context[i]);
	if (next) {
	    /*
	     * Accumulate backoff weights
	     */
	    bow += next->value().bow;
	    trieNode = next;
	    i ++;
	} else {
	    break;
	}
    } while (1);

    /* debug trace: report which ngram order was hit (0 => OOV) */
    if (running() && debug(DEBUG_NGRAM_HITS)) {
	if (found) {
	    dout() << "[" << found << "gram]";
	} else {
	    dout() << "[OOV]";
	}
    }

    return logp + bow;
}
/*
* Higher level methods (using the lower-level methods)
*/
/*
 * Word probability with backoff.
 *
 * Fixes: the old unk-context test could read context[1] one element past
 * the Vocab_None terminator when the context was empty (clen == 0); the
 * accesses are now guarded by clen. Parentheses added around the &&
 * terms to make the precedence explicit.
 */
LogP
Ngram::wordProb(VocabIndex word, const VocabIndex *context)
{
    unsigned int clen = Vocab::length(context);

    if (skipOOVs()) {
	/*
	 * Backward compatibility with the old broken perplexity code:
	 * return prob 0 if the word or any of the (up to two) context
	 * words used by the model is unknown.
	 */
	if (word == vocab.unkIndex() ||
	    (order > 1 && clen > 0 && context[0] == vocab.unkIndex()) ||
	    (order > 2 && clen > 1 && context[1] == vocab.unkIndex()))
	{
	    return LogP_Zero;
	}
    }

    /*
     * Perform the backoff algorithm for a context length that is
     * the minimum of what we were given and the maximal length of
     * the contexts stored in the LM
     */
    return wordProbBO(word, context, ((clen > order - 1) ? order - 1 : clen));
}
Boolean
Ngram::read(File &file, Boolean limitVocab)
{
char *line;
unsigned maxOrder = 0; /* maximal n-gram order in this model */
unsigned numNgrams[maxNgramOrder + 1];
/* the number of n-grams for each order */
unsigned numRead[maxNgramOrder + 1];
/* Number of n-grams actually read */
unsigned numOOVs = 0; /* Numer of n-gram skipped due to OOVs */
int state = -1 ; /* section of file being read:
* -1 - pre-header, 0 - header,
* 1 - unigrams, 2 - bigrams, ... */
Boolean warnedAboutUnk = false; /* at most one warning about <unk> */
for (int i = 0; i <= maxNgramOrder; i++) {
numNgrams[i] = 0;
numRead[i] = 0;
}
clear();
/*
* The ARPA format implicitly assumes a zero-gram backoff weight of 0.
* This has to be properly represented in the BOW trie so various
* recursive operations work correctly.
*/
VocabIndex nullContext[1];
nullContext[0] = Vocab_None;
*insertBOW(nullContext) = LogP_Zero;
while (line = file.getline()) {
Boolean backslash = (line[0] == '\\');
switch (state) {
case -1: /* looking for start of header */
if (backslash && strncmp(line, "\\data\\", 6) == 0) {
state = 0;
continue;
}
/*
* Everything before "\\data\\" is ignored
*/
continue;
case 0: /* ngram header */
unsigned thisOrder;
int nNgrams;
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
/*
* start reading n-grams
*/
if (state < 1 || state > (int)maxOrder) {
file.position() << "invalid ngram order " << state << "\n";
return false;
}
if (debug(DEBUG_READ_STATS)) {
dout() << (state <= (int)order ? "reading " : "skipping ")
<< numNgrams[state] << " "
<< state << "-grams\n";
}
continue;
} else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
/*
* scanned a line of the form
* ngram <N>=<howmany>
* now perform various sanity checks
*/
if (thisOrder <= 0 || thisOrder > maxNgramOrder) {
file.position() << "ngram order " << thisOrder
<< " out of range\n";
return false;
}
if (nNgrams < 0) {
file.position() << "ngram number " << nNgrams
<< " out of range\n";
return false;
}
if (thisOrder > maxOrder) {
maxOrder = thisOrder;
}
numNgrams[thisOrder] = nNgrams;
continue;
} else {
/*
 * [NOTE: source truncated here — the remainder of Ngram::read() and the
 * rest of the file are missing. The lines that followed were web-page UI
 * artifacts (keyboard-shortcut help text), not code.]
 */