/* ngramlm.cc */
/*
* NgramLM.cc --
* N-gram backoff language models
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/NgramLM.cc,v 1.92 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <new>
#include <iostream>
using namespace std;
#include <stdlib.h>
#include <math.h>
#include "Ngram.h"
#include "File.h"
#include "Array.cc"
#ifdef USE_SARRAY
#include "SArray.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_SARRAY(VocabIndex,LogP);
#endif
#else /* ! USE_SARRAY */
#include "LHash.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_LHASH(VocabIndex,LogP);
#endif
#endif /* USE_SARRAY */
#include "Trie.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_TRIE(VocabIndex,BOnode);
#endif
/*
* Debug levels used
*/
#define DEBUG_ESTIMATE_WARNINGS 1
#define DEBUG_FIXUP_WARNINGS 3
#define DEBUG_PRINT_GTPARAMS 2
#define DEBUG_READ_STATS 1
#define DEBUG_WRITE_STATS 1
#define DEBUG_NGRAM_HITS 2
#define DEBUG_ESTIMATES 4
/* these are the same as in LM.cc */
#define DEBUG_PRINT_SENT_PROBS 1
#define DEBUG_PRINT_WORD_PROBS 2
#define DEBUG_PRINT_PROB_SUMS 3
const LogP LogP_PseudoZero = -99.0;	/* finite stand-in for log(0) = -inf,
					 * the conventional value written to
					 * ARPA-format model files */
/*
* Low level methods to access context (BOW) nodes and probs
*/
/*
 * Account for the memory used by this model: the object itself, the
 * context trie, and every per-context probability table.
 */
void
Ngram::memStats(MemStats &stats)
{
    stats.total += sizeof(*this) - sizeof(contexts);
    contexts.memStats(stats);

    /*
     * The per-context probs tables are not covered by the trie's own
     * memStats() call above, so tally them explicitly here, for every
     * context of every order.
     */
    makeArray(VocabIndex, context, order + 1);

    for (unsigned lvl = 1; lvl <= order; lvl++) {
	NgramBOsIter iter(*this, context, lvl - 1);
	BOnode *node;

	while ((node = iter.next())) {
	    /* avoid double-counting the embedded probs object itself */
	    stats.total -= sizeof(node->probs);
	    node->probs.memStats(stats);
	}
    }
}
/*
 * Construct an N-gram model of the given order over the given vocabulary.
 * The order is clamped to a minimum of 1 (a unigram model).
 */
Ngram::Ngram(Vocab &vocab, unsigned neworder)
    : LM(vocab), order(neworder), _skipOOVs(false), _trustTotals(false)
{
    if (order == 0) {
	order = 1;	/* enforce minimum model order */
    }
}
/*
 * Destructor: release all contexts and their probability tables.
 */
Ngram::~Ngram()
{
    clear();
}
unsigned
Ngram::setorder(unsigned neworder)
{
unsigned oldorder = order;
if (neworder > 0) {
order = neworder;
}
return oldorder;
}
/*
* Locate a BOW entry in the n-gram trie
*/
/*
 * Look up the backoff weight stored for a context.
 * Returns a pointer to the BOW, or 0 if the context is not in the trie.
 */
LogP *
Ngram::findBOW(const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);
    return bonode ? &bonode->bow : 0;
}
/*
* Locate a prob entry in the n-gram trie
*/
/*
 * Look up the probability of a word in a given context.
 * Returns a pointer to the log prob, or 0 if either the context or the
 * word entry is absent.
 */
LogP *
Ngram::findProb(VocabIndex word, const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);
    return bonode ? bonode->probs.find(word) : 0;
}
/*
* Locate or create a BOW entry in the n-gram trie
*/
/*
 * Locate or create the BOW entry for a context.
 * A newly created node gets its probability table constructed in place.
 */
LogP *
Ngram::insertBOW(const VocabIndex *context)
{
    Boolean created;
    BOnode *bonode = contexts.insert(context, created);

    if (!created) {
	/*
	 * Fresh BOnode: placement-construct the word->prob index
	 * (the trie allocates raw storage without running this ctor).
	 */
	new (&bonode->probs) PROB_INDEX_T<VocabIndex,LogP>(0);
    }
    return &bonode->bow;
}
/*
* Locate or create a prob entry in the n-gram trie
*/
/*
 * Locate or create the probability entry for a word in a context.
 * Creates the context node (and its probability table) on demand.
 */
LogP *
Ngram::insertProb(VocabIndex word, const VocabIndex *context)
{
    Boolean created;
    BOnode *bonode = contexts.insert(context, created);

    if (!created) {
	/*
	 * Fresh BOnode: placement-construct the word->prob index
	 * (the trie allocates raw storage without running this ctor).
	 */
	new (&bonode->probs) PROB_INDEX_T<VocabIndex,LogP>(0);
    }
    return bonode->probs.insert(word);
}
/*
* Remove a BOW node (context) from the n-gram trie
*/
/*
 * Remove a BOW node (context) from the n-gram trie, together with the
 * sub-trie rooted at it.
 */
void
Ngram::removeBOW(const VocabIndex *context)
{
    contexts.removeTrie(context);
}
/*
* Remove a prob entry from the n-gram trie
*/
/*
 * Remove the probability entry for a word under a context.
 * A no-op if the context does not exist.
 */
void
Ngram::removeProb(VocabIndex word, const VocabIndex *context)
{
    BOnode *bonode = contexts.find(context);

    if (bonode != 0) {
	bonode->probs.remove(word);
    }
}
/*
* Remove all probabilities and contexts from n-gram trie
*/
/*
 * Remove all probabilities and contexts from the n-gram trie.
 *
 * Fixes: the inner "BOnode *node" shadowed the declaration at function
 * scope, and the assignment-in-condition loops now carry disambiguating
 * parentheses.
 */
void
Ngram::clear()
{
    makeArray(VocabIndex, context, order);
    BOnode *node;

    /*
     * Remove all ngram probabilities, one order at a time
     */
    for (unsigned i = order; i > 0; i--) {
	NgramBOsIter iter(*this, context, i - 1);

	while ((node = iter.next())) {
	    node->probs.clear(0);
	}
    }

    /*
     * Remove all unigram context tries
     * (since it won't work to delete at the root)
     */
    NgramBOsIter iter(*this, context, 1);
    while ((node = iter.next())) {
	removeBOW(context);
    }
}
/*
* Higher level methods, implemented low-level for efficiency
*/
/*
* Find the longest BO node that matches a prefix of context.
* If word != Vocab_None, return longest context defining prob for word.
* Returns its address as the ID.
*/
/*
 * Find the longest BO node that matches a prefix of context.
 * If word != Vocab_None, only descend while the next node still defines
 * a probability for word. Returns the node's address as the context ID
 * and the number of context words used in `length`.
 */
void *
Ngram::contextID(VocabIndex word, const VocabIndex *context, unsigned &length)
{
    BOtrie *node = &contexts;
    unsigned used = 0;

    while (used < order - 1 && context[used] != Vocab_None) {
	BOtrie *child = node->findTrie(context[used]);

	/* stop at a missing child ... */
	if (child == 0)
	    break;
	/* ... or where the longer context no longer defines prob(word) */
	if (word != Vocab_None && child->value().probs.find(word) == 0)
	    break;

	node = child;
	used++;
    }

    length = used;
    return (void *)node;
}
/*
* Compute backoff weight corresponding to truncating context at given length
*/
/*
 * Compute the total backoff weight incurred by truncating the context
 * at the given length: the sum of BOWs of all matched context nodes
 * deeper than `length`.
 */
LogP
Ngram::contextBOW(const VocabIndex *context, unsigned length)
{
    BOtrie *node = &contexts;
    LogP totalBOW = LogP_One;
    unsigned depth = 0;

    while (depth < order - 1 && context[depth] != Vocab_None) {
	BOtrie *child = node->findTrie(context[depth]);

	if (child == 0)
	    break;

	node = child;
	depth++;

	/* only levels beyond the truncation point contribute their BOW */
	if (depth > length) {
	    totalBOW += child->value().bow;
	}
    }
    return totalBOW;
}
/*
* This method implements the backoff algorithm in a way that descends into
* the context trie only once, finding the maximal ngram and accumulating
* backoff weights along the way.
*/
/*
 * Backoff word probability: descends the context trie once, tracking the
 * most specific ngram prob found and the backoff weights accumulated past
 * it, so that the final result is logp(longest match) + sum of BOWs of
 * the skipped (longer) contexts.
 */
LogP
Ngram::wordProbBO(VocabIndex word, const VocabIndex *context, unsigned int clen)
{
    LogP logp = LogP_Zero;	/* most specific prob found so far */
    LogP bow = LogP_One;	/* BOWs accumulated since that prob */
    unsigned found = 0;		/* order of the ngram supplying logp */

    BOtrie *trieNode = &contexts;
    int i = 0;

    do {
	LogP *prob = trieNode->value().probs.find(word);

	if (prob) {
	    /*
	     * If a probability is found at this level record it as the
	     * most specific one found so far and reset the backoff weight.
	     */
	    logp = *prob;
	    bow = LogP_One;
	    found = i + 1;
	}

	/* stop at the requested context length or the end of context */
	if (i >= (int)clen || context[i] == Vocab_None) break;

	BOtrie *next = trieNode->findTrie(context[i]);
	if (next) {
	    /*
	     * Accumulate backoff weights
	     */
	    bow += next->value().bow;
	    trieNode = next;
	    i ++;
	} else {
	    break;
	}
    } while (1);

    /* debug trace: report which ngram order was hit (0 => OOV) */
    if (running() && debug(DEBUG_NGRAM_HITS)) {
	if (found) {
	    dout() << "[" << found << "gram]";
	} else {
	    dout() << "[OOV]";
	}
    }

    return logp + bow;
}
/*
* Higher level methods (using the lower-level methods)
*/
/*
 * Word probability with backoff.
 *
 * Fixes: the old unk-context test could read context[1] one element past
 * the Vocab_None terminator when the context was empty (clen == 0); the
 * accesses are now guarded by clen. Parentheses added around the &&
 * terms to make the precedence explicit.
 */
LogP
Ngram::wordProb(VocabIndex word, const VocabIndex *context)
{
    unsigned int clen = Vocab::length(context);

    if (skipOOVs()) {
	/*
	 * Backward compatibility with the old broken perplexity code:
	 * return prob 0 if the word or any of the (up to two) context
	 * words used by the model is unknown.
	 */
	if (word == vocab.unkIndex() ||
	    (order > 1 && clen > 0 && context[0] == vocab.unkIndex()) ||
	    (order > 2 && clen > 1 && context[1] == vocab.unkIndex()))
	{
	    return LogP_Zero;
	}
    }

    /*
     * Perform the backoff algorithm for a context length that is
     * the minimum of what we were given and the maximal length of
     * the contexts stored in the LM
     */
    return wordProbBO(word, context, ((clen > order - 1) ? order - 1 : clen));
}
Boolean
Ngram::read(File &file, Boolean limitVocab)
{
char *line;
unsigned maxOrder = 0; /* maximal n-gram order in this model */
unsigned numNgrams[maxNgramOrder + 1];
/* the number of n-grams for each order */
unsigned numRead[maxNgramOrder + 1];
/* Number of n-grams actually read */
unsigned numOOVs = 0; /* Numer of n-gram skipped due to OOVs */
int state = -1 ; /* section of file being read:
* -1 - pre-header, 0 - header,
* 1 - unigrams, 2 - bigrams, ... */
Boolean warnedAboutUnk = false; /* at most one warning about <unk> */
for (int i = 0; i <= maxNgramOrder; i++) {
numNgrams[i] = 0;
numRead[i] = 0;
}
clear();
/*
* The ARPA format implicitly assumes a zero-gram backoff weight of 0.
* This has to be properly represented in the BOW trie so various
* recursive operations work correctly.
*/
VocabIndex nullContext[1];
nullContext[0] = Vocab_None;
*insertBOW(nullContext) = LogP_Zero;
while (line = file.getline()) {
Boolean backslash = (line[0] == '\\');
switch (state) {
case -1: /* looking for start of header */
if (backslash && strncmp(line, "\\data\\", 6) == 0) {
state = 0;
continue;
}
/*
* Everything before "\\data\\" is ignored
*/
continue;
case 0: /* ngram header */
unsigned thisOrder;
int nNgrams;
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
/*
* start reading n-grams
*/
if (state < 1 || state > (int)maxOrder) {
file.position() << "invalid ngram order " << state << "\n";
return false;
}
if (debug(DEBUG_READ_STATS)) {
dout() << (state <= (int)order ? "reading " : "skipping ")
<< numNgrams[state] << " "
<< state << "-grams\n";
}
continue;
} else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
/*
* scanned a line of the form
* ngram <N>=<howmany>
* now perform various sanity checks
*/
if (thisOrder <= 0 || thisOrder > maxNgramOrder) {
file.position() << "ngram order " << thisOrder
<< " out of range\n";
return false;
}
if (nNgrams < 0) {
file.position() << "ngram number " << nNgrams
<< " out of range\n";
return false;
}
if (thisOrder > maxOrder) {
maxOrder = thisOrder;
}
numNgrams[thisOrder] = nNgrams;
continue;
} else {
/*
 * [NOTE: source truncated here — the remainder of Ngram::read() and the
 * rest of the file are missing. The lines that followed were web-page UI
 * artifacts (keyboard-shortcut help text), not code.]
 */