cachelm.cc

来自「这是一款很好用的工具包」· CC 代码 · 共 114 行

114 行

/*
 * CacheLM.cc --
 *	Unigram cache language model
 *
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1995, SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/CacheLM.cc,v 1.7 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <iostream>
using namespace std;
#include <stdlib.h>
#include <math.h>

#include "CacheLM.h"

/*
 * The template implementation files are included directly as source.
 * When INSTANTIATE_TEMPLATES is defined, the templates needed by this
 * file are instantiated explicitly here.  The Array instantiation is
 * currently disabled (commented out).
 */
#include "Array.cc"
#ifdef INSTANTIATE_TEMPLATES
// INSTANTIATE_ARRAY(VocabIndex);
#endif

/*
 * VocabIndex -> double table; this matches the
 * LHashIter<VocabIndex,double> used to walk wordCounts in flushCache().
 */
#include "LHash.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_LHASH(VocabIndex,double);
#endif

/*
 * Debug levels used
 *
 * DEBUG_CACHE_HITS: passed to debug() in wordProb(); when that test
 * succeeds while the model is running, the cache probability of each
 * queried word is written to dout() as "[cache=...]".
 */
#define DEBUG_CACHE_HITS	2

/*
 * Build a cache LM on top of the given vocabulary.  historyLength is
 * stored and also used to construct the word history; the remaining
 * state (history slots, history position, counts) is set up by
 * flushing the cache.
 */
CacheLM::CacheLM(Vocab &vocab, unsigned historyLength)
    : LM(vocab),
      historyLength(historyLength),
      wordHistory(0, historyLength),
      wordCounts(0)
{
    this->flushCache();
}

/*
 * Forget all that is in the cache
 */
void
CacheLM::flushCache()
{
    /*
     * Initialize word history.
     */
    for (unsigned i = 0; i < historyLength; i++) {
	wordHistory[i] = Vocab_None;
    }
    historyEnd = 0;
    totalCount = 0.0;

    /*
     * Reset word counts to zero
     */
    LHashIter<VocabIndex,double> wordIter(wordCounts);
    VocabIndex word;
    double *wordCount;

    while (wordCount = wordIter.next(word)) {
	*wordCount = 0.0;
    }
}

/*
 * Cache probability of `word': its count among the words currently
 * held in the history divided by the number of filled history slots,
 * returned as a log probability.  The `context' argument is not used.
 *
 * While the model is running (and historyLength is non-zero) the call
 * also records `word' in the history, replacing the word stored at
 * the current history position.  The probability is computed before
 * this update, so the queried word does not count towards itself.
 */
LogP
CacheLM::wordProb(VocabIndex word, const VocabIndex *context)
{
    /*
     * <unk> is only cached when the vocabulary treats it as a
     * regular word; otherwise it gets zero probability and leaves
     * the cache untouched.
     */
    if (word == vocab.unkIndex()) {
	if (!vocab.unkIsWord()) {
	    return LogP_Zero;
	}
    }

    /*
     * insert() hands back the count slot for `word' (presumably
     * creating it when the word has not been seen -- confirm in LHash).
     */
    double *count = wordCounts.insert(word);

    /*
     * Maximum likelihood estimate over the history; an empty
     * history (very first word) yields probability 0.
     */
    Prob cacheProb = 0.0;
    if (totalCount != 0.0) {
	cacheProb = *count / totalCount;
    }

    if (running() && debug(DEBUG_CACHE_HITS)) {
	dout() << "[cache=" << cacheProb << "]";
    }

    /*
     * Nothing to record unless we are running with a non-empty history.
     */
    if (!running() || historyLength == 0) {
	return ProbToLogP(cacheProb);
    }

    /*
     * The slot at historyEnd is about to be overwritten.  If it held
     * a word, that word loses one count and the total is unchanged;
     * if it was unused, the history grows by one.
     */
    VocabIndex evicted = wordHistory[historyEnd];
    if (evicted != Vocab_None) {
	double *evictedCount = wordCounts.find(evicted);
	assert(evictedCount != 0);

	*evictedCount -= 1.0;
    } else {
	totalCount ++;
    }

    wordHistory[historyEnd] = word;
    *count += 1.0;

    /*
     * Advance the write position, wrapping around at historyLength.
     */
    historyEnd = (historyEnd + 1) % historyLength;

    return ProbToLogP(cacheProb);
}

cachelm.cc - 源码说明

本页面展示了「这是一款很好用的工具包」中的 cachelm.cc 源码文件，采用 CC 编程语言编写，共 114 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与工具包相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?