📄 stopngramstats.cc
字号:
/*
* StopNgramStats.cc --
* N-gram statistics with contexts excluding stop words
*
*/
/*
 * Copyright notice and RCS header strings for this file; both are
 * compiled out when `lint` is defined.
 * NOTE(review): the identifiers use a TaggedNgramStats_ prefix although the
 * RCS header names StopNgramStats.cc -- presumably a copy-paste leftover;
 * they are file-static, so the mismatch looks harmless. Confirm before
 * renaming.
 */
#ifndef lint
static char TaggedNgramStats_Copyright[] = "Copyright (c) 1996-2006 SRI International. All Rights Reserved.";
static char TaggedNgramStats_RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/StopNgramStats.cc,v 1.6 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <string.h>
#include "StopNgramStats.h"
#include "Array.cc"
/*
 * Constructor.
 *   vocab      passed through to the NgramStats base class
 *   stopWords  bound to the stopWords member; consulted by countSentence()
 *              to decide which words may enter an n-gram context
 *   maxOrder   passed through to the NgramStats base class
 */
StopNgramStats::StopNgramStats(Vocab &vocab,
                               SubVocab &stopWords,
                               unsigned maxOrder)
    : NgramStats(vocab, maxOrder),
      stopWords(stopWords)
{
    /* nothing else to set up */
}
void
StopNgramStats::incrementCounts(const VocabIndex *words, NgramCount factor)
{
while (*words != Vocab_None) {
counts.insertTrie(words ++)->value() += factor;
}
}
unsigned
StopNgramStats::countSentence(const VocabIndex *words, NgramCount factor)
{
unsigned sentLength = Vocab::length(words);
makeArray(VocabIndex, countWords, sentLength + 1);
unsigned countPos = 0;
for (unsigned nextPos = 0; nextPos < sentLength; nextPos++) {
/*
* Count an ngram that has the current word as the last item,
* and is preceded the non-stop words found so far.
*/
countWords[countPos] = words[nextPos];
countWords[countPos + 1] = Vocab_None;
if (countPos + 1 >= order) {
incrementCounts(&countWords[countPos + 1 - order], factor);
} else {
incrementCounts(countWords, factor);
}
/*
* Check if the next word is a non-stop one, and if so
* include it in the context for the following ngrams
*/
if (!stopWords.getWord(words[nextPos])) {
countWords[countPos ++] = words[nextPos];
}
}
/*
* keep track of word and sentence counts
*/
stats.numWords += sentLength;
if (words[0] == vocab.ssIndex()) {
stats.numWords --;
}
if (sentLength > 0 && words[sentLength-1] == vocab.seIndex()) {
stats.numWords --;
}
stats.numSentences ++;
return sentLength;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -