📄 fngramlm.cc
字号:
/* * FNgramLM.cc -- * factored N-gram general graph backoff language models * * Jeff Bilmes <bilmes@ee.washington.edu> * but based on some code from NgramLM.cc and Ngram.h */#ifndef lintstatic char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";static char RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramLM.cc,v 1.14 2006/01/09 19:01:44 stolcke Exp $";#endif#ifndef EXCLUDE_CONTRIB#include <new>#include <iostream>using namespace std;#include <stdlib.h>#include <math.h>#include <string.h>#include "FNgram.h"#include "FactoredVocab.h"#include "FDiscount.h"#include "File.h"#include "Array.cc"#include "Trie.cc"#include "hexdec.h"#include "FNgramSpecs.cc"#ifdef INSTANTIATE_TEMPLATESINSTANTIATE_TRIE(VocabIndex,FNgram::BOnode);INSTANTIATE_ARRAY(FNgram::FNgramLM); INSTANTIATE_ARRAY(FNgram::ParentSubset); #ifdef USE_SARRAY#include "SArray.cc"INSTANTIATE_SARRAY(VocabIndex,FNgram::ProbEntry); #else#include "LHash.cc"INSTANTIATE_LHASH(VocabIndex,FNgram::ProbEntry); #endif#endif /* INSTANTIATE_TEMPLATES *//* * Debug levels used */#define DEBUG_ESTIMATE_WARNINGS 1#define DEBUG_FIXUP_WARNINGS 3#define DEBUG_PRINT_GTPARAMS 2#define DEBUG_READ_STATS 1#define DEBUG_WRITE_STATS 1#define DEBUG_NGRAM_HITS 2#define DEBUG_ESTIMATES 4#define DEBUG_ESTIMATE_LM 4#define DEBUG_BOWS 4#define DEBUG_EXTREME 20/* these are the same as in LM.cc */#define DEBUG_PRINT_SENT_PROBS 1#define DEBUG_PRINT_WORD_PROBS 2#define DEBUG_PRINT_PROB_SUMS 3const LogP LogP_PseudoZero = -99.0; /* non-inf value used for log 0 *//* * Low level methods to access context (BOW) nodes and probs */voidFNgram::memStats(MemStats &stats){ // TODO: finish this function.}FNgram::FNgram(FactoredVocab &vocab, FNgramSpecsType& _fngs) : LM(vocab), fngs(_fngs), skipOOVs(false), trustTotals(false), combineLMScores(true), virtualBeginSentence(true), virtualEndSentence(true), noScoreSentenceBoundaryMarks(false){ // we could pre-allocate the fngrams arrays here // but Array objects do not 
export alloc. fNgrams = new FNgramLM[fngs.fnSpecArray.size()]; fNgramsSize = fngs.fnSpecArray.size(); for (unsigned specNum=0;specNum<fngs.fnSpecArray.size();specNum++) { fNgrams[specNum].parentSubsets = new ParentSubset[fngs.fnSpecArray[specNum].numSubSets]; fNgrams[specNum].parentSubsetsSize = fngs.fnSpecArray[specNum].numSubSets; for (unsigned node=0;node<fngs.fnSpecArray[specNum].numSubSets;node++) { fNgrams[specNum].parentSubsets[node].active = (fngs.fnSpecArray[specNum].parentSubsets[node].counts != NULL); // make copy for convenient access fNgrams[specNum].parentSubsets[node].order = fngs.fnSpecArray[specNum].parentSubsets[node].order; } }}FNgram::~FNgram(){ clear(); for (unsigned specNum=0;specNum<fNgramsSize;specNum++) { delete [] fNgrams[specNum].parentSubsets; } delete [] fNgrams;}/* * Locate a BOW entry in the n-gram trie */LogP *FNgram::ParentSubset::findBOW(const VocabIndex *context){ BOnode *bonode = contexts.find(context); if (bonode) { return &(bonode->bow); } else { return 0; }}/* * Locate a prob entry in the n-gram trie. 
*/
LogP *
FNgram::ParentSubset::findProb(VocabIndex word, const VocabIndex *context)
{
  BOnode *bonode = contexts.find(context);
  if (bonode) {
    // NULL-safety fix: probs.find() returns NULL when the word has no
    // entry under this context, and the original dereferenced the result
    // unconditionally (undefined behavior).  Use the same guarded
    // pattern that findProbSubCtx already uses.
    ProbEntry *pe = bonode->probs.find(word);
    return pe ? &(pe->prob) : 0;
  } else {
    return 0;
  }
}

/*
 * Locate or create a BOW entry in the n-gram trie
 */
LogP *
FNgram::ParentSubset::insertBOW(const VocabIndex *context)
{
  Boolean found;
  BOnode *bonode = contexts.insert(context, found);

  if (!found) {
    /*
     * initialize the index in the BOnode
     */
    new (&bonode->probs) PROB_INDEX_T<VocabIndex,ProbEntry>(0);
  }
  return &(bonode->bow);
}

/*
 * Locate or create a prob entry in the n-gram trie
 */
LogP *
FNgram::ParentSubset::insertProb(VocabIndex word, const VocabIndex *context)
{
  Boolean found;
  BOnode *bonode = contexts.insert(context, found);
  // fprintf(stderr,"inserting word %d context starting with %d\n",word,*context);

  if (!found) {
    // fprintf(stderr,"not found\n");
    /*
     * initialize the index in the BOnode
     */
    new (&bonode->probs) PROB_INDEX_T<VocabIndex,ProbEntry>(0);
  }
  ProbEntry* res = bonode->probs.insert(word);
  // fprintf(stderr,"inserted into probs, got back 0x%X, size now %d\n",
  //	    res,bonode->probs.numEntries());
  // insert() may return NULL on failure; propagate that to the caller
  return res ? 
&(res->prob) : NULL; // return bonode->probs.insert(word);}/* * Locate or create a prob entry in the n-gram trie */LogP *FNgram::ParentSubset::insertProbAndCNT(VocabIndex word, const VocabIndex *context, const unsigned int cnt){ Boolean found; BOnode *bonode = contexts.insert(context, found); if (!found) { /* * initialize the index in the BOnode */ new (&bonode->probs) PROB_INDEX_T<VocabIndex,ProbEntry>(0); } ProbEntry* res = bonode->probs.insert(word); if (res) { res->cnt = cnt; return &(res->prob); } return NULL;}/* * Remove a BOW node (context) from the n-gram trie */voidFNgram::ParentSubset::removeBOW(const VocabIndex *context){ contexts.removeTrie(context);}/* * Remove a prob entry from the n-gram trie */voidFNgram::ParentSubset::removeProb(VocabIndex word, const VocabIndex *context){ BOnode *bonode = contexts.find(context); if (bonode) { bonode->probs.remove(word); }}FNgram::BOnode*FNgram::ParentSubset::findTrieNodeSubCtx(const VocabIndex *context, unsigned int bits){ // layout of arguments // variables: p1 p2 p3 p4 (i.e., parent number) // bits: b1 b2 b3 b4 // context[i]: 0 1 2 3 // From model p(c|p1,p2,p3,p4) // LM tries have p1 at the root tree level, then p2, p3, and p4 at // the tree leaf level. Therefore, this routine indexes lm tries // in ascending context array order. const int wlen = Vocab::length(context); assert (FNgramSpecsType::numBitsSet(bits) <= (unsigned)wlen); BOtrie* boTrie = &contexts; VocabIndex word[2]; word[1] = Vocab_None; for (int i=0; i<wlen && bits;i++) { if (bits & 0x1) { word[0] = context[i]; if ((boTrie = boTrie->findTrie(word)) == NULL) return NULL; } bits >>= 1; } return boTrie ? &(boTrie->value()) : NULL;}/* * Locate a prob entry in the n-gram trie. */LogP *FNgram::ParentSubset::findProbSubCtx(VocabIndex word1, const VocabIndex *context, unsigned int bits){ BOnode* bonode = findTrieNodeSubCtx(context,bits); if (!bonode) return NULL; ProbEntry* pe = bonode->probs.find(word1); return pe ? 
&(pe->prob) : NULL; }/* * Locate a bow in the n-gram trie. */LogP *FNgram::ParentSubset::findBOWSubCtx(const VocabIndex *context, unsigned int bits){ BOnode* bonode = findTrieNodeSubCtx(context,bits); return bonode ? &(bonode->bow) : NULL; }/* * Locate both prob entry and bow in the n-gram trie. */BooleanFNgram::ParentSubset::findBOWProbSubCtx(VocabIndex word1, const VocabIndex *context, LogP*& prob, LogP*& bow, unsigned int bits){ BOnode* bonode = findTrieNodeSubCtx(context,bits); if (bonode) { prob = &(bonode->probs.find(word1)->prob); bow = &(bonode->bow); return true; } else return false;}FNgram::BOnode*FNgram::ParentSubset::insertTrieNodeSubCtx(const VocabIndex *context, unsigned int bits, Boolean& foundP){ // same as findTrieNodeSubCtx except we do inserts rather than finds. const int wlen = Vocab::length(context); assert (FNgramSpecsType::numBitsSet(bits) <= (unsigned)wlen); BOtrie* boTrie = &contexts; VocabIndex word[2]; word[1] = Vocab_None; for (int i=0; i<wlen && bits;i++) { if (bits & 0x1) { word[0] = context[i]; if ((boTrie = boTrie->insertTrie(word,foundP)) == NULL) return NULL; } bits >>= 1; } foundP = true; return boTrie ? &(boTrie->value()) : NULL;}/* * Locate or create a BOW entry in the n-gram trie */LogP *FNgram::ParentSubset::insertBOWSubCtx(const VocabIndex *context, unsigned int bits){ Boolean found; BOnode *bonode = insertTrieNodeSubCtx(context, bits, found); if (!found) { /* * initialize the index in the BOnode */ new (&bonode->probs) PROB_INDEX_T<VocabIndex,ProbEntry>(0); } return &(bonode->bow);}/* * Locate or create a BOW entry in the n-gram trie */LogP *FNgram::ParentSubset::insertProbSubCtx(VocabIndex word, const VocabIndex *context, unsigned int bits){ Boolean found; BOnode *bonode = insertTrieNodeSubCtx(context, bits, found); if (!found) { /* * initialize the index in the BOnode */ new (&bonode->probs) PROB_INDEX_T<VocabIndex,ProbEntry>(0); } ProbEntry* pe = bonode->probs.insert(word); return pe ? 
&(pe->prob) : NULL;}voidFNgram::ParentSubset::removeBOWSubCtx(const VocabIndex *context, unsigned int bits){ // same as findTrieNodeSubCtx except that we pack context // into a local words array rather than do the trie search // explicitly. This is because this routine // it is probably not called very often and // doesn't need to be as fast as the others. const int wlen = Vocab::length(context); assert (FNgramSpecsType::numBitsSet(bits) <= (unsigned)wlen); VocabIndex words[maxNumParentsPerChild+2]; unsigned j=0; for (int i=0; i<wlen && bits;i++) { if (bits & 0x1) { words[j] = context[i]; } bits >>= 1; } words[j] = Vocab_None; removeBOW(words);}voidFNgram::ParentSubset::removeProbSubCtx(VocabIndex word1, const VocabIndex *context, unsigned int bits){ const int wlen = Vocab::length(context); assert (FNgramSpecsType::numBitsSet(bits) <= (unsigned)wlen); VocabIndex words[maxNumParentsPerChild+2]; unsigned j=0; for (int i=0; i<wlen && bits;i++) { if (bits & 0x1) { words[j] = context[i]; } bits >>= 1; } words[j] = Vocab_None; removeProb(word1,words);}/* * Remove all probabilities and contexts from n-gram trie */voidFNgram::clear(unsigned int specNum){ VocabIndex context[maxNumParentsPerChild+2]; BOnode *node; if (specNum >= fNgramsSize) return;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -