/*
 * FDiscount.cc --
 *	Discounting methods for factored LMs
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FDiscount.cc,v 1.9 2006/01/09 18:33:31 stolcke Exp $";
#endif

#ifndef EXCLUDE_CONTRIB

#include <iostream>
using namespace std;

#include <stdlib.h>

#include "hexdec.h"
#include "FDiscount.h"
#include "Array.cc"
#include "FNgramSpecs.cc"

/*
 * Debug levels used here
 */
#define DEBUG_ESTIMATE_DISCOUNT			1
#define DEBUG_PRINT_CONTEXTS_WO_COUNTS		8
#define DEBUG_PRINT_WARNING_CONTEXTS_WO_COUNTS	0

// defined in FNgramLM.cc
// TODO: place all bit routines in a separate file.
extern unsigned int bitGather(unsigned int mask, unsigned int bitv);

/*
 * For factored models, same as GoodTuring::estimate(), except for the
 * different interface.
 *
 * Estimation of discount coefficients from ngram count-of-counts.
 *
 * The Good-Turing formula for this is
 *
 *	d(c) = (c+1)/c * n_(c+1)/n_c
 */
Boolean
FGoodTuring::estimate(FNgramSpecs<FNgramCount>::FNgramSpec &spec,
		      unsigned int node, FactoredVocab &vocab)
{
    Array<Count> countOfCounts;

    FNgramSpecs<FNgramCount>::FNgramSpec::ParentSubset &counts =
						spec.parentSubsets[node];

    /*
     * First tabulate count-of-counts for the given order of ngrams.
     * Note we need the GT count for up to maxCount + 1 inclusive, to
     * apply the GT formula for counts up to maxCount.
     */
    makeArray(VocabIndex, wids, counts.order + 1);

    FNgramSpecs<FNgramCount>::FNgramSpec::PSIter iter(counts, wids);
    NgramCount *count;
    Count i;

    for (i = 0; i <= maxCount + 1; i++) {
	countOfCounts[i] = 0;
    }

    while ((count = iter.next())) {
	if (vocab.isNonEvent(wids[counts.order - 1])) {
	    continue;
	} else if (vocab.isMetaTag(wids[counts.order - 1])) {
	    unsigned type = vocab.typeOfMetaTag(wids[counts.order - 1]);

	    /*
	     * process count-of-count
	     */
	    if (type > 0 && type <= maxCount + 1) {
		countOfCounts[type] += *count;
	    }
	} else if (*count <= maxCount + 1) {
	    countOfCounts[*count]++;
	}
    }

    if (FDiscount::debug(DEBUG_ESTIMATE_DISCOUNT)) {
	FDiscount::dout() << "Good-Turing discounting " << counts.order
			  << "-grams\n";
	for (i = 0; i <= maxCount + 1; i++) {
	    FDiscount::dout() << "GT-count [" << i << "] = "
			      << countOfCounts[i] << endl;
	}
    }

    if (countOfCounts[1] == 0) {
	cerr << "warning: no singleton counts\n";
	maxCount = 0;
    }

    while (maxCount > 0 && countOfCounts[maxCount + 1] == 0) {
	cerr << "warning: count of count " << maxCount + 1
	     << " is zero -- lowering maxcount\n";
	maxCount--;
    }

    if (maxCount <= 0) {
	cerr << "GT discounting disabled\n";
    } else {
	double commonTerm = (maxCount + 1)
			    * (double)countOfCounts[maxCount + 1]
			    / (double)countOfCounts[1];

	for (i = 1; i <= maxCount; i++) {
	    double coeff;

	    if (countOfCounts[i] == 0) {
		cerr << "warning: count of count " << i << " is zero\n";
		coeff = 1.0;
	    } else {
		double coeff0 = (i + 1) * (double)countOfCounts[i + 1]
				/ (i * (double)countOfCounts[i]);
		coeff = (coeff0 - commonTerm) / (1.0 - commonTerm);

		if (coeff <= Prob_Epsilon || coeff0 > 1.0) {
		    cerr << "warning: discount coeff " << i
			 << " is out of range: " << coeff << "\n";
		    coeff = 1.0;
		}
	    }
	    discountCoeffs[i] = coeff;
	}
    }

    return true;
}
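/*
 * Illustrative sketch (not part of the original SRILM source): the
 * Katz/Good-Turing coefficient computation above, isolated from the
 * factored-LM iteration machinery.  The names gtCoeffsSketch and nOfN
 * are hypothetical, and EPSILON stands in for SRILM's Prob_Epsilon.
 * Given count-of-counts n_1 .. n_(maxCount+1), it fills
 * coeffs[1..maxCount] with d(c) = ((c+1)/c * n_(c+1)/n_c - A) / (1 - A),
 * where A = (maxCount+1) * n_(maxCount+1) / n_1 is the commonTerm above.
 */
#include <vector>

static void
gtCoeffsSketch(const std::vector<double> &nOfN,	// nOfN[c] = # ngrams seen c times
	       unsigned maxCount, std::vector<double> &coeffs)
{
    const double EPSILON = 3e-06;		// stand-in for Prob_Epsilon

    // common term A shared by all coefficients
    double A = (maxCount + 1) * nOfN[maxCount + 1] / nOfN[1];

    coeffs.assign(maxCount + 1, 1.0);		// default: no discounting
    for (unsigned c = 1; c <= maxCount; c++) {
	if (nOfN[c] == 0.0) continue;		// zero count-of-count: leave at 1
	double d0 = (c + 1) * nOfN[c + 1] / (c * nOfN[c]);
	double d = (d0 - A) / (1.0 - A);
	// out-of-range coefficients fall back to 1 (no discount), as above
	coeffs[c] = (d <= EPSILON || d0 > 1.0) ? 1.0 : d;
    }
}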
/*
 * Eric Ristad's Natural Law of Succession --
 *	The discount factor d is identical for all counts,
 *
 *	d = ( n(n+1) + q(1-q) ) /
 *	    ( n^2 + n + 2q )
 *
 * where n is the total number of event tokens and q is the number of
 * observed event types.  If q equals the vocabulary size no discounting
 * is performed.
 */
Boolean
FNaturalDiscount::estimate(FNgramSpecs<FNgramCount>::FNgramSpec &spec,
			   unsigned int node, FactoredVocab &vocab)
{
    /*
     * Determine the true vocab size, i.e., the number of true event
     * types, by enumeration.
     */
    VocabIter viter(vocab);
    VocabIndex wid;

    unsigned total = 0;
    while (viter.next(wid)) {
	if (!vocab.isNonEvent(wid)) {
	    total++;
	}
    }
    vocabSize = total;

    return true;
}

/*
 * Unmodified (i.e., regular) Kneser-Ney discounting
 */
Boolean
FKneserNey::estimate(FNgramSpecs<FNgramCount>::FNgramSpec &spec,
		     unsigned int node, FactoredVocab &vocab)
{
    if (!prepareCountsAtEnd) {
	prepareCounts(spec, node, vocab);
    }

    /*
     * First tabulate count-of-counts
     */
    Count n1 = 0;
    Count n2 = 0;

    FNgramSpecs<FNgramCount>::FNgramSpec::ParentSubset &counts =
						spec.parentSubsets[node];

    VocabIndex wids[maxNumParentsPerChild + 1];
    FNgramSpecs<FNgramCount>::FNgramSpec::PSIter iter(counts, wids);
    NgramCount *count;

    while ((count = iter.next())) {
	if (vocab.isNonEvent(wids[counts.order - 1])) {
	    continue;
	} else if (vocab.isMetaTag(wids[counts.order - 1])) {
	    unsigned type = vocab.typeOfMetaTag(wids[counts.order - 1]);

	    /*
	     * process count-of-count
	     */
	    if (type == 1) {
		n1++;
	    } else if (type == 2) {
		n2++;
	    }
	} else if (*count == 1) {
	    n1++;
	} else if (*count == 2) {
	    n2++;
	}
    }

    if (FDiscount::debug(DEBUG_ESTIMATE_DISCOUNT)) {
	FDiscount::dout() << "Kneser-Ney smoothing "
			  << HEX << node << DEC << "-grams\n"
			  << "n1 = " << n1 << endl
			  << "n2 = " << n2 << endl;
    }

    if (n1 == 0 || n2 == 0) {
	cerr << "warning: one of required KneserNey count-of-counts is zero\n";
	return false;
    }

    discount1 = n1 / ((double)n1 + 2 * n2);

    if (FDiscount::debug(DEBUG_ESTIMATE_DISCOUNT)) {
	FDiscount::dout() << "D = " << discount1 << endl;
    }

    if (prepareCountsAtEnd) {
	prepareCounts(spec, node, vocab);
    }

    return true;
}
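/*
 * Illustrative sketches (not part of the original SRILM source) of the
 * two closed-form discounts defined above; the helper names are
 * hypothetical.  The first evaluates Ristad's Natural Law of Succession
 * from n event tokens and q observed event types; the second is the
 * single absolute-discounting constant D = n1 / (n1 + 2*n2) used by
 * unmodified Kneser-Ney, where n1 and n2 are the numbers of ngram types
 * occurring exactly once and exactly twice.
 */
static double
naturalDiscountSketch(double n, double q)
{
    // d = ( n(n+1) + q(1-q) ) / ( n^2 + n + 2q )
    return (n * (n + 1) + q * (1 - q)) / (n * n + n + 2 * q);
}

static double
knDiscountSketch(unsigned long n1, unsigned long n2)
{
    // degenerate count-of-counts: mirror the estimate() failure path
    if (n1 == 0 || n2 == 0) return -1.0;	// caller must treat as failure
    return n1 / ((double)n1 + 2 * n2);
}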
void
FKneserNey::prepareCounts(FNgramSpecs<FNgramCount>::FNgramSpec &spec,
			  unsigned int node, FactoredVocab &vocab)
{
    if (countsAreModified || node == (spec.numSubSets - 1)) {
	return;
    }

    // if (debug(DEBUG_ESTIMATE_DISCOUNT)) {
    //	  // nothing like a simple way to print out a lousy hex number!
    //	  dout() << "Modifying node 0x" << HEX << node << DEC
    //		 << "-gram counts for Kneser-Ney smoothing\n";
    // }

    /*
     * For the lower-order counts in KN discounting we need to replace
     * the counts to reflect the number of contexts in which a given
     * N-gram occurs.  Specifically,
     *
     *	c(w2,...,wN) = number of N-grams w1,w2,...,wN with count > 0
     *		     = |{ w1 : c(w1,w2,...,wN) > 0 }|
     *		     = N1+(*,w2,...,wN)
     *		     = number of different words that precede (w2,...,wN)
     *		       in the training data.
     */
    VocabIndex ngram[maxNumParentsPerChild + 2];

    /*
     * Clear all counts of the given order.
     * Note: exclude N-grams starting with non-events (such as <s>)
     * since there usually are no words preceding them.
     */
    {
	FNgramSpecs<FNgramCount>::FNgramSpec::PSIter
				iter(spec.parentSubsets[node], ngram);
	FNgramCount *count;
	while ((count = iter.next())) {
	    if (!vocab.Vocab::isNonEvent(ngram[0]) &&
		!(vocab.nullIsWord() && ngram[0] == vocab.nullIndex))
	    {
		*count = 0;
	    }
	}
    }

    /*
     * Now accumulate the new counts, but first choose a parent to get
     * the counts from.
     */
    // TODO: the selection process here chooses only one of many
    // possible choices to get the counts from.  Ideally, KN smoothing
    // should be re-thought for general BG models.  For now, choose the
    // parent with the highest bit, so we'll at least get the same
    // results as with normal word models with temporally constrained
    // backoff.
    unsigned int max_parent_node = 0;
    if ((int)spec.parentSubsets[node].knCountParent == ~0x0) {
	FNgramSpecs<FNgramCount>::FNgramSpec::BGParentIter
					piter(spec.numParents, node);
	unsigned int parent_node;
	while (piter.next(parent_node)) {
	    // not interested in parents that do not exist
	    if (spec.parentSubsets[parent_node].counts == NULL)
		continue;
	    if (parent_node > max_parent_node)
		max_parent_node = parent_node;
	}
    } else {
	max_parent_node = spec.parentSubsets[node].knCountParent;
    }

    // fprintf(stderr, "in prepcounts: node = 0x%X, max_par = 0x%X\n",
    //	       node, max_parent_node);

    {
	FNgramSpecs<FNgramCount>::FNgramSpec::PSIter
			iter(spec.parentSubsets[max_parent_node], ngram);
	FNgramCount *count;
	while ((count = iter.next())) {
	    if (*count > 0) {
		FNgramCount *loCount =
		    spec.parentSubsets[node].findCountSubCtx(ngram,
				bitGather(max_parent_node, node) << 1 | 0x1);
		if (loCount) {
		    (*loCount) += 1;
		}
	    }
	}
    }

    // Check to make sure that all contexts have a non-zero count.
    // TODO: this is arguably a debugging option, and could be turned off.
    // The check is useful, however, if the -no-virtual-begin-sentence
    // option has been used.
    {
	FNgramSpecs<FNgramCount>::FNgramSpec::PSIter
				iter(spec.parentSubsets[node], ngram);
	FNgramCount *count;
	unsigned total = 0;
	unsigned thoseWoCounts = 0;
	while ((count = iter.next())) {
	    total++;
	    if (!vocab.Vocab::isNonEvent(ngram[0]) &&
		!(vocab.nullIsWord() && ngram[0] == vocab.nullIndex))
	    {
		if (*count == 0) {
		    thoseWoCounts++;
		    if (FDiscount::debug(DEBUG_PRINT_CONTEXTS_WO_COUNTS)) {
			fprintf(stderr,
				"Node 0x%X, Parent 0x%X: A context does not have a count\n",
				node, max_parent_node);
			cerr << "Context: " << ngram << endl;
		    }
		}
	    }
	}
	if (FDiscount::debug(DEBUG_PRINT_WARNING_CONTEXTS_WO_COUNTS) &&
	    thoseWoCounts > 0)
	{
	    fprintf(stderr,
		    "Node 0x%X, Parent 0x%X: Found %d/%d contexts with zero counts\n",
		    node, max_parent_node, thoseWoCounts, total);
	}
    }

    countsAreModified = true;
}
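/*
 * Illustrative sketch (not part of the original SRILM source): the
 * count replacement that prepareCounts() performs, shown on plain STL
 * containers instead of the factored count tries.  For every observed
 * higher-order ngram (w1,...,wN) with count > 0, the lower-order count
 * of its suffix (w2,...,wN) becomes the number of distinct predecessor
 * words w1, i.e. the KN "continuation count" N1+(*,w2,...,wN).  Each
 * distinct (w1, suffix) pair appears as exactly one map key, so adding
 * 1 per entry counts distinct predecessors.
 */
#include <map>
#include <vector>

typedef std::vector<unsigned> Ngram;		// sequence of word indices

static std::map<Ngram, unsigned long>
knContinuationCounts(const std::map<Ngram, unsigned long> &hiCounts)
{
    std::map<Ngram, unsigned long> loCounts;

    for (std::map<Ngram, unsigned long>::const_iterator it = hiCounts.begin();
	 it != hiCounts.end(); ++it)
    {
	if (it->second == 0) continue;		// unseen: contributes nothing
	// suffix (w2,...,wN): each distinct predecessor w1 adds exactly 1
	Ngram suffix(it->first.begin() + 1, it->first.end());
	loCounts[suffix] += 1;
    }
    return loCounts;
}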
/*
 * Modified Kneser-Ney discounting
 * as described in S. F. Chen & J. Goodman, "An Empirical Study of
 * Smoothing Techniques for Language Modeling," TR-10-98, Computer
 * Science Group, Harvard University, Cambridge, MA, August 1998.
 */
Boolean
FModKneserNey::estimate(FNgramSpecs<FNgramCount>::FNgramSpec &spec,
			unsigned int node, FactoredVocab &vocab)
{
    if (!FKneserNey::prepareCountsAtEnd) {
	FKneserNey::prepareCounts(spec, node, vocab);
    }

    /*
     * First tabulate count-of-counts
     */
    Count n1 = 0;
    Count n2 = 0;
    Count n3 = 0;
    Count n4 = 0;

    FNgramSpecs<FNgramCount>::FNgramSpec::ParentSubset &counts =
						spec.parentSubsets[node];

    VocabIndex wids[maxNumParentsPerChild + 1];
    FNgramSpecs<FNgramCount>::FNgramSpec::PSIter iter(counts, wids);
    NgramCount *count;

    while ((count = iter.next())) {
	if (vocab.isNonEvent(wids[counts.order - 1])) {
	    continue;
	} else if (vocab.isMetaTag(wids[counts.order - 1])) {
	    unsigned type = vocab.typeOfMetaTag(wids[counts.order - 1]);

	    /*
	     * process count-of-count
	     */
	    if (type == 1) {
		n1++;
	    } else if (type == 2) {
		n2++;
	    } else if (type == 3) {
		n3++;
	    } else if (type == 4) {
		n4++;
	    }
	} else if (*count == 1) {
	    n1++;
	} else if (*count == 2) {
	    n2++;
	} else if (*count == 3) {
	    n3++;
	} else if (*count == 4) {
	    n4++;
	}
    }

    if (FKneserNey::FDiscount::debug(DEBUG_ESTIMATE_DISCOUNT)) {
	FKneserNey::FDiscount::dout() << "Mod Kneser-Ney smoothing "
				      << HEX << node << DEC << "-grams\n"
				      << "n1 = " << n1 << endl
				      << "n2 = " << n2 << endl
				      << "n3 = " << n3 << endl
				      << "n4 = " << n4 << endl;
    }

    if (n1 == 0 || n2 == 0 || n3 == 0 || n4 == 0) {
	cerr << "warning: one of required modified KneserNey count-of-counts is zero\n";
	return false;
    }

    /*
     * Compute discount constants (Ries 1997, Chen & Goodman 1998)
     */
    double Y = (double)n1 / (n1 + 2 * n2);

    // use ModKneserNey::discount1 since we use it in ModKneserNey::discount()
    ModKneserNey::discount1 = 1 - 2 * Y * n2 / n1;
    discount2 = 2 - 3 * Y * n3 / n2;
    discount3plus = 3 - 4 * Y * n4 / n3;

    if (FKneserNey::FDiscount::debug(DEBUG_ESTIMATE_DISCOUNT)) {
	FKneserNey::FDiscount::dout()
			<< "D1 = " << ModKneserNey::discount1 << endl
			<< "D2 = " << discount2 << endl
			<< "D3+ = " << discount3plus << endl;
    }

    if (FKneserNey::prepareCountsAtEnd) {
	FKneserNey::prepareCounts(spec, node, vocab);
    }

    return true;
}

#endif /* EXCLUDE_CONTRIB */
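/*
 * Illustrative sketch (not part of the original SRILM source): the
 * closed-form modified Kneser-Ney discount constants computed in
 * FModKneserNey::estimate() above, following Chen & Goodman (1998).
 * With Y = n1 / (n1 + 2*n2):
 *
 *	D1  = 1 - 2*Y*n2/n1
 *	D2  = 2 - 3*Y*n3/n2
 *	D3+ = 3 - 4*Y*n4/n3
 *
 * The struct and helper names are hypothetical.
 */
struct ModKNConstants {
    double d1, d2, d3plus;
};

static ModKNConstants
modKNConstantsSketch(double n1, double n2, double n3, double n4)
{
    ModKNConstants d = { 0.0, 0.0, 0.0 };
    // degenerate count-of-counts: mirror the estimate() failure path
    if (n1 == 0 || n2 == 0 || n3 == 0 || n4 == 0) return d;

    double Y = n1 / (n1 + 2 * n2);
    d.d1 = 1 - 2 * Y * n2 / n1;
    d.d2 = 2 - 3 * Y * n3 / n2;
    d.d3plus = 3 - 4 * Y * n4 / n3;
    return d;
}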