// fngramlm.cc
// (Recovered from a web code-viewer scrape; original site header removed.)
/* * Remove a ngram probabilities */ for (unsigned i=0;i<fNgrams[specNum].parentSubsetsSize;i++) { BOnode *node; BOsIter iter(*this,specNum,i,context); while (node = iter.next()) { node->probs.clear(0); } }}voidFNgram::clear(){ for (unsigned i=0;i<fNgramsSize;i++) clear(i);}// for printing binary stringsstatic char *intToBinStr(unsigned i){ // good for 64 bit ints const int len = 64+3; static char local_buff[len]; local_buff[len-1] = 0; int pos = len-1; if (i == 0) local_buff[--pos] = '0'; else { do { if (i&0x1) local_buff[--pos] = '1'; else local_buff[--pos] = '0'; i >>= 1; } while (i); } local_buff[--pos] = 'b'; local_buff[--pos] = '0'; return &local_buff[pos];}// TODO: but this and other bit routines in one// separate pair of .cc .h files.unsigned int bitGather(register unsigned int mask,register unsigned int bitv){ // gather (or pack) together the bits in bitv that // correspond to the 1 bits in mask, and place them // at the low end of the word 'res' register unsigned res = 0; register unsigned res_pos = 0; while (mask && bitv) { if (mask & 0x1) { if (bitv & 0x1) { res |= (1<<res_pos); } res_pos++; } mask >>= 1; bitv >>= 1; } return res;}/* * For a given node in the BG, compute the child node that * we should backoff to. The 'context' argument is assumed * to be in reverse order with respect to the cound tries. * * Note: This routine contains the backoff algorithm. * TODO: at some point make a version of this w/o a nWrtwCip argument * for fast ppl and rescoring. */unsignedFNgram::boNode(const VocabIndex word, const VocabIndex *context, const unsigned nWrtwCip, // node number w.r.t which context is packed. const unsigned int specNum, const unsigned int node){ assert (node != 0); // no reason to call this routine if node == 0. // all nodes with one parent back off to the unigram. 
const unsigned int nbitsSet = FNgramSpecsType::numBitsSet(node); if (nbitsSet == 1) return 0; const unsigned int numParents = fngs.fnSpecArray[specNum].numParents; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(numParents,node,fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); unsigned bg_child; // backoff-graph child unsigned chosen_bg_child = ~0x0; unsigned a_bg_child = ~0x0; // arbitrary child, if everything fails, we use this. const Boolean domax = // do min if domax == false. (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == MaxBgChild); const double initScore = domax ? -1e220 : 1e220; double bestScore = initScore; // find child node with the largest counts while (citer.next(bg_child)) { // We max/min over the BO value. The BO value is determined by the // BO algorithm (i.e., counts, normalized counts, etc.) in the // specs object for this node. double score = fngs.fnSpecArray[specNum].parentSubsets[bg_child]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_child); if (score == -1e200) // TODO: change this to NaN or Inf, and a #define (also see below) continue; // continue presumably because of a NULL counts object if (a_bg_child == ~0x0) a_bg_child = bg_child; if (domax && (score > bestScore) || !domax && (score < bestScore)) { chosen_bg_child = bg_child; bestScore = score; } } // make sure that we have at least one valid child assert (a_bg_child != ~0x0); // if we only have one child, or if we have two children and have // not found a best child, just return an arbitrary child node. if ((fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren == 1) || (chosen_bg_child == ~0x0 && nbitsSet == 2)) return a_bg_child; if (chosen_bg_child == ~0x0) { // Then we did not found any BG-child with a score for this // context. We back off to the child that has the best // combined schore of its children. We keep // doing this, as long as possible. 
unsigned int great; for (great=0; (great+2)< nbitsSet ; great++) { citer.init(); while (citer.next(bg_child)) { double score = initScore; // get score for child if (great == 0) { // children of child iter FNgramSpecsType::FNgramSpec::BGChildIterCnstr gciter(numParents,bg_child,fngs.fnSpecArray[specNum].parentSubsets[bg_child].backoffConstraint); unsigned bg_grandchild; while (gciter.next(bg_grandchild)) { double tmp=fngs.fnSpecArray[specNum].parentSubsets[bg_grandchild]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_grandchild); // compute local max min of offspring if (domax && (tmp > score) || !domax && (tmp < score)) { score = tmp; } } } else { fprintf(stderr,"WARNING: might produce unnormalized LM. Check with -debug 3\n"); // grandchildren of child iter FNgramSpecsType::FNgramSpec::BGGrandChildIter descendant_iter(numParents,bg_child,great-1); unsigned bg_grandchild; while (descendant_iter.next(bg_grandchild)) { double tmp=fngs.fnSpecArray[specNum].parentSubsets[bg_grandchild]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_grandchild); if (domax && (tmp > score) || !domax && (tmp < score)) { score = tmp; } } } if (score == -1e200) // TODO: change this to NaN or Inf, and a #define (also see above) continue; // presumably because of a NULL counts objects if (domax && (score > bestScore) || !domax && (score < bestScore)) { chosen_bg_child = bg_child; bestScore = score; } } if (chosen_bg_child != ~0x0) break; } // still not found, chose an arbitrary child if (chosen_bg_child == ~0x0) chosen_bg_child = a_bg_child; } return chosen_bg_child;}//// General algorithm for the "BG child probability backoff"LogPFNgram::bgChildProbBO(VocabIndex word, const VocabIndex *context, const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ // general graph backoff algorithm for multiple BG children. 
// should never be called at node 0 since there is nothing to // backoff to. assert ( node != 0 ); // special case numBGChildren == 1 for speed. if (fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren == 1) { unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // return immediately since we've found the bg child return wordProbBO(word,context,nWrtwCip,specNum,bg_child); } } } // still here? Do the general case. LogP bo_prob; if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == ProdBgChild || fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == GmeanBgChild) { bo_prob = LogP_One; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // multiply the probs (add the log probs) bo_prob += wordProbBO(word,context,nWrtwCip,specNum,bg_child); } } if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == GmeanBgChild) bo_prob /= fngs.fnSpecArray[specNum]. 
parentSubsets[node].numBGChildren; } else if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == SumBgChild || fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == AvgBgChild) { bo_prob = LogP_Zero; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // add the probs bo_prob = (LogP)AddLogP(bo_prob,wordProbBO(word,context,nWrtwCip,specNum,bg_child)); } } if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == AvgBgChild) bo_prob -= (LogP)log10((double)fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren); } else if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == WmeanBgChild) { bo_prob = LogP_Zero; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); unsigned cpos = 0; while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // add the probs by weights bo_prob = (LogP)AddLogP(bo_prob, // log10(0.5) + fngs.fnSpecArray[specNum].parentSubsets[node].wmean[cpos] + wordProbBO(word,context,nWrtwCip,specNum,bg_child)); cpos++; } } } else { // choose only one backoff node unsigned chosen_descendant = boNode(word,context,nWrtwCip,specNum,node); bo_prob = wordProbBO(word,context,nWrtwCip,specNum,chosen_descendant); } return bo_prob;}/* * This method implements the backoff algorithm in a straightforward, * recursive manner. * Note, this function likes context in reverse order. 
* I.e., if word = child = word_t, then * context[0] = parent1 word_{t-1} * context[1] = parent2 word_{t-2} * context[2] = parent3 word_{t-3} * In the model p(child|parent1,parent2,parent3) = p(w_t|w_t-1,w_t-2,w_t-3) */// TODO: make a version of this routine that assumes context is// packed w.r.t. 0x1111 node, used for ppl and rescoring.LogPFNgram::wordProbBO(VocabIndex word, const VocabIndex *context, // nWrtwCip: Node number With respect to (w.r.t.) // which Context is packed. const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ LogP result; const unsigned packedBits = bitGather(nWrtwCip,node); LogP *prob = fNgrams[specNum].parentSubsets[node]. findProbSubCtx(word,context,packedBits); if (prob) { if (running() && debug(DEBUG_NGRAM_HITS)) { // note that this message will occur multiple times when doing // certain strategies and/or combination methods. The true // "hit" is the last (right most) one that is printed. char buff[1024]; sprintf(buff,"[0x%X gram]",node); dout() << buff; } result = *prob; } else { if (node == 0) { if (running() && debug(DEBUG_NGRAM_HITS)) { dout() << "[OOV]"; } result = LogP_Zero; } else { LogP *bow = fNgrams[specNum].parentSubsets[node]. findBOWSubCtx(context,packedBits); if (bow) { // fprintf(stderr,"found bow = %f\n",*bow); result = *bow + bgChildProbBO(word,context,nWrtwCip,specNum,node); } else { if (!fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff()) { result = bgChildProbBO(word,context,nWrtwCip,specNum,node); } else { // need to compute BOW for this context (i.e., normalize // the BO distribution). Prob sum = bgChildProbSum(context,nWrtwCip,specNum,node); LogP lsum = ProbToLogP(sum); // insert sum as a BOW to speed up access on future access. // NOTE: This next code is one of the reasons BG LM estimation runs so slowly. // TODO: could always recompute to save memory but slow things down. // TODO: this slows things down significantly, think of a way to make // this part run faster! 
(perhaps compute all bows during training, // but that makes the files large). // NOTE: this code will also cause memory usage to grow, it will // look like a memory leak, but it is not, it is just // the insertion of bows into the lm trie. *(fNgrams[specNum].parentSubsets[node]. insertBOWSubCtx(context,packedBits)) = - lsum; result = bgChildProbBO(word,context,nWrtwCip,specNum,node) - lsum; } } } } return result;}LogPFNgram::wordProb(VocabIndex word, const VocabIndex *context, // nWrtwCip: Node number With respect to (w.r.t.) which Context is packed. const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ if (skipOOVs) { /* * Backward compatibility with the old broken perplexity code: * TODO: figure out what is meant by broken here. * return prob 0 if any of the context-words have an unknown * word. */ if (word == vocab.unkIndex()) return LogP_Zero; unsigned int clen = Vocab::length(context); for (unsigned j=0;j<clen; j++) { if (context[j] == vocab.unkIndex()) return LogP_Zero; } } /* * Perform the backoff algorithm for a context lenth that is * the minimum of what we were given and the maximal length of * the contexts stored in the LM */ return wordProbBO(word, context, nWrtwCip, specNum, node);}// reads in an ARPA-file inspired format -// - there are node-grams rather than n-grams, where// node is the bit vector (hex integer) giving LM parents that// to be used in that gram in the *LM* graph// - there might be multiple bows, depending on from where// the backoff is coming from.// TODO: make sure that read sets all variables in FNgram that// constructor is supposed to do (e.g., "order", and other member variables).
/*
 * (Scraper residue: the hosting site's keyboard-shortcut help, translated;
 *  not part of the original fngramlm.cc source.)
 *   Copy code:       Ctrl + C
 *   Search code:     Ctrl + F
 *   Fullscreen mode: F11
 *   Toggle theme:    Ctrl + Shift + D
 *   Show shortcuts:  ?
 *   Increase font:   Ctrl + =
 *   Decrease font:   Ctrl + -
 */