📄 fngramlm.cc
  unsigned int howmany = 0;
  BOsIter iter(*this,specNum,node,context);
  BOnode *bo_node;
  while (bo_node = iter.next()) {
    howmany += bo_node->probs.numEntries();
  }
  return howmany;
}

/*
 * Estimation
 */

/*
 * Count number of vocabulary items that get probability mass
 * for the current tag set.
 */
unsigned
FNgram::vocabSize()
{
  unsigned numWords = 0;
  FactoredVocab::TagIter viter(*((FactoredVocab*)&vocab));
  VocabIndex word;
  while (viter.next(word)) {
    if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
      numWords ++;
    }
  }
  return numWords;
}

void
FNgram::estimate()
{
  for (unsigned specNum=0;specNum<fNgramsSize;specNum++) {
    estimate(specNum);
  }
  if (debug(DEBUG_ESTIMATE_LM))
    wordProbSum();
}

void
FNgram::estimate(const unsigned int specNum)
{
  assert ( specNum < fNgramsSize );

  // this is necessary so that estimate() doesn't smooth
  // over the wrong random variable value set.
  ((FactoredVocab*)&vocab)->setCurrentTagVocab(fngs.fnSpecArray[specNum].child);

  /*
   * For all ngrams, compute probabilities and apply the discount
   * coefficients.
   */
  VocabIndex context[maxNumParentsPerChild+2];
  unsigned vocabSize = FNgram::vocabSize();

  /*
   * Remove all old contexts
   */
  clear(specNum);

  /*
   * Ensure <s> unigram exists (being a non-event, it is not inserted
   * in distributeProb(), yet is assumed by much other software).
   */
  if (vocab.ssIndex() != Vocab_None) {
    context[0] = Vocab_None;
    *(fNgrams[specNum].parentSubsets[0].insertProb(vocab.ssIndex(), context)) = LogP_Zero;
  }

  // By convention, the lowest numeric level in the backoff graph (BG)
  // (level zero) has one node corresponding to all LM parents, so this
  // node has value (2^(numParents)-1) (all bits on). The highest numeric level
  // (level numParents) has one node with no LM parents, so this node
  // has value 0 (all bits off). All other BG nodes have some intermediary
  // number of bits, and the bit pattern determines the number of LM parents
  // that are currently active. See the ASCII pictures in FNgramSpecs.h.
  // We learn the LM by doing a reverse BG level level-iterator,
  // starting with the unigram, and moving up to the all LM-parents case.
  const unsigned numParents = fngs.fnSpecArray[specNum].numParents;
  for (int level=0;level<=(int)numParents;level++) {
    FNgramSpecsType::FNgramSpec::LevelIter iter(numParents,level);
    unsigned int node;
    while (iter.next(node)) {
      // check if backoff constraints turned off this node, if
      // so we don't learn it.
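      //
      // Illustration (added; the 3-parent case is hypothetical): for a model
      // C | P1 P2 P3 with P1 = bit0, P2 = bit1, P3 = bit2, the BG node
      // indices per level are
      //    level 0:  0b111 = 7              (all three parents active)
      //    level 1:  0b110, 0b101, 0b011    (two parents active)
      //    level 2:  0b100, 0b010, 0b001    (one parent active)
      //    level 3:  0b000 = 0              (no parents: the unigram node)
      // so numBitsSet(node) below is simply the number of active parents.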
      if (fngs.fnSpecArray[specNum].parentSubsets[node].counts == NULL)
        continue;

      const unsigned numBitsSetInNode = FNgramSpecsType::numBitsSet(node);

      unsigned noneventContexts = 0;
      unsigned noneventFNgrams = 0;
      unsigned discountedFNgrams = 0;

      /*
       * check if discounting is disabled for this round
       */
      Boolean noDiscount =
        (fngs.fnSpecArray[specNum].parentSubsets[node].discount == 0) ||
        fngs.fnSpecArray[specNum].parentSubsets[node].discount->nodiscount();
      // fprintf(stderr,"noDiscount = %d\n",noDiscount);

      Boolean interpolate =
        (fngs.fnSpecArray[specNum].parentSubsets[node].discount != 0) &&
        fngs.fnSpecArray[specNum].parentSubsets[node].discount->interpolate;

      /*
       * assume counts are already "prepared" (which is always true for factored counts)
       */

      /*
       * This enumerates all parent set contexts, i.e., i-1 "grams"
       */
      FNgramCount *contextCount;
      assert ( fngs.fnSpecArray[specNum].parentSubsets[node].order > 0 );
      FNgramSpecsType::FNgramSpec::PSIter
        contextIter(fngs.fnSpecArray[specNum].parentSubsets[node],
                    context,
                    fngs.fnSpecArray[specNum].parentSubsets[node].order-1);
      while (contextCount = contextIter.next()) {
        /*
         * If <unk> is not a real word, skip contexts that contain
         * it. Note that the original code checked for </s> here in
         * the context. We do not do this here because:
         *  1) count tries here are for conditional distributions where
         *     the leaves of the trie are the child, and the non-leaves
         *     are a set of parent variables.
         *  2) a context might contain a </s>, say in P(M_t|R_t,R_t-1)
         *     where the context comes from the same time point.
         *  3) It is not clear why you would need to do doubling here
         *     since short word strings are effectively elongated
         *     in FNgramCounts<CountT>::countSentence() by
         *     having any context that goes earlier than the beginning
         *     of the sentence be replaced with <s> (so we can deal
         *     with sentences shorter than the gram temporal width,
         *     but this is only the case if the -no-virtual-begin-sentence
         *     option is not set).
         * Note also that this means that this code will have
         * fewer pseudo-events when comparing with ngram-count.cc.
         */
        if (vocab.isNonEvent(vocab.unkIndex()) &&
            vocab.contains(context, vocab.unkIndex()))
        {
          noneventContexts ++;
          continue;
        }

        VocabIndex word[2];	/* the follow word */
        FNgramSpecsType::FNgramSpec::PSIter
          followIter(fngs.fnSpecArray[specNum].parentSubsets[node],
                     context,word,1);
        FNgramCount *ngramCount;

        /*
         * Total up the counts for the denominator
         * (the lower-order counts are not consistent with
         * the higher-order ones, so we can't just use *contextCount).
         * I.e., we assume here that the trustTotal flag is always false.
         */
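        /*
         * (Added illustration; counts are hypothetical.)  If the follow
         * words under this context have counts {3, 1, 1}, the loop below
         * yields
         *    totalCount    = 5   (denominator of the ML estimate)
         *    observedVocab = 3   (distinct follow words seen)
         *    min2Vocab     = 1   (follow words seen at least twice)
         *    min3Vocab     = 1   (follow words seen at least three times)
         * which are exactly the statistics handed to the Discount method
         * (discount() and lowerOrderWeight()) further below.
         */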
        FNgramCount totalCount = 0;
        Count observedVocab = 0, min2Vocab = 0, min3Vocab = 0;

        while (ngramCount = followIter.next()) {
          if (vocab.isNonEvent(word[0]) ||
              ngramCount == 0 ||
              node == 0 && vocab.isMetaTag(word[0]))
          {
            continue;
          }

          if (!vocab.isMetaTag(word[0])) {
            totalCount += *ngramCount;
            observedVocab ++;
            if (*ngramCount >= 2) {
              min2Vocab ++;
            }
            if (*ngramCount >= 3) {
              min3Vocab ++;
            }
          } else {
            /*
             * Process meta-counts
             */
            unsigned type = vocab.typeOfMetaTag(word[0]);
            if (type == 0) {
              /*
               * a count total: just add to the totalCount;
               * the corresponding type count can't be known,
               * but it has to be at least 1
               */
              totalCount += *ngramCount;
              observedVocab ++;
            } else {
              /*
               * a count-of-count: increment the word type counts,
               * and infer the totalCount
               */
              totalCount += type * *ngramCount;
              observedVocab += (Count)*ngramCount;
              if (type >= 2) {
                min2Vocab += (Count)*ngramCount;
              }
              if (type >= 3) {
                min3Vocab += (Count)*ngramCount;
              }
            }
          }
        }

        if (totalCount == 0) {
          continue;
        }

        /*
         * Reverse the context ngram since that's how
         * the BO nodes are indexed. More specifically,
         * normal count tries are organized as follows:
         *    x1 y1 z1 c
         *    x1 y1 z2 c
         *    ...
         *    x1 y1 zn c
         *    x1 y2 z1 c
         *    x1 y2 z2 c
         *    ...
         *    x1 y2 zn c
         * and so on, where c is the count for the context,
         * and where z follows y which follows x in the input
         * files, so we might want to compute p(z|x,y). I.e.,
         * the x's are at the root, and the z's are at the leaves
         * of the count trie.
         *
         * When we get one of these 'context' wid strings, we are
         * iterating over the x_i and y_i's that exist in the
         * corresponding trie object (the z's are the follow iter above).
         *
         * When LM tries are created, however, we insert them using
         * a reversed context, so what gets inserted (and used to index
         * the trie) is "y x", and then z is in the hash table at that node.
         *
         * In the flm file, we've got
         *
         *    C | P1 P2 P3
         *
         * here, P1 = bit0, P2 = bit1, P3 = bit2,
         * where a bit vector is organized in an integer as:
         *
         *    [... bit2 bit1 bit0]
         *
         * In the count file, contexts are given as
         *
         *    word[0] = P3, word[1] = P2, word[2] = P1  [ word[3] = C ]
         *            = bit2         bit1          bit0
         *
         * when we reverse the context, we've got
         *
         *    word[0] = P1, word[1] = P2, word[2] = P3
         *            = bit0         bit1          bit2
         *
         * or if we include the child,
         *
         *    word[0] = C, word[1] = P1, word[2] = P2, word[3] = P3
         *                         = bit0         bit1          bit2
         *
         * In a normal LM (as is done in NgramLM), the reason for
         * reversing things in the LM but not in the counts (I
         * believe) is that it is faster to backoff to a parent
         * node in the trie rather than having to re-index down
         * along a completely different path to another node in
         * the trie.
         *
         * For a general graph backoff, however, this reversal
         * isn't really necessary since we need to index up into
         * an entirely different LM trie when we backoff, so the
         * speed advantage is lost. We keep doing the reversal
         * here, however, just to stay consistent with the
         * code in NgramLM.cc. This means, however, that
         * we need to reverse the wid count lookup routines (as
         * defined in NgramSpecs.h).
         */
        Vocab::reverse(context);

        /*
         * Compute the discounted probabilities
         * from the counts and store them in the backoff model.
         */
      retry:
        followIter.init();
        Prob totalProb = 0.0;

        while (ngramCount = followIter.next()) {
          LogP lprob;
          double discountCoeff;

          /*
           * Zero counts:
           * officially, we shouldn't see this for an FLM count trie,
           * except possibly at node 0.
           */
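          /*
           * (Added overview; the numbers are hypothetical.)  For a real
           * follow word, the remainder of this loop body computes
           *    prob = discountCoeff * (*ngramCount) / totalCount
           *           [+ lowerOrderWeight * LogPtoProb(lowerOrderProb),
           *            when the Discount method interpolates]
           * e.g. *ngramCount = 3, totalCount = 5, discountCoeff = 0.8,
           * lowerOrderWeight = 0.2 and a lower-order prob of 0.1 give
           * 0.8 * 3/5 + 0.2 * 0.1 = 0.5, converted to a log probability
           * via ProbToLogP().
           */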
          if (node != 0 && *ngramCount == 0) {
            fprintf(stderr,"WARNING: FNgramLM: node = 0x%X, *ngramCount = %d\n",
                    node, (unsigned)*ngramCount);
            cerr << "context: " << (vocab.use(), context) << endl;
            cerr << "word: " << word << endl;
            // assert (0);
            // NOTE: if this happens, check the count
            // modification code in discount.cc. This might
            // also be a result of running fngram-count with
            // the "-no-virtual-begin-sentence" option.
          }

          if (vocab.isNonEvent(word[0]) || vocab.isMetaTag(word[0])) {
            /*
             * Discard all pseudo-word probabilities,
             * except for unigrams. For unigrams, assign
             * probability zero. This will leave them with
             * prob zero in all cases, due to the backoff
             * algorithm.
             * Also discard the <unk> token entirely in closed
             * vocab models; its presence would prevent OOV
             * detection when the model is read back in.
             */
            if (node != 0 ||
                word[0] == vocab.unkIndex() ||
                vocab.isMetaTag(word[0]))
            {
              noneventFNgrams ++;
              continue;
            }

            lprob = LogP_Zero;
            discountCoeff = 1.0;
          } else {
            /*
             * The discount array passed may contain 0 elements
             * to indicate no discounting at this order.
             */
            if (noDiscount) {
              discountCoeff = 1.0;
              // fprintf(stderr,"setting discount to 1.0\n");
            } else {
              discountCoeff =
                fngs.fnSpecArray[specNum].parentSubsets[node].discount->
                  discount(*ngramCount, totalCount, observedVocab);
              // fprintf(stderr,"*ngramCount = %d, setting discount to %f\n",
              //         *ngramCount,discountCoeff);
            }
            Prob prob = (discountCoeff * *ngramCount) / totalCount;

            /*
             * For interpolated estimates we compute the weighted
             * linear combination of the high-order estimate
             * (computed above) and the lower-order estimate.
             * The high-order weight is given by the discount factor,
             * the lower-order weight is obtained from the Discount
             * method (it may be 0 if the method doesn't support
             * interpolation).
             */
            double lowerOrderWeight;
            LogP lowerOrderProb;
            if (interpolate) {
              lowerOrderWeight =
                fngs.fnSpecArray[specNum].parentSubsets[node].discount->
                  lowerOrderWeight(totalCount, observedVocab, min2Vocab, min3Vocab);
              if (node > 0) {
                lowerOrderProb = bgChildProbBO(word[0],context,node,specNum,node);
              } else {
                lowerOrderProb = (LogP)(- log10((double)vocabSize));
              }

              prob += lowerOrderWeight * LogPtoProb(lowerOrderProb);
            }
            if (discountCoeff != 0.0) {
              totalProb += prob;
            }

            lprob = ProbToLogP(prob);
            if (discountCoeff != 0.0 && debug(DEBUG_ESTIMATES)) {
              dout() << "CONTEXT " << (vocab.use(), context)
                     << " WORD " << vocab.getWord(word[0])
                     << " NUMER " << *ngramCount
                     << " DENOM " << totalCount
                     << " DISCOUNT " << discountCoeff;

              if (interpolate) {
                dout() << " LOW " << lowerOrderWeight
                       << " LOLPROB " << lowerOrderProb;
              }
              dout() << " LPROB " << lprob << endl;
            }
          }

          /*
           * A discount coefficient of zero indicates this ngram
           * should be omitted entirely (to save space and to
           * ensure that the BOW code below works).
           */
          if (discountCoeff == 0.0) {
            discountedFNgrams ++;
            fNgrams[specNum].parentSubsets[node].removeProb(word[0],context);
          } else {
            // fprintf(stderr,"inserting word %d\n",word[0]);
            *(fNgrams[specNum].parentSubsets[node].insertProb(word[0],context)) = lprob;
            // fprintf(stderr,"size now = %d\n",numFNgrams(specNum,node));
          }
        }

        /*
         * This is a hack credited to Doug Paul (by Roni Rosenfeld in
         * his CMU tools). It may happen that no probability mass
         * is left after totalling all the explicit probs, typically
         * because the discount coefficients were out of range and
         * forced to 1.0. To arrive at some non-zero backoff mass
         * we try incrementing the denominator in the estimator by 1.
         */
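        /*
         * (Added illustration; the numbers are hypothetical.)  Suppose
         * totalCount = 4 and the discounted probs above sum to
         * totalProb = 1.02, i.e. more than 1 - Prob_Epsilon.  The check
         * below then bumps the denominator to 5; each recomputed estimate
         * shrinks (by exactly 4/5 when its discount factor is unchanged),
         * and in the non-interpolated case about 1 - 1.02*4/5 = 0.184 of
         * probability mass becomes available for backoff.  The 'retry:'
         * label above exists so the estimation loop can be re-run with
         * the larger denominator.
         */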
        if (!noDiscount && totalCount > 0 &&
            totalProb > 1.0 - Prob_Epsilon)
        {
          totalCount += 1;

          if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            cerr << "warning: " << (1.0 - totalProb)
                 << " backoff probability mass left for " <<
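/*
 * (Added, for illustration only -- not part of FNgramLM.cc.)  A minimal
 * standalone sketch of the per-context computation performed by the loops
 * above: turn raw follow-word counts into discounted conditional
 * probabilities plus the leftover mass that backoff receives.  It assumes
 * a single fixed absolute discount D in place of SRILM's Discount objects,
 * and a std::map in place of the count tries; every name below is
 * hypothetical.
 */
#include <cstdio>
#include <map>
#include <string>

int main()
{
    // hypothetical follow-word counts observed for one fixed context
    std::map<std::string, unsigned> counts;
    counts["a"] = 3;
    counts["b"] = 1;
    counts["c"] = 1;

    const double D = 0.5;   // absolute discount taken from each observed type

    // denominator and type statistics, as in the totalCount/observedVocab loop
    unsigned totalCount = 0, observedVocab = 0;
    for (const auto &wc : counts) {
        totalCount += wc.second;
        observedVocab++;
    }

    // discounted estimates: p(w|ctx) = (c(w) - D) / totalCount
    double totalProb = 0.0;
    for (const auto &wc : counts) {
        double prob = (wc.second - D) / totalCount;
        totalProb += prob;
        std::printf("p(%s|ctx) = %.3f\n", wc.first.c_str(), prob);
    }

    // probability mass not assigned to observed words; in the real model it
    // is redistributed over the lower-order (backed-off) distribution
    double backoffMass = 1.0 - totalProb;   // equals D * observedVocab / totalCount
    std::printf("backoff mass = %.3f\n", backoffMass);

    return 0;
}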