classngram.cc
<< " PROB = " << wordProb
<< endl;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState, wordProb);
} else {
logSum = (LogP)AddLogP(logSum, prevProb + wordProb);
}
/*
* Restore newContext
*/
newContext[usedLength] = truncatedContextWord;
/*
* Now extend context by all class expansions that can start
* with the current word
*/
Map2Iter2<VocabIndex,ClassExpansion,Prob>
expandIter(classDefsByWord, currWord);
Prob *expansionProb;
ClassExpansion classAndExpansion;
while (expansionProb = expandIter.next(classAndExpansion)) {
VocabIndex clasz = classAndExpansion[0];
/*
* Prepend new class to context
*/
newContext[0] = clasz;
/*
* Transition prob out of previous context to new class
*/
LogP classProb =
Ngram::wordProb(clasz, prevState.classContext);
/*
* Truncate context to what is actually used by LM
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, newContext, usedLength);
VocabIndex truncatedContextWord = newContext[usedLength];
newContext[usedLength] = Vocab_None;
/*
* Discard the class itself and the first word,
* which is already consumed by the current position.
*/
nextState.classExpansion = classAndExpansion + 2;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< " PROB = " << classProb
<< " EXPANDPROB = " << *expansionProb
<< endl;
}
if (classProb != LogP_Zero && *expansionProb != 0.0) {
havePosProb = true;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState,
classProb + ProbToLogP(*expansionProb));
} else {
logSum = (LogP)AddLogP(logSum, prevProb +
classProb + ProbToLogP(*expansionProb));
}
/*
* Restore newContext
*/
newContext[usedLength] = truncatedContextWord;
}
}
}
/*
* Set noevent state probability to the previous total prefix
* probability if the current word had probability zero in all
* states, and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
* We include the words in the state so that context cues (e.g., <s>)
* can still be used down the line.
*/
if (prefix > 0 && !havePosProb) {
ClassNgramState newState;
newState.classContext = &context[prefix - 1];
newState.classExpansion = 0;
trellis.init(pos);
trellis.setProb(newState, trellis.sumLogP(pos - 1));
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return logSum;
}
/*
* The conditional word probability is computed as
* p(w1 ... wk) / p(w1 ... w(k-1))
*/
LogP
ClassNgram::wordProb(VocabIndex word, const VocabIndex *context)
{
if (simpleNgram) {
return Ngram::wordProb(word, context);
} else {
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
}
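/*
 * Same computation as wordProb(), but for use when the context is
 * unchanged from the previous call: the null context passed to
 * prefixProb() presumably lets it reuse the previously computed prefix
 * state (cf. the savedLength cache) instead of redoing the full
 * trellis pass.
 */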
LogP
ClassNgram::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
if (simpleNgram) {
return Ngram::wordProbRecompute(word, context);
} else {
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
ClassNgram::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (simpleNgram || debug(DEBUG_PRINT_WORD_PROBS)) {
return Ngram::sentenceProb(sentence, stats);
} else {
unsigned int len = vocab.length(sentence);
LogP totalProb;
makeArray(VocabIndex, reversed, len + 2 + 1);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
return totalProb;
}
}
Boolean
ClassNgram::read(File &file, Boolean limitVocab)
{
/*
* First read the ngram data in standard format
*/
if (!Ngram::read(file, limitVocab)) {
return false;
}
/*
* Now read class definitions
*/
return readClasses(file);
}
void
ClassNgram::write(File &file)
{
/*
* First write out the Ngram parameters in the usual format
*/
Ngram::write(file);
fprintf(file, "\n");
/*
* Now write the class definitions
*/
writeClasses(file);
fprintf(file, "\n");
}
void
ClassNgram::clearClasses()
{
/*
* Remove all class definitions
*/
classDefs.clear();
classDefsByWord.clear();
}
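/*
 * Read class definitions, one per line, of the form
 *
 *	CLASS [prob] word1 word2 ...
 *
 * The first field names the class; an optional numeric second field
 * gives the expansion probability (default 1.0); the remaining fields
 * are the words the class expands to.
 */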
Boolean
ClassNgram::readClasses(File &file)
{
char *line;
Boolean classesCleared = false;
while (line = file.getline()) {
VocabString words[maxWordsPerLine];
/*
* clear old class definitions only when encountering first new
* class definition
*/
if (!classesCleared) {
clearClasses();
classesCleared = true;
}
unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);
if (howmany == maxWordsPerLine) {
file.position() << "class definition has too many fields\n";
return false;
}
/*
* First word contains class name
*/
VocabIndex clasz = classVocab.addWord(words[0]);
double prob = 1.0;
VocabString *expansionWords;
/*
* If second word is numeral, assume it's the class expansion prob
*/
if (howmany > 1 && sscanf(words[1], "%lf", &prob)) {
expansionWords = &words[2];
} else {
expansionWords = &words[1];
}
/*
* Add expansion words to vocabulary and store.
* The first position in the string is reserved for the class itself
* (for use in classDefsByWord).
*/
VocabIndex classAndExpansion[maxWordsPerLine + 1];
classAndExpansion[0] = clasz;
if (vocab.addWords(expansionWords, &classAndExpansion[1],
maxWordsPerLine) == 0)
{
file.position() << "class expansion contains no words\n";
return false;
}
*classDefs.insert(clasz, &classAndExpansion[1]) = prob;
/*
* Index the class and its expansion by the first word.
*/
*classDefsByWord.insert(classAndExpansion[1], classAndExpansion) = prob;
}
return true;
}
void
ClassNgram::writeClasses(File &file)
{
VocabIndex clasz;
Map2Iter<VocabIndex, ClassExpansion, Prob>
classIter(classDefs, vocab.compareIndex());
while (classIter.next(clasz)) {
Map2Iter2<VocabIndex, ClassExpansion, Prob>
iter(classDefs, clasz, vocab.compareIndices());
ClassExpansion expansion;
Prob *prob;
while (prob = iter.next(expansion)) {
fprintf(file, "%s %lf", vocab.getWord(clasz), *prob);
for (unsigned i = 0; expansion[i] != Vocab_None; i ++) {
fprintf(file, " %s", vocab.getWord(expansion[i]));
}
fprintf(file, "\n");
}
}
fprintf(file, "\n");
}
/*
* Compile class-ngram into word-ngram model by expanding classes
* Algorithm:
* 1 - Compute joint probabilities for expanded word-ngrams
* 2 - Compute conditional word-ngram probabilities from joint probs.
* 3 - Compute backoff weights
* The second argument gives the length of expanded ngrams whose
* conditional probability should be computed using the forward algorithm.
* This is more expensive but gives better results for ngrams much longer
* than those contained in the original model.
*/
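/*
 * For illustration: given a class defined by the two expansions
 *	COLOR 0.6 red
 *	COLOR 0.4 blue
 * the class ngram "the COLOR" would yield the word ngrams "the red"
 * and "the blue", with joint probabilities equal to the class-ngram
 * joint probability scaled by 0.6 and 0.4, respectively.
 */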
Ngram *
ClassNgram::expand(unsigned newOrder, unsigned expandExact)
{
NgramCounts<LogP> ngramProbs(vocab, maxWordsPerLine);
// accumulators for joint ngram probs
unsigned maxNgramLength = 0; // to determine final ngram order
VocabIndex wordNgram[maxWordsPerLine];
wordNgram[0] = Vocab_None;
makeArray(VocabIndex, context, order + 2);
/*
* Turn off the DP for the computation of joint context probabilities
*/
simpleNgram = true;
unsigned i;
for (i = 0 ; i < order; i ++) {
BOnode *node;
NgramBOsIter iter(*this, context, i);
while (node = iter.next()) {
LogP jointContextProb = contextProb(context);
/*
* Flip context to give regular ngram order and allow appending
* final word.
*/
Vocab::reverse(context);
/*
* Enumerate all follow ngrams
*/
NgramProbsIter piter(*node);
VocabIndex clasz;
LogP *ngramProb;
while (ngramProb = piter.next(clasz)) {
context[i] = clasz;
context[i+1] = Vocab_None;
/*
* Expand the full ngram.
*/