classngram.cc
*/
ClassNgramExpandIter expandIter(*this, context, wordNgram);
LogP expandProb;
unsigned firstLen, lastLen;
while (expandIter.next(expandProb, firstLen, lastLen)) {
unsigned expandedLen = Vocab::length(wordNgram);
if (expandedLen > maxNgramLength) {
maxNgramLength = expandedLen;
}
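/*
 * Note (added): summing log probs below multiplies
 * P(context) * P(class ngram) * P(expansion)
 */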
LogP newProb = jointContextProb + *ngramProb + expandProb;
/*
* Increment the total joint probability for all
* ngrams resulting from the last class expansion.
* (Shorter prefixes of the expansion are taken care
* of automatically as a result of expanding prefixes
* of the context.)
*/
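/*
 * Worked example (added): with firstLen == 2, lastLen == 2, and
 * expansion w1 w2 w3 w4, the entries updated are
 * w1..w4, w2..w4, w1..w3, and w2 w3.
 */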
for (unsigned j = 0; j < lastLen; j ++) {
for (unsigned k = 0; k < firstLen; k ++) {
/*
 * Truncate the ngram from the back (dropping up to
 * lastLen-1 words of the last expansion) and skip up to
 * firstLen-1 words of the first expansion at the front
 */
wordNgram[expandedLen - j] = Vocab_None;
LogP *oldProb =
ngramProbs.insertCount(&wordNgram[k]);
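/*
 * Note (added): a freshly inserted entry is zero-initialized,
 * so LogP 0.0 doubles as the "no value yet" marker here
 */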
if (*oldProb == 0.0) {
*oldProb = newProb;
} else {
*oldProb = (LogP)AddLogP(*oldProb, newProb);
}
}
}
}
}
context[i] = Vocab_None;
Vocab::reverse(context);
}
}
/*
 * Unless a specific order was requested, include all ngrams in the
 * new model; in no case exceed the longest expanded ngram.
 */
if (newOrder == 0 || newOrder > maxNgramLength) {
newOrder = maxNgramLength;
}
/*
 * By default, exact expansion is not used at all: a threshold above
 * newOrder means no ngram length ever triggers it.
 */
if (expandExact == 0) {
expandExact = newOrder + 1;
}
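/*
 * Illustration (added): with newOrder == 3 and expandExact left at 0,
 * the threshold becomes 4, so the test "i + 1 >= expandExact" below
 * never holds for ngram lengths 1..3, and the approximate expansion
 * is used throughout.
 */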
/*
* Compute joint probs for word ngrams that were expanded in the
* above step, but are not contained in the original model.
*/
simpleNgram = true;	/* evaluate word ngrams without class expansion */
for (i = 1; i < expandExact; i++) {
LogP *oldProb;
NgramCountsIter<LogP> ngramIter(ngramProbs, wordNgram, i);
/*
* This enumerates all i-grams.
*/
while ((oldProb = ngramIter.next())) {
/*
 * Reverse the ngram in place: findProb() expects the last
 * word first, followed by the context in reverse order
 */
Vocab::reverse(wordNgram);
if (findProb(wordNgram[0], &wordNgram[1]) == 0) {
/*
* ngram is not in old model:
* compute joint probability for this ngram, excluding classes
*/
LogP newProb = contextProb(wordNgram);
if (*oldProb == 0.0) {
*oldProb = newProb;
} else {
*oldProb = (LogP)AddLogP(*oldProb, newProb);
}
}
Vocab::reverse(wordNgram);
}
}
simpleNgram = false;
/*
* Copy all regular (non-class) words to the new vocabulary,
* including special tokens.
*/
SubVocab *newVocab = new SubVocab(vocab);
assert(newVocab);
VocabIter viter(vocab);
VocabIndex wordIndex;
VocabString wordString;
while ((wordString = viter.next(wordIndex))) {
if (!classVocab.getWord(wordIndex)) {
newVocab->addWord(wordString);
} else {
/*
* ensure all words in the class expansion are in the new vocab:
* this includes classes that occur in expansions of other classes,
* even though we currently don't support "context-free" rules
*/
Map2Iter2<VocabIndex, ClassExpansion, Prob>
iter(classDefs, wordIndex);
ClassExpansion expansion;
while (iter.next(expansion)) {
for (i = 0; expansion[i] != Vocab_None; i ++) {
VocabString className = classVocab.getWord(expansion[i]);
if (className) {
cerr << "warning: expansion of " << wordString
<< " -> " << (vocab.use(), expansion)
<< " refers to another class: " << className
<< endl;
newVocab->addWord(className);
}
}
}
}
}
/*
* Duplicate special word indices in new vocab
*/
newVocab->unkIndex() = vocab.unkIndex();
newVocab->ssIndex() = vocab.ssIndex();
newVocab->seIndex() = vocab.seIndex();
newVocab->pauseIndex() = vocab.pauseIndex();
newVocab->addNonEvent(vocab.ssIndex());
newVocab->addNonEvent(vocab.pauseIndex());
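/*
 * Note (added): non-events are tokens that may occur in contexts
 * but are never predicted as words by the model.
 */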
/*
* Create new ngram model (inherit debug level from class ngram)
*/
Ngram *ngram = new Ngram(*newVocab, newOrder);
assert(ngram != 0);
ngram->debugme(debuglevel());
/*
* For all ngrams, compute probabilities
*/
for (i = 0; i < newOrder; i++) {
LogP *contextProb;
NgramCountsIter<LogP> contextIter(ngramProbs, wordNgram, i);
/*
* This enumerates all contexts, i.e., i-grams.
*/
while ((contextProb = contextIter.next())) {
/*
* The probability of <s> is -Inf in the model, but it should be
* P(</s>) for purposes of normalization when computing the
* conditional probs below (consistent with LM::contextProb()).
*/
if (i == 1 && wordNgram[0] == vocab.ssIndex()) {
VocabIndex emptyContext = Vocab_None;
*contextProb = wordProb(vocab.seIndex(), &emptyContext);
}
VocabIndex word[2]; /* the word following the context */
NgramCountsIter<LogP> followIter(ngramProbs, wordNgram, word, 1);
LogP *ngramProb;
/*
* reverse context words in preparation for ngram prob insertion
*/
Vocab::reverse(wordNgram);
if (i + 1 >= expandExact) {
/*
* Exact conditional probability for expanded ngram:
* Run the forward algorithm (by way of wordProb).
*/
while ((ngramProb = followIter.next())) {
LogP lprob = wordProb(word[0], wordNgram);
if (lprob > LogP_One) {
if (LogPtoProb(lprob) - 1.0 > Prob_Epsilon) {
cerr << "bad conditional prob for \""
<< (vocab.use(), wordNgram) << "\": "
<< LogPtoProb(lprob) << " > 1\n";
}
lprob = LogP_One;
}
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), wordNgram)
<< " WORD " << vocab.getWord(word[0])
<< " EXACT LPROB " << lprob
<< endl;
}
*ngram->insertProb(word[0], wordNgram) = lprob;
}
} else {
/*
* Compute sum of all ngram probs
*/
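/*
 * Note (added): AddLogP(x, y) computes log10(10^x + 10^y) in a
 * numerically stable way, so probSum accumulates the total
 * probability mass of all follow words in the log domain.
 */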
LogP probSum = LogP_Zero;
while ((ngramProb = followIter.next())) {
probSum = (LogP)AddLogP(probSum, *ngramProb);
}
/*
 * Use the sum of the ngram probs as the context prob
 * - for the empty context, which has no joint prob of its own
 * - when the sum exceeds the stored context prob, which would
 *   otherwise produce conditional probs greater than one
 */
if (i == 0 || probSum > *contextProb) {
if (i > 0 &&
LogPtoProb(probSum) - LogPtoProb(*contextProb)
> Prob_Epsilon &&
debug(DEBUG_ESTIMATE_WARNINGS))
{
cerr << "warning: prob for context \""
<< (vocab.use(), wordNgram)
<< "\" lower than total ngram prob for words "
<< "(" << *contextProb << " < " << probSum << ")"
<< endl;
}
*contextProb = probSum;
}
/*
* Enumerate all words that can follow this context
*/
followIter.init();
while ((ngramProb = followIter.next())) {
LogP lprob = *ngramProb - *contextProb;
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), wordNgram)
<< " WORD " << vocab.getWord(word[0])
<< " NUMER " << *ngramProb
<< " DENOM " << *contextProb
<< " LPROB " << lprob
<< endl;
}
*ngram->insertProb(word[0], wordNgram) = lprob;
}
}
Vocab::reverse(wordNgram);
}
}
/*
* Complete new model estimation by filling in backoff weights
*/
ngram->recomputeBOWs();
return ngram;
}
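/*
 * Usage sketch (added; illustrative, not part of the original file):
 * how a caller might turn a class ngram into a word-level model,
 * roughly what the ngram(1) tool does for -expand-classes.  The file
 * names and the surrounding function are assumptions; the ClassNgram,
 * File, and expand() interfaces are the ones used above.
 */
#ifdef EXPAND_EXAMPLE
static Ngram *
expandExample(Vocab &vocab, SubVocab &classVocab)
{
    ClassNgram clm(vocab, classVocab, 3);

    File lmFile("class.3bo.lm", "r");		/* assumed file name */
    clm.read(lmFile);				/* class-based 3-gram */

    File classesFile("class.defs", "r");	/* assumed file name */
    clm.readClasses(classesFile);		/* class expansion rules */

    /*
     * newOrder = 3: build a word trigram;
     * expandExact = 2: exact (forward-algorithm) probs for ngrams of
     * length >= 2, approximate normalized probs for unigrams.
     */
    return clm.expand(3, 2);
}
#endif /* EXPAND_EXAMPLE */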
/*
* Enumerate all class expansions in a mixed word/class token string
*/
ClassNgramExpandIter::ClassNgramExpandIter(ClassNgram &ngram,
const VocabIndex *classes, VocabIndex *buffer)
: ngram(ngram), classes(classes), buffer(buffer),
expandIter(0), subIter(0), done(false)
{
/*
* Find the first class token in classes string
*/
for (firstClassPos = 0;
classes[firstClassPos] != Vocab_None;
firstClassPos++)
{
if (ngram.classVocab.getWord(classes[firstClassPos]) != 0) {
break;
}
}
/*
* If there is a class token, set up the iterator over its expansions
*/
if (classes[firstClassPos] != Vocab_None) {
expandIter = new Map2Iter2<VocabIndex,ClassExpansion,Prob>
(ngram.classDefs, classes[firstClassPos]);
assert(expandIter != 0);
}
/*
* Copy the words preceding the first class into the buffer
*/
for (unsigned i = 0; i < (unsigned)firstClassPos; i ++) {
buffer[i] = classes[i];
}
buffer[firstClassPos] = Vocab_None;
}
ClassNgramExpandIter::~ClassNgramExpandIter()
{
delete expandIter;
delete subIter;
}
/*
* Return the next class-expanded word string.
* Also return the aggregate probability of all expansions in the current
* string (prob), the expanded length of the first input token (firstLen),
* and the expanded length of the last input token (lastLen).
*/
VocabIndex *
ClassNgramExpandIter::next(LogP &prob, unsigned &firstLen, unsigned &lastLen)
{
if (done) {
return 0;
} else if (expandIter == 0) {
/*
* If the class iterator is not active, we have an all-words string
* and just return it.
*/
done = true;
prob = LogP_One;
firstLen = lastLen = (classes[0] == Vocab_None ? 0 : 1);
return buffer;
} else {
while (1) {
if (subIter == 0) {
/*
* The sub-iteration is done, advance to the next
* expansion of the first class.
*/
ClassExpansion expansion;
Prob *expandProb = expandIter->next(expansion);
if (expandProb == 0) {
done = true;
return 0;
} else {
/*
* Remember across invocations
*/
prob1 = ProbToLogP(*expandProb);
/*
* Append expansion to buffer, and record its length
*/
for (firstClassLen = 0;
expansion[firstClassLen] != Vocab_None;
firstClassLen ++)
{
buffer[firstClassPos + firstClassLen] =
expansion[firstClassLen];
}
/*
* Create recursive iterator to expand the
* remaining string
*/
subIter = new ClassNgramExpandIter(ngram,
&classes[firstClassPos + 1],
&buffer[firstClassPos + firstClassLen]);
assert(subIter != 0);
}
}
LogP subProb;
unsigned subFirstLen, subLastLen;
if (subIter->next(subProb, subFirstLen, subLastLen) == 0) {
/*
* expansion of rest string exhausted
* -- continue expanding first class
*/
delete subIter;
subIter = 0;
} else {
prob = prob1 + subProb;
firstLen = (firstClassPos == 0 ? firstClassLen : 1);
lastLen = (classes[firstClassPos + 1] == Vocab_None ?
firstClassLen : subLastLen);
return buffer;
}
}
}
}
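/*
 * Usage sketch (added; illustrative, not part of the original file):
 * enumerating all word expansions of a mixed word/class string with
 * the iterator above.  The function name is an assumption;
 * maxWordsPerLine is SRILM's usual line-buffer constant, and the
 * buffer is assumed large enough to hold any expansion.
 */
#ifdef EXPAND_ITER_EXAMPLE
static void
dumpExpansions(ClassNgram &clm, const VocabIndex *mixed)
{
    VocabIndex expanded[maxWordsPerLine + 1];
    LogP prob;
    unsigned firstLen, lastLen;

    ClassNgramExpandIter iter(clm, mixed, expanded);
    while (iter.next(prob, firstLen, lastLen)) {
	cout << (clm.vocab.use(), expanded)
	     << " logprob " << prob << endl;
    }
}
#endif /* EXPAND_ITER_EXAMPLE */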