📄 wordmesh.cc
字号:
Boolean foundP;
NBestWordInfo *oldInfo =
wordInfo[sortedAligns[i-1]]->
insert(otherWord, foundP);
if (foundP) {
oldInfo->merge(*otherInfo);
} else {
*oldInfo = *otherInfo;
}
}
Array<HypID> *otherHypList =
other.hypMap[other.sortedAligns[j-1]]->
find(otherWord);
if (otherHypList) {
/*
* Add hyp IDs to the hyp list for this word
* XXX: hyp IDs should be disjoint but there is no
* checking of this!
*/
Array<HypID> &hypList =
*hypMap[sortedAligns[i-1]]->
insert(otherWord);
for (unsigned h = 0; h < otherHypList->size(); h ++)
{
hypList[hypList.size()] = (*otherHypList)[h];
}
}
}
if (alignScores) {
alignScores[j-1] = totalScore;
}
}
columnPosteriors[sortedAligns[i-1]] +=
score * other.columnPosteriors[other.sortedAligns[j-1]];
transPosteriors[sortedAligns[i-1]] +=
score * other.transPosteriors[other.sortedAligns[j-1]];
i --; j --;
break;
case DEL_ALIGN:
*aligns[sortedAligns[i-1]]->insert(deleteIndex) +=
score * other.totalPosterior;
columnPosteriors[sortedAligns[i-1]] +=
score * other.totalPosterior;
transPosteriors[sortedAligns[i-1]] +=
score * other.totalPosterior;
/*
* Add all hyp IDs from "other" alignment to our delete hyp
*/
if (other.allHyps.numEntries() > 0) {
Array<HypID> &hypList =
*hypMap[sortedAligns[i-1]]->insert(deleteIndex);
SArrayIter<HypID,HypID> otherHypsIter(other.allHyps);
HypID id;
while (otherHypsIter.next(id)) {
hypList[hypList.size()] = id;
}
}
i --;
break;
case INS_ALIGN:
/*
* Make room for new alignment column in sortedAligns
* and position new column
*/
for (unsigned k = numAligns; k > i; k --) {
sortedAligns[k] = sortedAligns[k-1];
}
sortedAligns[i] = numAligns;
aligns[numAligns] = new LHash<VocabIndex,Prob>;
assert(aligns[numAligns] != 0);
wordInfo[numAligns] = new LHash<VocabIndex,NBestWordInfo>;
assert(wordInfo[numAligns] != 0);
hypMap[numAligns] = new LHash<VocabIndex,Array<HypID> >;
assert(hypMap[numAligns] != 0);
/*
* The transition posterior from the preceding to the following
* position becomes the posterior for *delete* at the new
* position (that's why we need transition posteriors!).
*/
Prob nullPosterior = totalPosterior;
if (i > 0) {
nullPosterior = transPosteriors[sortedAligns[i-1]];
}
if (nullPosterior != 0.0) {
*aligns[numAligns]->insert(deleteIndex) = nullPosterior;
}
columnPosteriors[numAligns] = nullPosterior;
transPosteriors[numAligns] = nullPosterior;
/*
* Add all hypIDs previously recorded to the *DELETE*
* hypothesis at the newly created position.
*/
{
Array<HypID> *hypList = 0;
SArrayIter<HypID,HypID> myIter(allHyps);
HypID id;
while (myIter.next(id)) {
/*
* Avoid inserting *DELETE* in hypMap unless needed
*/
if (hypList == 0) {
hypList = hypMap[numAligns]->insert(deleteIndex);
}
(*hypList)[hypList->size()] = id;
}
}
/*
* insert all words in "other" alignment at current position
*/
{
LHashIter<VocabIndex,Prob>
iter(*other.aligns[other.sortedAligns[j-1]]);
Prob *otherProb;
VocabIndex otherWord;
while (otherProb = iter.next(otherWord)) {
*aligns[numAligns]->insert(otherWord) +=
score * *otherProb;
/*
* Record word info if given
*/
NBestWordInfo *otherInfo =
other.wordInfo[other.sortedAligns[j-1]]->
find(otherWord);
if (otherInfo) {
*wordInfo[numAligns]->insert(otherWord) =
*otherInfo;
}
Array<HypID> *otherHypList =
other.hypMap[other.sortedAligns[j-1]]->
find(otherWord);
if (otherHypList) {
/*
* Add hyp IDs to the hyp list for this word
*/
Array<HypID> &hypList =
*hypMap[numAligns]->insert(otherWord);
for (unsigned h = 0; h < otherHypList->size(); h ++)
{
hypList[hypList.size()] = (*otherHypList)[h];
}
}
}
if (alignScores) {
alignScores[j-1] = score;
}
}
columnPosteriors[numAligns] +=
score * other.columnPosteriors[other.sortedAligns[j-1]];
transPosteriors[numAligns] +=
score * other.transPosteriors[other.sortedAligns[j-1]];
numAligns ++;
j --;
break;
}
}
}
/*
* Add hyps from "other" alignment to global list of our IDs
*/
SArrayIter<HypID,HypID> otherHypsIter(other.allHyps);
HypID id;
while (otherHypsIter.next(id)) {
*allHyps.insert(id) = id;
}
totalPosterior += score * other.totalPosterior;
}
/*
 * Incremental partial alignments using alignWords() can leave the
 * posteriors of null entries defective (because not all transition posteriors
 * were added to the alignment).
 * This function normalizes the null posteriors so that the total column
 * posteriors sum to the totalPosterior.
 */
void
WordMesh::normalizeDeletes()
{
for (unsigned i = 0; i < numAligns; i ++) {
LHashIter<VocabIndex,Prob> alignIter(*aligns[i]);
Prob wordPosteriorSum = 0.0;
Prob *prob;
VocabIndex word;
while (prob = alignIter.next(word)) {
if (word != deleteIndex) {
wordPosteriorSum += *prob;
}
}
Prob deletePosterior;
if (wordPosteriorSum - totalPosterior > Prob_Epsilon) {
cerr << "WordMesh::normalizeDeletes: word posteriors exceed total: "
<< wordPosteriorSum << endl;
deletePosterior = 0.0;
} else {
deletePosterior = totalPosterior - wordPosteriorSum;
}
if (deletePosterior < Prob_Epsilon) {
aligns[i]->remove(deleteIndex);
} else {
*aligns[i]->insert(deleteIndex) = deletePosterior;
}
columnPosteriors[i] = totalPosterior;
transPosteriors[i] = totalPosterior;
}
}
/*
* Compute minimal word error with respect to existing alignment columns
* (derived from WordAlign())
*/
/*
 * Align "words" against the mesh columns with a standard edit-distance DP
 * and return the minimal word error count; sub/ins/del receive the
 * per-category counts.  A hyp word counts as correct if it appears
 * anywhere in the corresponding alignment column; a deletion is free if
 * the column already contains *DELETE* with positive posterior.
 */
unsigned
WordMesh::wordError(const VocabIndex *words,
		    unsigned &sub, unsigned &ins, unsigned &del)
{
    unsigned hypLength = Vocab::length(words);
    unsigned refLength = numAligns;

    /*
     * DP chart cell: minimal cost of aligning the first i columns against
     * the first j hyp words, plus a backpointer encoding the last edit op.
     */
    typedef struct {
	unsigned cost;		// minimal cost of partial alignment
	WordAlignType error;	// best predecessor
    } ChartEntry;

    /*
     * Allocate chart statically, enlarging on demand
     * NOTE(review): the static cache makes this function non-reentrant and
     * not thread-safe; the chart is intentionally never freed.
     */
    static unsigned maxHypLength = 0;
    static unsigned maxRefLength = 0;
    static ChartEntry **chart = 0;

    if (hypLength > maxHypLength || refLength > maxRefLength) {
	/*
	 * Free old chart
	 */
	if (chart != 0) {
	    for (unsigned i = 0; i <= maxRefLength; i ++) {
		delete [] chart[i];
	    }
	    delete [] chart;
	}

	/*
	 * Allocate new chart
	 */
	maxHypLength = hypLength;
	maxRefLength = refLength;

	chart = new ChartEntry*[maxRefLength + 1];
	assert(chart != 0);

	for (unsigned i = 0; i <= maxRefLength; i ++) {
	    chart[i] = new ChartEntry[maxHypLength + 1];
	    assert(chart[i] != 0);
	}

	/*
	 * Initialize the 0'th row, which never changes:
	 * aligning j hyp words against zero columns costs j insertions.
	 */
	chart[0][0].cost = 0;
	chart[0][0].error = CORR_ALIGN;

	for (unsigned j = 1; j <= maxHypLength; j ++) {
	    chart[0][j].cost = chart[0][j-1].cost + INS_COST;
	    chart[0][j].error = INS_ALIGN;
	}
    }

    /*
     * Fill in the rest of the chart, row by row.
     */
    for (unsigned i = 1; i <= refLength; i ++) {
	/*
	 * deletion error only if alignment column doesn't have *DELETE*
	 * (a column that already allows *DELETE* can be skipped for free)
	 */
	Prob *delProb = aligns[sortedAligns[i-1]]->find(deleteIndex);
	unsigned THIS_DEL_COST = delProb && *delProb > 0.0 ? 0 : DEL_COST;

	// 0'th column: deleting all i columns so far
	chart[i][0].cost = chart[i-1][0].cost + THIS_DEL_COST;
	chart[i][0].error = DEL_ALIGN;

	for (unsigned j = 1; j <= hypLength; j ++) {
	    unsigned minCost;
	    WordAlignType minError;

	    // match/substitute: correct if the hyp word occurs in the column
	    if (aligns[sortedAligns[i-1]]->find(words[j-1])) {
		minCost = chart[i-1][j-1].cost;
		minError = CORR_ALIGN;
	    } else {
		minCost = chart[i-1][j-1].cost + SUB_COST;
		minError = SUB_ALIGN;
	    }

	    // delete this column
	    unsigned delCost = chart[i-1][j].cost + THIS_DEL_COST;

	    if (delCost < minCost) {
		minCost = delCost;
		minError = DEL_ALIGN;
	    }

	    // insert the hyp word
	    unsigned insCost = chart[i][j-1].cost + INS_COST;

	    if (insCost < minCost) {
		minCost = insCost;
		minError = INS_ALIGN;
	    }

	    chart[i][j].cost = minCost;
	    chart[i][j].error = minError;
	}
    }

    /*
     * Backtrace and add new words to alignment columns
     * (here the backtrace only tallies sub/ins/del counts)
     */
    {
	unsigned i = refLength;
	unsigned j = hypLength;

	sub = ins = del = 0;

	while (i > 0 || j > 0) {

	    switch (chart[i][j].error) {
	    case CORR_ALIGN:
		i --; j--;
		break;
	    case SUB_ALIGN:
		sub ++;
		i --; j --;
		break;
	    case DEL_ALIGN:
		/*
		 * deletion error only if alignment column doesn't
		 * have *DELETE*
		 */
		{
		    Prob *delProb =
			aligns[sortedAligns[i-1]]->find(deleteIndex);

		    if (!delProb || *delProb == 0.0) {
			del ++;
		    }
		}
		i --;
		break;
	    case INS_ALIGN:
		ins ++;
		j --;
		break;
	    }
	}
    }

    return sub + ins + del;
}
/*
 * Convenience overload: compute the minimum-expected-WER hypothesis and
 * return only the word IDs, discarding the per-word info.
 */
double
WordMesh::minimizeWordError(VocabIndex *words, unsigned length,
			    double &sub, double &ins, double &del,
			    unsigned flags, double delBias)
{
    /*
     * Delegate to the NBestWordInfo variant, then copy the word IDs out
     * up to (and including) the Vocab_None terminator.
     */
    NBestWordInfo *winfo = new NBestWordInfo[length];
    assert(winfo != 0);

    double result =
	minimizeWordError(winfo, length, sub, ins, del, flags, delBias);

    unsigned pos = 0;
    while (pos < length) {
	words[pos] = winfo[pos].word;
	if (words[pos] == Vocab_None) {
	    break;
	}
	pos ++;
    }

    delete [] winfo;

    return result;
}
/*
 * Extract the hypothesis minimizing expected word error by picking, in
 * each alignment column, the entry with the highest (delete-biased)
 * posterior.  Fills winfo (up to "length" entries, Vocab_None-terminated
 * when there is room) and returns the total expected error; sub/ins/del
 * receive the expected substitution/insertion/deletion error mass.
 * The "flags" argument is currently unused here.
 *
 * Fixes vs. previous version: stray double semicolon removed; bestProb
 * initialized (it was only conditionally assigned, though guarded by the
 * bestWord == Vocab_None test); assignment-in-condition parenthesized.
 */
double
WordMesh::minimizeWordError(NBestWordInfo *winfo, unsigned length,
			    double &sub, double &ins, double &del,
			    unsigned flags, double delBias)
{
    unsigned numWords = 0;

    sub = ins = del = 0.0;

    for (unsigned i = 0; i < numAligns; i ++) {
	LHashIter<VocabIndex,Prob> alignIter(*aligns[sortedAligns[i]]);

	Prob deleteProb = 0.0;
	Prob bestProb = 0.0;	// always overwritten on first iteration,
				// initialized to avoid compiler warnings
	VocabIndex bestWord = Vocab_None;

	Prob *prob;
	VocabIndex word;
	while ((prob = alignIter.next(word))) {
	    Prob effectiveProb = *prob;	// prob adjusted for deletion bias

	    /*
	     * The *DELETE* posterior is scaled by delBias before competing
	     * with real words for the best entry.
	     */
	    if (word == deleteIndex) {
		effectiveProb *= delBias;
		deleteProb = effectiveProb;
	    }

	    if (bestWord == Vocab_None || effectiveProb > bestProb) {
		bestWord = word;
		bestProb = effectiveProb;
	    }
	}

	if (bestWord != deleteIndex) {
	    /*
	     * Output the winning word, carrying over recorded word info
	     * when available.
	     */
	    if (numWords < length) {
		NBestWordInfo *thisInfo =
			wordInfo[sortedAligns[i]]->find(bestWord);

		if (thisInfo) {
		    winfo[numWords] = *thisInfo;
		} else {
		    winfo[numWords].word = bestWord;
		    winfo[numWords].invalidate();
		}

		/*
		 * Always return the word posterior into the NBestWordInfo
		 */
		winfo[numWords].wordPosterior = bestProb;
		winfo[numWords].transPosterior = bestProb;

		numWords += 1;
	    }

	    /*
	     * Delete mass becomes expected insertion error (we output a
	     * word where some hypotheses had none); the remaining
	     * competing mass counts as substitutions.
	     */
	    ins += deleteProb;
	    sub += totalPosterior - deleteProb - bestProb;
	} else {
	    /*
	     * We output nothing here; all non-delete mass is expected
	     * deletion error.
	     */
	    del += totalPosterior - deleteProb;
	}
    }

    /*
     * Terminate the output array if there is room.
     */
    if (numWords < length) {
	winfo[numWords].word = Vocab_None;
	winfo[numWords].invalidate();
    }

    return sub + ins + del;
}
/*
* Return confusion set for a given position
*/
/*
 * Return the confusion set (word -> posterior) at the given sorted
 * position, or a null pointer when the position is out of range.
 */
LHash<VocabIndex,Prob> *
WordMesh::wordColumn(unsigned columnNumber) {
    return (columnNumber < numAligns) ?
		aligns[sortedAligns[columnNumber]] : 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -