📄 wordlattice.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 3 页
字号:
     * Construct alignment class to node maps
     */
    Array< Array<unsigned> > alignMap(0, numAligns);

    for (i = 0; i < numNodes; i ++) {
	unsigned align = nodes[i].align;
	unsigned n = alignMap[align].size();
	alignMap[align][n] = i;
    }

    /*
     * Sort alignment classes topologically
     */
    Array<unsigned> sortedAligns(0, numAligns);
    unsigned numSortedAligns = alat.sortNodes(sortedAligns.data());

    /*
     * Expand sorted alignment classes back into nodes
     */
    i = 0;
    unsigned lastAlign;
    for (unsigned k = 0; k < numSortedAligns; k ++) {
	/*
	 * If current and previous alignment class are not ordere w.r.t.
	 * each other, then merge them.
	 */
	Prob prob;
	Boolean mergeAligns = 
		    k > 0 &&
			!alat.hasArc(sortedAligns[k-1], sortedAligns[k], prob);

	for (unsigned j = 0; j < alignMap[sortedAligns[k]].size(); j ++, i ++) {
	    sortedNodes[i] = alignMap[sortedAligns[k]][j];
	    if (mergeAligns) {
		nodes[sortedNodes[i]].align = lastAlign;
	    }
	}

	if (!mergeAligns) {
	    lastAlign = sortedAligns[k];
	}
    }
    
    return i;
}

/*
 * addWords --
 *	Add new nodes representing word string
 */
void
WordLattice::addWords(const VocabIndex *words, Prob score, const HypID *hypID)
{
    unsigned prevNode = initial;

    nodes[initial].score += score;

    for (unsigned i = 0; words[i] != Vocab_None; i ++) {
	unsigned newNode = numNodes ++;

	nodes[newNode].word = words[i];
	nodes[newNode].score = score;
	nodes[newNode].align = numAligns++;
	addArc(prevNode, newNode, score);

	prevNode = newNode;
    }

    addArc(prevNode, final, score);
    nodes[final].score += score;
}

/*
 * NOTE: MAX_COST is small enough that we can add any of the *_COST constants
 * without overflow.
 */
const unsigned MAX_COST = (unsigned)(-10);	// an impossible path

/*
 * error type used in tracing back word/lattice alignments
 */
typedef enum {
	ERR_NONE, ERR_SUB, ERR_INS, ERR_DEL, ERR_ARC
} ErrorType;

/*
 * costs for local lattice alignments
 * (note these are different from word alignment costs)
 */
const unsigned LAT_SUB_COST = 2;
const unsigned LAT_DEL_COST = 3;
const unsigned LAT_INS_COST = 3;
const unsigned LAT_ARC_COST = 1;

/*
 * alignWords --
 *	Add a word string to lattice by finding best alignment
 */
void
WordLattice::alignWords(const VocabIndex *words, Prob score, Prob *wordScores,
							const HypID *hypID)
{
    unsigned numWords = Vocab::length(words);

    /*
     * The states indexing the DP chart correspond to lattice nodes.
     */
    const unsigned NO_PRED = (unsigned)(-1);	// default for pred link

    makeArray(unsigned, sortedNodes, numNodes);

    unsigned numReachable = sortAlignedNodes(sortedNodes);
    if (numReachable != numNodes) {
	cerr << "WARNING: alignWords called with unreachable nodes\n";
    }

    /*
     * Build a mapping of words to lattice nodes to locate matching
     * nodes quickly.  Note we store the nodes by their index in
     * the topological sort since we have to compare order later.
     */
    LHash<VocabIndex, Array<unsigned> > nodeWordMap;

    unsigned j;
    for (j = 0; j < numReachable; j ++) {
	VocabIndex word = nodes[sortedNodes[j]].word;

	if (word != Vocab_None) {
	    Array<unsigned> *nodeList = nodeWordMap.insert(word);

	    (*nodeList)[nodeList->size()] = j;
	}
    }

    /*
     * Allocate the DP chart.
     * chartEntries are indexed by [word_position][lattice_node],
     * where word_position = 0 is the  left string margin,
     * word_position = numWords + 1 is the right string margin.
     */
    typedef struct {
	unsigned cost;		// minimal error to this state
	unsigned predNode;	// predecessor state used in getting there
	ErrorType errType;	// error type
    } ChartEntry;

    ChartEntry **chart = new ChartEntry *[numWords + 2];
    assert(chart != 0);

    unsigned i;
    for (i = 0; i <= numWords + 1; i ++) {
	chart[i] = new ChartEntry[numNodes];
	assert(chart[i] != 0);
	for (j = 0; j < numNodes; j ++) {
	    chart[i][j].cost = MAX_COST;
	    chart[i][j].predNode = NO_PRED;
	    chart[i][j].errType = ERR_NONE;
	}
    }

    /*
     * Prime the chart by anchoring the alignment at the left edge
     */
    chart[0][initial].cost = 0;
    chart[0][initial].predNode = initial;
    chart[0][initial].errType = ERR_NONE;

    /*
     * Insertions before the first word
     * NOTE: since we process nodes in topological order this
     * will allow chains of multiple insertions.
     */
    for (j = 0; j < numReachable; j ++) {
	unsigned curr = sortedNodes[j];
	WordLatticeNode &node = nodes[curr];
	unsigned insCost = chart[0][curr].cost + LAT_INS_COST;

	if (insCost >= MAX_COST) continue;

	for (unsigned s = 0; s < node.numSuccs; s ++) {
	    unsigned next = node.succs[s];

	    if (insCost < chart[0][next].cost) {
		chart[0][next].cost = insCost;
		chart[0][next].predNode = curr;
		chart[0][next].errType = ERR_INS;
	    }
	}
    }

    /*
     * For all word positions, compute minimal alignment werr for each
     * state.
     */
    for (i = 1; i <= numWords + 1; i ++) {
	/*
	 * Compute partial alignment cost for all lattice nodes
	 */
	unsigned j;
	for (j = 0; j < numReachable; j ++) {
	    unsigned curr = sortedNodes[j];
	    WordLatticeNode &node = nodes[curr];
	    unsigned cost = chart[i - 1][curr].cost;

	    if (cost >= MAX_COST) continue;

	    /*
	     * Deletion error: current word not matched by lattice
	     * To align the word string we need to create a new node
	     * as well as arc from/to previous/next nodes.
	     */
	    {
		unsigned delCost = cost + LAT_DEL_COST;

		if (delCost < chart[i][curr].cost) {
		    chart[i][curr].cost = delCost;
		    chart[i][curr].predNode = curr;
		    chart[i][curr].errType = ERR_DEL;
		}
	    }

	    /*
	     * Substitution errors:
	     * To align the word string we need to create a new node
	     * and arc from/to it.
	     */
	    for (unsigned s = 0; s < node.numSuccs; s ++) {
		unsigned next = node.succs[s];
		unsigned haveSub =
			(nodes[next].word == words[i - 1]) ? 0 : 1;
		unsigned subCost = cost + haveSub * LAT_SUB_COST;

		if (subCost < chart[i][next].cost) {
		    chart[i][next].cost = subCost;
		    chart[i][next].predNode = curr;
		    chart[i][next].errType = haveSub ? ERR_SUB : ERR_NONE;
		}
	    }

	    /*
	     * Arc deletion:
	     * To align the word string we need to create only a new
	     * arc between existing nodes.
	     * For efficiency we look up the nodes that could possibly
	     * match the next word, instead of checking all nodes that
	     * come after the current one in topological order.
	     */
	    Array<unsigned> *nodeList =
				(words[i - 1] == Vocab_None) ? 0 :
					nodeWordMap.find(words[i - 1]);

	    for (unsigned k = 0; nodeList && k < nodeList->size(); k ++) {
		unsigned nextIndex = (*nodeList)[k];

		/*
		 * Check the the next node comes after the current one
		 * in the topological sort.
		 */
		if (nextIndex > j) {
		    unsigned next = sortedNodes[nextIndex];
		    unsigned newCost = cost + LAT_ARC_COST;

		    if (newCost < chart[i][next].cost &&
			(nodes[curr].align == NO_ALIGN ||
			 nodes[curr].align != nodes[next].align))
		    {
			chart[i][next].cost = newCost;
			chart[i][next].predNode = curr;
			chart[i][next].errType = ERR_ARC;
		    }
		}
	    }
	}

	for (j = 0; j < numReachable; j ++) {
	    unsigned curr = sortedNodes[j];
	    WordLatticeNode &node = nodes[curr];
	    unsigned insCost = chart[i][curr].cost + LAT_INS_COST;

	    if (insCost >= MAX_COST) continue;

	    /*
	     * Insertion errors: lattice node not matched by word
	     * NOTE: since we process nodes in topological order this
	     * will allow chains of multiple insertions.
	     */
	    for (unsigned s = 0; s < node.numSuccs; s ++) {
		unsigned next = node.succs[s];

		if (insCost < chart[i][next].cost) {
		    chart[i][next].cost = insCost;
		    chart[i][next].predNode = curr;
		    chart[i][next].errType = ERR_INS;
		}
	    }
	}
    }

    /*
     * Viterbi backtrace to find best alignment and add new nodes/arcs
     */
    {
	unsigned bestPred = final;
	unsigned lastState;

	for (i = numWords + 1; ; i --) {
	    assert(chart[i][bestPred].predNode != NO_PRED);

	    unsigned thisState;

	    /*
	     * Skip over any "inserted" lattice nodes
	     */
	    while (bestPred != NO_PRED && chart[i][bestPred].errType == ERR_INS)
	    {
		bestPred = chart[i][bestPred].predNode;
	    }

	    if (chart[i][bestPred].errType == ERR_DEL ||
		chart[i][bestPred].errType == ERR_SUB)
	    {
		/*
		 * Create a new node
		 */
		thisState = numNodes ++;
		nodes[thisState].word = words[i - 1];
		nodes[thisState].score = score;
		if (chart[i][bestPred].errType == ERR_SUB) {
		    nodes[thisState].align = nodes[bestPred].align;
		} else {
		    nodes[thisState].align = numAligns++;
		}

		if (wordScores && i > 0 && i <= numWords) {
		    wordScores[i - 1] = score;
		}
	    } else {
		/*
		 * Add the score to an existing node
		 */
		thisState = bestPred;
		nodes[thisState].score += score;

		if (wordScores && i > 0 && i <= numWords) {
		    wordScores[i - 1] = nodes[thisState].score;
		}
	    }

	    /*
	     * Add arc between this and the successor state on the path,
	     * (unless we're at the final state).
	     * If the arc already exists then this just adds the score
	     * to the existing score.
	     */
	    if (thisState != final && thisState != lastState) {
		addArc(thisState, lastState, score);
	    }

	    bestPred = chart[i][bestPred].predNode;
	    lastState = thisState;

	    if (i == 0) break;
	}
	assert(lastState == initial);
    }


    /*
     * XXX: LHash lossage.
     */
    LHashIter<VocabIndex, Array<unsigned> > nodeWordIter(nodeWordMap);
    VocabIndex word;
    Array<unsigned> *nodeList;
    while (nodeList = nodeWordIter.next(word)) {
	nodeList->~Array();
    }

    for (i = 0; i <= numWords + 1; i ++) {
	delete [] chart[i];
    }
    delete [] chart;
}

/*
 * Search for path with lowest expected word error.
 */
class AlignErrors
{
public:
    AlignErrors() : subs(0.0), inss(0.0), dels(0.0) {};
    double subs, inss, dels;
};

typedef struct {
    double errs;		// total minimal error to this state
    unsigned pred;		// predecessor state used in getting there
    unsigned nnodes;
    double subs, inss, dels;// error counts by type
} ChartEntry;

double
WordLattice::minimizeWordError(VocabIndex *words, unsigned length,
				    double &sub, double &ins, double &del,
				    unsigned flags, double delBias)
				// delBias is ignored, unimplemented
{
    const unsigned NO_PRED = (unsigned)(-1);	// default for pred link
    double expectedError;

    /*
     * Sort nodes topologically respecting alignments
     */
    Array<unsigned> sortedNodes(0, numNodes);
    unsigned numReachable = sortAlignedNodes(sortedNodes.data());

    if (numReachable != numNodes) {
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -