📄 wordlattice.cc
字号:
cerr << "WARNING: bestPath2 called with unreachable nodes\n";
}
assert(sortedNodes[numReachable - 1] == final);
/*
* Create arrays to keep track of error counts by node and by edge
*/
Array<AlignErrors> nodeErrors(0, numNodes);
Array< Array<AlignErrors> > edgeErrors(0, numNodes);
/*
* Create the set of pending edges, initially empty
*/
LHash<NodePair,Boolean> pendingEdges;
unsigned numWords = 0; /* result word count */
double totalSubs = 0.0;
double totalInss = 0.0;
double totalDels = 0.0;
for (unsigned i = 0; i < numReachable; ) {
/*
* Find the end of the current alignment class.
* Also total the posterior probs for this class.
*/
Prob totalAlignPost = nodes[sortedNodes[i]].score;
unsigned endOfAlign = i + 1;
while (endOfAlign < numReachable &&
nodes[sortedNodes[i]].align ==
nodes[sortedNodes[endOfAlign]].align)
{
totalAlignPost += nodes[sortedNodes[endOfAlign]].score;
endOfAlign ++;
}
/*
* Compute insertion/deletion errors:
* assign deletion errors to edges straddling the alignment class
* assign insertion errors to nodes in the alignment class
* compute total posterior for all unfinished edges
*/
Prob totalEdgePost = 0.0;
LHashIter<NodePair,Boolean> edgeIter(pendingEdges);
NodePair currEdge;
while (edgeIter.next(currEdge)) {
/*
* Check if edge terminates with current alignment class
*/
Boolean edgeEnds = false;
for (unsigned k = i; k < endOfAlign; k ++) {
if (currEdge.node2 == sortedNodes[k]) {
edgeEnds = true;
break;
}
}
if (edgeEnds) {
//cerr << "finishing edge " << currEdge
// << " dels = "
// << edgeErrors[currEdge.node1][currEdge.node2].dels
// << endl;
pendingEdges.remove(currEdge);
} else {
Prob edgePost;
hasArc(currEdge.node1, currEdge.node2, edgePost);
totalEdgePost += edgePost;
edgeErrors[currEdge.node1][currEdge.node2].dels +=
totalAlignPost;
for (unsigned k = i; k < endOfAlign; k ++) {
nodeErrors[sortedNodes[k]].inss += edgePost;
}
}
}
/*
* Compute substitution errors
* and start edges emanating from this alignment class.
* Also compute word with highest posterior.
*/
VocabIndex bestWord = Vocab_None;
Prob bestPosterior = totalEdgePost;
for (unsigned k = i; k < endOfAlign; k ++) {
unsigned currNode = sortedNodes[k];
if (nodes[currNode].score > bestPosterior) {
bestWord = nodes[currNode].word;
bestPosterior = nodes[currNode].score;
}
nodeErrors[currNode].subs += totalAlignPost - nodes[currNode].score;
//cerr << "node " << currNode
// << " subs = " << nodeErrors[currNode].subs
// << " inss = " << nodeErrors[currNode].inss << endl;
for (unsigned j = 0; j < nodes[currNode].numSuccs; j ++) {
NodePair newEdge(currNode, nodes[currNode].succs[j]);
pendingEdges.insert(newEdge);
//cerr << "starting edge " << newEdge << endl;
}
}
/*
* Save best word for this position
*/
if (sortedNodes[i] == initial || sortedNodes[i] == final) {
;
} else if (bestWord == Vocab_None) {
totalDels += totalAlignPost;
} else {
if (numWords < length) {
words[numWords ++] = bestWord;
}
totalInss += totalEdgePost;
totalSubs += totalAlignPost - bestPosterior;
}
/*
* Advance to next alignment class
*/
i = endOfAlign;
}
/*
* With the "no viterbi" option we return the sequence of most probable
* words.
*/
if (flags|WORDLATTICE_NOVITERBI) {
if (numWords < length) {
words[numWords] = Vocab_None;
}
sub = totalSubs;
ins = totalInss;
del = totalDels;
return totalSubs + totalInss + totalDels;
}
/*
* Viterbi search for path with lowest total error
*/
Array<ChartEntry> chart(0, numNodes);
unsigned j;
for (j = 0; j < numNodes; j ++) {
chart[j].errs = 0.0;
chart[j].pred = NO_PRED;
chart[j].nnodes = 0;
chart[j].subs = chart[j].inss = chart[j].dels = 0.0;
}
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
if (curr != initial && chart[curr].pred == NO_PRED) continue;
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
double newSubs = chart[curr].subs +
edgeErrors[curr][next].subs +
nodeErrors[next].subs;
double newInss = chart[curr].inss +
edgeErrors[curr][next].inss +
nodeErrors[next].inss;
double newDels = chart[curr].dels +
edgeErrors[curr][next].dels +
nodeErrors[next].dels;
double newErrs = newSubs + newInss + newDels;
if (chart[next].pred == NO_PRED || newErrs < chart[next].errs) {
chart[next].pred = curr;
chart[next].errs = newErrs;
chart[next].subs = newSubs;
chart[next].inss = newInss;
chart[next].dels = newDels;
chart[next].nnodes = chart[curr].nnodes + 1;
}
}
}
/*
* Trace back the best path
*/
if (chart[final].pred == NO_PRED) {
cerr << "WARNING: minimizeWordError: final node not reachable\n";
if (length > 0) {
words[0] = Vocab_None;
}
expectedError = 0.0;
} else {
sub = chart[final].subs;
ins = chart[final].inss;
del = chart[final].dels;
unsigned curr = final;
numWords = 0;
while (numWords < length && curr != NO_PRED) {
if (nodes[curr].word != Vocab_None) {
words[numWords ++] = nodes[curr].word;
}
curr = chart[curr].pred;
}
if (numWords < length) {
words[numWords] = Vocab_None;
}
if (curr != NO_PRED) {
cerr << "WARNING: minimizeWordError: word buffer too short\n";
}
Vocab::reverse(words);
expectedError = chart[final].errs;
}
return expectedError;
}
/*
* wordError --
* compute minimal word error of path through lattice
*/
unsigned
WordLattice::wordError(const VocabIndex *words,
unsigned &sub, unsigned &ins, unsigned &del)
{
unsigned numWords = Vocab::length(words);
/*
* The states indexing the DP chart correspond to lattice nodes.
*/
const unsigned NO_PRED = (unsigned)(-1); // default for pred link
makeArray(unsigned, sortedNodes, numNodes);
unsigned numReachable = sortNodes(sortedNodes);
if (numReachable != numNodes) {
cerr << "WARNING: alignWords called with unreachable nodes\n";
}
/*
* Allocate the DP chart.
* chartEntries are indexed by [word_position][lattice_node],
* where word_position = 0 is the left string margin,
* word_position = numWords + 1 is the right string margin.
*/
typedef struct {
unsigned cost; // minimal path cost to this state
unsigned ins, del, sub; // error counts by type
unsigned predNode; // predecessor state used in getting there
ErrorType errType; // error type
} ChartEntry;
ChartEntry **chart = new ChartEntry *[numWords + 2];
assert(chart != 0);
unsigned i;
for (i = 0; i <= numWords + 1; i ++) {
chart[i] = new ChartEntry[numNodes];
assert(chart[i] != 0);
for (unsigned j = 0; j < numNodes; j ++) {
chart[i][j].cost = MAX_COST;
chart[i][j].sub = chart[i][j].ins = chart[i][j].del = 0;
chart[i][j].predNode = NO_PRED;
chart[i][j].errType = ERR_NONE;
}
}
/*
* Prime the chart by anchoring the alignment at the left edge
*/
chart[0][initial].cost = 0;
chart[0][initial].ins = chart[0][initial].del = chart[0][initial].sub = 0;
chart[0][initial].predNode = initial;
chart[0][initial].errType = ERR_NONE;
/*
* Insertions before the first word
* NOTE: since we process nodes in topological order this
* will allow chains of multiple insertions.
*/
for (unsigned j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned insCost = chart[0][curr].cost + INS_COST;
if (insCost >= MAX_COST) continue;
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
if (insCost < chart[0][next].cost) {
chart[0][next].cost = insCost;
chart[0][next].ins = chart[0][curr].ins + 1;
chart[0][next].del = chart[0][curr].del;
chart[0][next].sub = chart[0][curr].sub;
chart[0][next].predNode = curr;
chart[0][next].errType = ERR_INS;
}
}
}
/*
* For all word positions, compute minimal cost alignment for each
* state.
*/
for (i = 1; i <= numWords + 1; i ++) {
/*
* Compute partial alignment cost for all lattice nodes
*/
unsigned j;
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned cost = chart[i - 1][curr].cost;
if (cost >= MAX_COST) continue;
/*
* Deletion error: current word not matched by lattice
*/
{
unsigned delCost = cost + DEL_COST;
if (delCost < chart[i][curr].cost) {
chart[i][curr].cost = delCost;
chart[i][curr].del = chart[i - 1][curr].del + 1;
chart[i][curr].ins = chart[i - 1][curr].ins;
chart[i][curr].sub = chart[i - 1][curr].sub;
chart[i][curr].predNode = curr;
chart[i][curr].errType = ERR_DEL;
}
}
/*
* Substitution errors
*/
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
unsigned haveSub =
(nodes[next].word == words[i - 1]) ? 0 : 1;
unsigned subCost = cost + haveSub * SUB_COST;
if (subCost < chart[i][next].cost) {
chart[i][next].cost = subCost;
chart[i][next].sub = chart[i - 1][curr].sub + haveSub;
chart[i][next].ins = chart[i - 1][curr].ins;
chart[i][next].del = chart[i - 1][curr].del;
chart[i][next].predNode = curr;
chart[i][next].errType = haveSub ? ERR_SUB : ERR_NONE;
}
}
}
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned insCost = chart[i][curr].cost + INS_COST;
if (insCost >= MAX_COST) continue;
/*
* Insertion errors: lattice node not matched by word
* NOTE: since we process nodes in topological order this
* will allow chains of multiple insertions.
*/
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
if (insCost < chart[i][next].cost) {
chart[i][next].cost = insCost;
chart[i][next].ins = chart[i][curr].ins + 1;
chart[i][next].del = chart[i][curr].del;
chart[i][next].sub = chart[i][curr].sub;
chart[i][next].predNode = curr;
chart[i][next].errType = ERR_INS;
}
}
}
}
if (chart[numWords + 1][final].predNode == NO_PRED) {
/*
* Final node is unreachable
*/
sub = ins = 0;
del = numWords;
} else {
sub = chart[numWords + 1][final].sub;
ins = chart[numWords + 1][final].ins;
del = chart[numWords + 1][final].del;
}
for (i = 0; i <= numWords + 1; i ++) {
delete [] chart[i];
}
delete [] chart;
return sub + ins + del;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -