📄 wordlattice.cc
字号:
* Construct alignment class to node maps
*/
Array< Array<unsigned> > alignMap(0, numAligns);
for (i = 0; i < numNodes; i ++) {
unsigned align = nodes[i].align;
unsigned n = alignMap[align].size();
alignMap[align][n] = i;
}
/*
* Sort alignment classes topologically
*/
Array<unsigned> sortedAligns(0, numAligns);
unsigned numSortedAligns = alat.sortNodes(sortedAligns.data());
/*
* Expand sorted alignment classes back into nodes
*/
i = 0;
unsigned lastAlign;
for (unsigned k = 0; k < numSortedAligns; k ++) {
/*
* If current and previous alignment class are not ordere w.r.t.
* each other, then merge them.
*/
Prob prob;
Boolean mergeAligns =
k > 0 &&
!alat.hasArc(sortedAligns[k-1], sortedAligns[k], prob);
for (unsigned j = 0; j < alignMap[sortedAligns[k]].size(); j ++, i ++) {
sortedNodes[i] = alignMap[sortedAligns[k]][j];
if (mergeAligns) {
nodes[sortedNodes[i]].align = lastAlign;
}
}
if (!mergeAligns) {
lastAlign = sortedAligns[k];
}
}
return i;
}
/*
* addWords --
* Add new nodes representing word string
*/
void
WordLattice::addWords(const VocabIndex *words, Prob score, const HypID *hypID)
{
unsigned prevNode = initial;
nodes[initial].score += score;
for (unsigned i = 0; words[i] != Vocab_None; i ++) {
unsigned newNode = numNodes ++;
nodes[newNode].word = words[i];
nodes[newNode].score = score;
nodes[newNode].align = numAligns++;
addArc(prevNode, newNode, score);
prevNode = newNode;
}
addArc(prevNode, final, score);
nodes[final].score += score;
}
/*
* NOTE: MAX_COST is small enough that we can add any of the *_COST constants
* without overflow.
*/
const unsigned MAX_COST = (unsigned)(-10); // an impossible path
/*
* error type used in tracing back word/lattice alignments
*/
typedef enum {
ERR_NONE, ERR_SUB, ERR_INS, ERR_DEL, ERR_ARC
} ErrorType;
/*
* costs for local lattice alignments
* (note these are different from word alignment costs)
*/
const unsigned LAT_SUB_COST = 2;
const unsigned LAT_DEL_COST = 3;
const unsigned LAT_INS_COST = 3;
const unsigned LAT_ARC_COST = 1;
/*
* alignWords --
* Add a word string to lattice by finding best alignment
*/
void
WordLattice::alignWords(const VocabIndex *words, Prob score, Prob *wordScores,
const HypID *hypID)
{
unsigned numWords = Vocab::length(words);
/*
* The states indexing the DP chart correspond to lattice nodes.
*/
const unsigned NO_PRED = (unsigned)(-1); // default for pred link
makeArray(unsigned, sortedNodes, numNodes);
unsigned numReachable = sortAlignedNodes(sortedNodes);
if (numReachable != numNodes) {
cerr << "WARNING: alignWords called with unreachable nodes\n";
}
/*
* Build a mapping of words to lattice nodes to locate matching
* nodes quickly. Note we store the nodes by their index in
* the topological sort since we have to compare order later.
*/
LHash<VocabIndex, Array<unsigned> > nodeWordMap;
unsigned j;
for (j = 0; j < numReachable; j ++) {
VocabIndex word = nodes[sortedNodes[j]].word;
if (word != Vocab_None) {
Array<unsigned> *nodeList = nodeWordMap.insert(word);
(*nodeList)[nodeList->size()] = j;
}
}
/*
* Allocate the DP chart.
* chartEntries are indexed by [word_position][lattice_node],
* where word_position = 0 is the left string margin,
* word_position = numWords + 1 is the right string margin.
*/
typedef struct {
unsigned cost; // minimal error to this state
unsigned predNode; // predecessor state used in getting there
ErrorType errType; // error type
} ChartEntry;
ChartEntry **chart = new ChartEntry *[numWords + 2];
assert(chart != 0);
unsigned i;
for (i = 0; i <= numWords + 1; i ++) {
chart[i] = new ChartEntry[numNodes];
assert(chart[i] != 0);
for (j = 0; j < numNodes; j ++) {
chart[i][j].cost = MAX_COST;
chart[i][j].predNode = NO_PRED;
chart[i][j].errType = ERR_NONE;
}
}
/*
* Prime the chart by anchoring the alignment at the left edge
*/
chart[0][initial].cost = 0;
chart[0][initial].predNode = initial;
chart[0][initial].errType = ERR_NONE;
/*
* Insertions before the first word
* NOTE: since we process nodes in topological order this
* will allow chains of multiple insertions.
*/
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned insCost = chart[0][curr].cost + LAT_INS_COST;
if (insCost >= MAX_COST) continue;
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
if (insCost < chart[0][next].cost) {
chart[0][next].cost = insCost;
chart[0][next].predNode = curr;
chart[0][next].errType = ERR_INS;
}
}
}
/*
* For all word positions, compute minimal alignment werr for each
* state.
*/
for (i = 1; i <= numWords + 1; i ++) {
/*
* Compute partial alignment cost for all lattice nodes
*/
unsigned j;
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned cost = chart[i - 1][curr].cost;
if (cost >= MAX_COST) continue;
/*
* Deletion error: current word not matched by lattice
* To align the word string we need to create a new node
* as well as arc from/to previous/next nodes.
*/
{
unsigned delCost = cost + LAT_DEL_COST;
if (delCost < chart[i][curr].cost) {
chart[i][curr].cost = delCost;
chart[i][curr].predNode = curr;
chart[i][curr].errType = ERR_DEL;
}
}
/*
* Substitution errors:
* To align the word string we need to create a new node
* and arc from/to it.
*/
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
unsigned haveSub =
(nodes[next].word == words[i - 1]) ? 0 : 1;
unsigned subCost = cost + haveSub * LAT_SUB_COST;
if (subCost < chart[i][next].cost) {
chart[i][next].cost = subCost;
chart[i][next].predNode = curr;
chart[i][next].errType = haveSub ? ERR_SUB : ERR_NONE;
}
}
/*
* Arc deletion:
* To align the word string we need to create only a new
* arc between existing nodes.
* For efficiency we look up the nodes that could possibly
* match the next word, instead of checking all nodes that
* come after the current one in topological order.
*/
Array<unsigned> *nodeList =
(words[i - 1] == Vocab_None) ? 0 :
nodeWordMap.find(words[i - 1]);
for (unsigned k = 0; nodeList && k < nodeList->size(); k ++) {
unsigned nextIndex = (*nodeList)[k];
/*
* Check the the next node comes after the current one
* in the topological sort.
*/
if (nextIndex > j) {
unsigned next = sortedNodes[nextIndex];
unsigned newCost = cost + LAT_ARC_COST;
if (newCost < chart[i][next].cost &&
(nodes[curr].align == NO_ALIGN ||
nodes[curr].align != nodes[next].align))
{
chart[i][next].cost = newCost;
chart[i][next].predNode = curr;
chart[i][next].errType = ERR_ARC;
}
}
}
}
for (j = 0; j < numReachable; j ++) {
unsigned curr = sortedNodes[j];
WordLatticeNode &node = nodes[curr];
unsigned insCost = chart[i][curr].cost + LAT_INS_COST;
if (insCost >= MAX_COST) continue;
/*
* Insertion errors: lattice node not matched by word
* NOTE: since we process nodes in topological order this
* will allow chains of multiple insertions.
*/
for (unsigned s = 0; s < node.numSuccs; s ++) {
unsigned next = node.succs[s];
if (insCost < chart[i][next].cost) {
chart[i][next].cost = insCost;
chart[i][next].predNode = curr;
chart[i][next].errType = ERR_INS;
}
}
}
}
/*
* Viterbi backtrace to find best alignment and add new nodes/arcs
*/
{
unsigned bestPred = final;
unsigned lastState;
for (i = numWords + 1; ; i --) {
assert(chart[i][bestPred].predNode != NO_PRED);
unsigned thisState;
/*
* Skip over any "inserted" lattice nodes
*/
while (bestPred != NO_PRED && chart[i][bestPred].errType == ERR_INS)
{
bestPred = chart[i][bestPred].predNode;
}
if (chart[i][bestPred].errType == ERR_DEL ||
chart[i][bestPred].errType == ERR_SUB)
{
/*
* Create a new node
*/
thisState = numNodes ++;
nodes[thisState].word = words[i - 1];
nodes[thisState].score = score;
if (chart[i][bestPred].errType == ERR_SUB) {
nodes[thisState].align = nodes[bestPred].align;
} else {
nodes[thisState].align = numAligns++;
}
if (wordScores && i > 0 && i <= numWords) {
wordScores[i - 1] = score;
}
} else {
/*
* Add the score to an existing node
*/
thisState = bestPred;
nodes[thisState].score += score;
if (wordScores && i > 0 && i <= numWords) {
wordScores[i - 1] = nodes[thisState].score;
}
}
/*
* Add arc between this and the successor state on the path,
* (unless we're at the final state).
* If the arc already exists then this just adds the score
* to the existing score.
*/
if (thisState != final && thisState != lastState) {
addArc(thisState, lastState, score);
}
bestPred = chart[i][bestPred].predNode;
lastState = thisState;
if (i == 0) break;
}
assert(lastState == initial);
}
/*
* XXX: LHash lossage.
*/
LHashIter<VocabIndex, Array<unsigned> > nodeWordIter(nodeWordMap);
VocabIndex word;
Array<unsigned> *nodeList;
while (nodeList = nodeWordIter.next(word)) {
nodeList->~Array();
}
for (i = 0; i <= numWords + 1; i ++) {
delete [] chart[i];
}
delete [] chart;
}
/*
* Search for path with lowest expected word error.
*/
class AlignErrors
{
public:
AlignErrors() : subs(0.0), inss(0.0), dels(0.0) {};
double subs, inss, dels;
};
typedef struct {
double errs; // total minimal error to this state
unsigned pred; // predecessor state used in getting there
unsigned nnodes;
double subs, inss, dels;// error counts by type
} ChartEntry;
double
WordLattice::minimizeWordError(VocabIndex *words, unsigned length,
double &sub, double &ins, double &del,
unsigned flags, double delBias)
// delBias is ignored, unimplemented
{
const unsigned NO_PRED = (unsigned)(-1); // default for pred link
double expectedError;
/*
* Sort nodes topologically respecting alignments
*/
Array<unsigned> sortedNodes(0, numNodes);
unsigned numReachable = sortAlignedNodes(sortedNodes.data());
if (numReachable != numNodes) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -