/* nbest-optimize.cc */
if (decipherScores) {
/*
 * Read Decipher scores as floats even though they are supposed
 * to be ints. This way we accommodate some preexisting rescoring
 * programs.
 */
if (sscanf(line, "(%lf)", &score) != 1) {
file.position() << "bad Decipher score: " << line << endl;
break;
} else {
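/*
 * BytelogToLogP() converts Decipher's integer "bytelog" score
 * scale to SRILM's internal log probabilities; the float read
 * above is truncated back to an int before conversion.
 */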
scores[hypNo ++] = BytelogToLogP((int)score);
}
} else {
if (sscanf(line, "%lf", &score) != 1) {
file.position() << "bad score: " << line << endl;
break;
} else {
scores[hypNo ++] = score;
}
}
}
/*
* Set missing scores to zero
*/
if (!file.error() && hypNo < numHyps) {
cerr << "warning: " << (numHyps - hypNo) << " scores missing from "
<< fileName << endl;
}
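/*
 * Hyps without a score default to 0, so they contribute nothing
 * extra to the weighted score combination.
 */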
while (hypNo < numHyps) {
scores[hypNo ++] = 0;
}
return !file.error();
}
/*
* Read error counts file
*/
Boolean
readErrorsFile(const char *errorsDir, RefString id, NBestList &nbest,
unsigned &numWords)
{
unsigned numHyps = nbest.numHyps();
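/*
 * The buffer must hold "errorsDir/id" plus room for an optional
 * GZIP_SUFFIX and the terminating NUL, in case we fall back to
 * the gzipped file below.
 */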
makeArray(char, fileName,
strlen(errorsDir) + 1 + strlen(id) + strlen(GZIP_SUFFIX) + 1);
sprintf(fileName, "%s/%s", errorsDir, id);
/*
 * If the plain file doesn't exist, try the gzipped version
 */
FILE *fp;
if ((fp = fopen(fileName, "r")) == NULL) {
strcat(fileName, GZIP_SUFFIX);
} else {
fclose(fp);
}
File file(fileName, "r", 0);
char *line;
unsigned hypNo = 0;
while (!file.error() && (line = file.getline())) {
if (hypNo >= numHyps) {
break;
}
/*
* parse errors line
*/
float corrRate, errRate;
unsigned numSub, numDel, numIns, numErrs, numWds;
if (sscanf(line, "%f %f %u %u %u %u %u", &corrRate, &errRate,
&numSub, &numDel, &numIns, &numErrs, &numWds) != 7)
{
file.position() << "bad errors: " << line << endl;
return 0;
} else if (hypNo > 0 && numWds != numWords) {
file.position() << "inconsistent number of words: " << line << endl;
return 0;
} else {
if (hypNo == 0) {
numWords = numWds;
}
nbest.getHyp(hypNo ++).numErrors = numErrs;
}
}
if (hypNo < numHyps) {
file.position() << "too few errors lines" << endl;
return 0;
}
return !file.error();
}
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
argc = Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
if (!nbestFiles) {
cerr << "cannot proceed without nbest files\n";
exit(2);
}
if (!oneBest && !refFile) {
cerr << "cannot proceed without references\n";
exit(2);
}
if (oneBest && !refFile && !errorsDir) {
cerr << "cannot proceed without references or error counts\n";
exit(2);
}
if ((oneBest || oneBestFirst) && !initSimplex) {
cerr << "1-best optimization only supported in simplex mode\n";
exit(2);
}
Vocab vocab;
NullLM nullLM(vocab);
RefList refs(vocab);
NBestSet trainSet(vocab, refs, maxNbest, false, multiwords);
trainSet.debugme(debug);
trainSet.warn = false; // don't warn about missing refs
if (vocabFile) {
File file(vocabFile, "r");
vocab.read(file);
}
vocab.toLower() = toLower ? true : false;
/*
* Skip noise tags in scoring
*/
if (noiseVocabFile) {
File file(noiseVocabFile, "r");
nullLM.noiseVocab.read(file);
}
if (noiseTag) { /* backward compatibility */
nullLM.noiseVocab.addWord(noiseTag);
}
/*
* Optionally read a subvocabulary that is to be kept separate from
* regular words during alignment
*/
SubVocab hiddenVocab(vocab);
if (hiddenVocabFile) {
File file(hiddenVocabFile, "r");
hiddenVocab.read(file);
}
SubVocabDistance subvocabDistance(vocab, hiddenVocab);
/*
* Posterior scaling: if not specified (= 0.0) use LMW for
* backward compatibility.
*/
if (posteriorScale == 0.0) {
posteriorScale = (rescoreLMW == 0.0) ? 1.0 : rescoreLMW;
}
if (refFile) {
cerr << "reading references...\n";
File file(refFile, "r");
refs.read(file, true); // add reference words to vocabulary
}
{
cerr << "reading nbest lists...\n";
File file(nbestFiles, "r");
trainSet.read(file);
}
/*
 * There are three scores in the N-best list, plus as many as the
 * user supplies in separate directories on the command line.
 */
numScores = 3 + argc - 1;
numFixedWeights = 0;
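/*
 * Initial weights implement the usual rescoring combination
 * (acoustic + LMW * language + WTW * numWords), divided by
 * posteriorScale. E.g., rescoreLMW == 8 with posteriorScale left
 * at its 0.0 default gives lambdas (1/8, 1, rescoreWTW/8).
 */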
lambdas[0] = 1/posteriorScale;
lambdas[1] = rescoreLMW/posteriorScale;
lambdas[2] = rescoreWTW/posteriorScale;
for (unsigned i = 0; i < 3; i ++) {
fixLambdas[i] = false;
lambdaSteps[i] = 1.0;
}
for (unsigned i = 3; i < numScores; i ++) {
lambdas[i] = 0.0;
fixLambdas[i] = false;
lambdaSteps[i] = 1.0;
}
/*
* Store directory names needed to write nbest-rover file
*/
{
/*
* infer nbest directory name from first file in list
*/
NBestSetIter iter(trainSet);
RefString id;
const char *nbestFilename = iter.nextFile(id);
if (nbestFilename) {
nbestDirectory = strdup(nbestFilename);
assert(nbestDirectory != 0);
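/*
 * Strip the filename component, keeping the directory part
 * (roughly what dirname(1) does).
 */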
char *basename = strrchr(nbestDirectory, '/');
if (basename != 0) {
*basename = '\0';
} else {
strcpy(nbestDirectory, ".");
}
} else {
nbestDirectory = ".";
}
}
scoreDirectories = &argv[1];
/*
* Initialize lambdas from command line values if specified
*/
if (initLambdas) {
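/*
 * initLambdas is parsed as a whitespace-separated list of
 * weights; a weight prefixed with '=' is kept fixed during
 * optimization.
 */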
unsigned offset = 0;
for (unsigned i = 0; i < numScores; i ++) {
int consumed = 0;	/* %n requires an int argument */
if (sscanf(&initLambdas[offset], " =%lf%n",
&lambdas[i], &consumed) > 0)
{
lambdas[i] /= posteriorScale;
fixLambdas[i] = true;
numFixedWeights++;
} else if (sscanf(&initLambdas[offset], "%lf%n",
&lambdas[i], &consumed) > 0)
{
lambdas[i] /= posteriorScale;
lambdaSteps[i] = 1.0;
} else {
break;
}
offset += consumed;
}
}
/*
* Initialize simplex points
*/
if (initSimplex) {
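/*
 * initSimplex gives per-dimension step sizes for the simplex
 * (Amoeba) search; a step of 0 freezes that parameter, and a
 * trailing extra value sets the step for posteriorScale itself.
 */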
unsigned offset = 0;
int consumed = 0;	/* %n requires an int argument */
for (unsigned i = 0; i < numScores; i++) {
if (!fixLambdas[i]) {
if (sscanf(&initSimplex[offset], "%lf%n",
&lambdaSteps[i], &consumed) <= 0)
{
break;
}
if (lambdaSteps[i] == 0.0) {
cerr << "Fixing " << i << "th parameter\n";
fixLambdas[i] = true;
numFixedWeights++;
}
offset += consumed;
}
}
sscanf(&initSimplex[offset], "%lf%n", &posteriorScaleStep, &consumed);
}
/*
* Set up the score matrices
*/
cerr << "reading scores...\n";
NBestSetIter iter(trainSet);
RefString id;
NBestList *nbest;
while ((nbest = iter.next(id))) {
/*
* Allocate score matrix for this nbest list
*/
NBestScore **scores = new NBestScore *[numScores];
assert(scores != 0);
for (unsigned i = 0; i < numScores; i ++) {
scores[i] = new NBestScore[nbest->numHyps()];
assert(scores[i] != 0);
}
/*
* Transfer the standard scores from N-best list to score matrix
*/
for (unsigned j = 0; j < nbest->numHyps(); j ++) {
scores[0][j] = nbest->getHyp(j).acousticScore;
scores[1][j] = nbest->getHyp(j).languageScore;
scores[2][j] = (NBestScore) nbest->getHyp(j).numWords;
}
/*
* Read additional scores
*/
for (unsigned i = 1; i < (unsigned) argc; i ++) {
if (!readScoreFile(argv[i], id, scores[i + 2], nbest->numHyps())) {
cerr << "warning: error reading scores for " << id
<< " from " << argv[i] << endl;
}
}
/*
* Scale scores to help prevent underflow
*/
if (!combineLinear) {
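/*
 * Subtracting hyp 0's score from each column shifts every
 * knowledge source by a constant, which leaves posteriors
 * unchanged but keeps the later exponentiation within range.
 * The loop runs backwards so scores[i][0] is zeroed last.
 */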
for (unsigned i = 0; i < numScores; i ++) {
for (unsigned j = nbest->numHyps(); j > 0; j --) {
scores[i][j-1] -= scores[i][0];
}
}
}
/*
* save score matrix under nbest id
*/
*nbestScores.insert(id) = scores;
}
if (debug >= DEBUG_SCORES) {
dumpScores(cerr, trainSet);
}
cerr << (errorsDir ? "reading" : "computing") << " error counts...\n";
iter.init();
numRefWords = 0;
/*
* Compute hyp errors
*/
while ((nbest = iter.next(id))) {
unsigned numWords;
VocabIndex *ref = refs.findRef(id);
if (!(ref || ((oneBest && !oneBestFirst) && errorsDir))) {
cerr << "missing reference for " << id << endl;
exit(1);
}
/*
* Remove pauses and noise from nbest hyps since these would
* confuse the inter-hyp alignments.
*/
nbest->removeNoise(nullLM);
/*
 * In 1-best mode we only need the error counts for each hypothesis;
 * in sausage (default) mode we need to construct a multiple
 * alignment of the reference and all N-best hyps.
 */
if (errorsDir) {
/*
* read error counts
*/
if (!readErrorsFile(errorsDir, id, *nbest, numWords)) {
cerr << "couldn't get error counts for " << id << endl;
exit(2);
}
} else {
/*
* need to recompute hyp errors (after removeNoise() above)
*/
unsigned sub, ins, del;
nbest->wordError(ref, sub, ins, del);
numWords = Vocab::length(ref);
}
/*
* compute total length of references for later normalizations
*/
numRefWords += numWords;
}
cerr << numRefWords << " reference words\n";
/*
* preemptive trouble avoidance: prevent division by zero
*/
if (numRefWords == 0) {
numRefWords = 1;
}
#ifndef NO_TIMEOUT
/*
* set up search time-out handler
*/
if (maxTime) {
signal(SIGALRM, catchAlarm);
}
#endif /* !NO_TIMEOUT */
double oldPosteriorScaleStep = posteriorScaleStep;
if (oneBest || oneBestFirst) {
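/*
 * 1-best stage: optimize on top-1 hypothesis error. The
 * posteriorScale step is forced to 0 since a global scaling of
 * the scores cannot change which hypothesis ranks first.
 */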
oneBest = true;
posteriorScaleStep = 0.0;
cerr << "Posterior scale step size set to " << posteriorScaleStep
<< endl;
unsigned errors = (unsigned) computeErrors(trainSet, lambdas.data());
printLambdas(cout, lambdas);
if (initSimplex == 0) {
train(trainSet);
} else {
trainAmoeba(trainSet);
}
cout << "original errors = " << errors
<< " (" << ((double)errors/numRefWords) << "/word)"
<< endl;
cout << "best errors = " << bestError
<< " (" << ((double)bestError/numRefWords) << "/word)"
<< endl;
}
if (oneBestFirst) {
// restart search at best point found in 1-best search
lambdas = bestLambdas;
// scale weights to LMW==1
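// (dividing by lambdas[1] folds the LM weight into posteriorScale,
// leaving the combined score's ranking unchanged)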
if (lambdas[1] != 0.0) {
posteriorScale = lambdas[1];
for (unsigned i = 0; i < numScores; i ++) {
lambdas[i] /= posteriorScale;
}
}
}
if (!oneBest || oneBestFirst) {
oneBest = false;
posteriorScaleStep = oldPosteriorScaleStep;
cerr << "Posterior scale step size set to " << posteriorScaleStep
<< endl;
cerr << "aligning nbest lists...\n";
alignNbest(trainSet, refs, subvocabDistance);
unsigned errors = (unsigned) computeErrors(trainSet, lambdas.data());
printLambdas(cout, lambdas);
if (initSimplex == 0) {
train(trainSet);
} else {
trainAmoeba(trainSet);
}
cout << "original errors = " << errors
<< " (" << ((double)errors/numRefWords) << "/word)"
<< endl;
cout << "best errors = " << bestError
<< " (" << ((double)bestError/numRefWords) << "/word)"
<< endl;
}
printLambdas(cout, bestLambdas, writeRoverControl);
if (printHyps) {
File file(printHyps, "w");
lambdas = bestLambdas;
printTopHyps(file, trainSet);
}
exit(0);
}