/* nbest-optimize.cc */
if (decipherScores) {
/*
 * Read Decipher scores as floats even though they are supposed
 * to be ints. This way we accommodate some preexisting rescoring
 * programs.
 */
if (sscanf(line, "(%lf)", &score) != 1) {
file.position() << "bad Decipher score: " << line << endl;
break;
} else {
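/*
 * BytelogToLogP() converts Decipher's integer "bytelog" score
 * scale to SRILM's internal log probabilities; the float read
 * above is truncated back to an int before conversion.
 */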
scores[hypNo ++] = BytelogToLogP((int)score);
}
} else {
if (sscanf(line, "%lf", &score) != 1) {
file.position() << "bad score: " << line << endl;
break;
} else {
scores[hypNo ++] = score;
}
}
}
/*
* Set missing scores to zero
*/
if (!file.error() && hypNo < numHyps) {
cerr << "warning: " << (numHyps - hypNo) << " scores missing from "
<< fileName << endl;
}
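/*
 * Hyps without a score default to 0, so they contribute nothing
 * extra to the weighted score combination.
 */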
while (hypNo < numHyps) {
scores[hypNo ++] = 0;
}
return !file.error();
}
/*
* Read error counts file
*/
Boolean
readErrorsFile(const char *errorsDir, RefString id, NBestList &nbest,
unsigned &numWords)
{
unsigned numHyps = nbest.numHyps();
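/*
 * The buffer must hold "errorsDir/id" plus room for an optional
 * GZIP_SUFFIX and the terminating NUL, in case we fall back to
 * the gzipped file below.
 */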
makeArray(char, fileName,
strlen(errorsDir) + 1 + strlen(id) + strlen(GZIP_SUFFIX) + 1);
sprintf(fileName, "%s/%s", errorsDir, id);
/*
 * If the plain file doesn't exist, try the gzipped version
 */
FILE *fp;
if ((fp = fopen(fileName, "r")) == NULL) {
strcat(fileName, GZIP_SUFFIX);
} else {
fclose(fp);
}
File file(fileName, "r", 0);
char *line;
unsigned hypNo = 0;
while (!file.error() && (line = file.getline())) {
if (hypNo >= numHyps) {
break;
}
/*
* parse errors line
*/
float corrRate, errRate;
unsigned numSub, numDel, numIns, numErrs, numWds;
if (sscanf(line, "%f %f %u %u %u %u %u", &corrRate, &errRate,
&numSub, &numDel, &numIns, &numErrs, &numWds) != 7)
{
file.position() << "bad errors: " << line << endl;
return 0;
} else if (hypNo > 0 && numWds != numWords) {
file.position() << "inconsistent number of words: " << line << endl;
return 0;
} else {
if (hypNo == 0) {
numWords = numWds;
}
nbest.getHyp(hypNo ++).numErrors = numErrs;
}
}
if (hypNo < numHyps) {
file.position() << "too few errors lines" << endl;
return 0;
}
return !file.error();
}
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
argc = Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
if (!nbestFiles) {
cerr << "cannot proceed without nbest files\n";
exit(2);
}
if (!oneBest && !refFile) {
cerr << "cannot proceed without references\n";
exit(2);
}
if (oneBest && !refFile && !errorsDir) {
cerr << "cannot proceed without references or error counts\n";
exit(2);
}
if ((oneBest || oneBestFirst) && !initSimplex) {
cerr << "1-best optimization only supported in simplex mode\n";
exit(2);
}
Vocab vocab;
NullLM nullLM(vocab);
RefList refs(vocab);
NBestSet trainSet(vocab, refs, maxNbest, false, multiwords);
trainSet.debugme(debug);
trainSet.warn = false; // don't warn about missing refs
if (vocabFile) {
File file(vocabFile, "r");
vocab.read(file);
}
vocab.toLower() = toLower ? true : false;
/*
* Skip noise tags in scoring
*/
if (noiseVocabFile) {
File file(noiseVocabFile, "r");
nullLM.noiseVocab.read(file);
}
if (noiseTag) { /* backward compatibility */
nullLM.noiseVocab.addWord(noiseTag);
}
/*
* Optionally read a subvocabulary that is to be kept separate from
* regular words during alignment
*/
SubVocab hiddenVocab(vocab);
if (hiddenVocabFile) {
File file(hiddenVocabFile, "r");
hiddenVocab.read(file);
}
SubVocabDistance subvocabDistance(vocab, hiddenVocab);
/*
* Posterior scaling: if not specified (= 0.0) use LMW for
* backward compatibility.
*/
if (posteriorScale == 0.0) {
posteriorScale = (rescoreLMW == 0.0) ? 1.0 : rescoreLMW;
}
if (refFile) {
cerr << "reading references...\n";
File file(refFile, "r");
refs.read(file, true); // add reference words to vocabulary
}
{
cerr << "reading nbest lists...\n";
File file(nbestFiles, "r");
trainSet.read(file);
}
/*
 * There are three scores in the N-best list, plus as many as the
 * user supplies in separate directories on the command line.
 */
numScores = 3 + argc - 1;
numFixedWeights = 0;
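/*
 * Initial weights implement the usual rescoring combination
 * (acoustic + LMW * language + WTW * numWords), divided by
 * posteriorScale. E.g., rescoreLMW == 8 with posteriorScale left
 * at its 0.0 default gives lambdas (1/8, 1, rescoreWTW/8).
 */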
lambdas[0] = 1/posteriorScale;
lambdas[1] = rescoreLMW/posteriorScale;
lambdas[2] = rescoreWTW/posteriorScale;
for (unsigned i = 0; i < 3; i ++) {
fixLambdas[i] = false;
lambdaSteps[i] = 1.0;
}
for (unsigned i = 3; i < numScores; i ++) {
lambdas[i] = 0.0;
fixLambdas[i] = false;
lambdaSteps[i] = 1.0;
}
/*
* Store directory names needed to write nbest-rover file
*/
{
/*
* infer nbest directory name from first file in list
*/
NBestSetIter iter(trainSet);
RefString id;
const char *nbestFilename = iter.nextFile(id);
if (nbestFilename) {
nbestDirectory = strdup(nbestFilename);
assert(nbestDirectory != 0);
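/*
 * Strip the filename component, keeping the directory part
 * (roughly what dirname(1) does).
 */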
char *basename = strrchr(nbestDirectory, '/');
if (basename != 0) {
*basename = '\0';
} else {
strcpy(nbestDirectory, ".");
}
} else {
nbestDirectory = ".";
}
}
scoreDirectories = &argv[1];
/*
* Initialize lambdas from command line values if specified
*/
if (initLambdas) {
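/*
 * initLambdas is parsed as a whitespace-separated list of
 * weights; a weight prefixed with '=' is kept fixed during
 * optimization.
 */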
unsigned offset = 0;
for (unsigned i = 0; i < numScores; i ++) {
int consumed = 0;	/* %n requires an int argument */
if (sscanf(&initLambdas[offset], " =%lf%n",
&lambdas[i], &consumed) > 0)
{
lambdas[i] /= posteriorScale;
fixLambdas[i] = true;
numFixedWeights++;
} else if (sscanf(&initLambdas[offset], "%lf%n",
&lambdas[i], &consumed) > 0)
{
lambdas[i] /= posteriorScale;
lambdaSteps[i] = 1.0;
} else {
break;
}
offset += consumed;
}
}
/*
* Initialize simplex points
*/
if (initSimplex) {
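/*
 * initSimplex gives per-dimension step sizes for the simplex
 * (Amoeba) search; a step of 0 freezes that parameter, and a
 * trailing extra value sets the step for posteriorScale itself.
 */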
unsigned offset = 0;
int consumed = 0;	/* %n requires an int argument */
for (unsigned i = 0; i < numScores; i++) {
if (!fixLambdas[i]) {
if (sscanf(&initSimplex[offset], "%lf%n",
&lambdaSteps[i], &consumed) <= 0)
{
break;
}
if (lambdaSteps[i] == 0.0) {
cerr << "Fixing " << i << "th parameter\n";
fixLambdas[i] = true;
numFixedWeights++;
}
offset += consumed;
}
}
sscanf(&initSimplex[offset], "%lf%n", &posteriorScaleStep, &consumed);
}
/*
* Set up the score matrices
*/
cerr << "reading scores...\n";
NBestSetIter iter(trainSet);
RefString id;
NBestList *nbest;
while ((nbest = iter.next(id))) {
/*
* Allocate score matrix for this nbest list
*/
NBestScore **scores = new NBestScore *[numScores];
assert(scores != 0);
for (unsigned i = 0; i < numScores; i ++) {
scores[i] = new NBestScore[nbest->numHyps()];
assert(scores[i] != 0);
}
/*
* Transfer the standard scores from N-best list to score matrix
*/
for (unsigned j = 0; j < nbest->numHyps(); j ++) {
scores[0][j] = nbest->getHyp(j).acousticScore;
scores[1][j] = nbest->getHyp(j).languageScore;
scores[2][j] = (NBestScore) nbest->getHyp(j).numWords;
}
/*
* Read additional scores
*/
for (unsigned i = 1; i < (unsigned) argc; i ++) {
if (!readScoreFile(argv[i], id, scores[i + 2], nbest->numHyps())) {
cerr << "warning: error reading scores for " << id
<< " from " << argv[i] << endl;
}
}
/*
* Scale scores to help prevent underflow
*/
if (!combineLinear) {
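/*
 * Subtracting hyp 0's score from each column shifts every
 * knowledge source by a constant, which leaves posteriors
 * unchanged but keeps the later exponentiation within range.
 * The loop runs backwards so scores[i][0] is zeroed last.
 */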
for (unsigned i = 0; i < numScores; i ++) {
for (unsigned j = nbest->numHyps(); j > 0; j --) {
scores[i][j-1] -= scores[i][0];
}
}
}
/*
* save score matrix under nbest id
*/
*nbestScores.insert(id) = scores;
}
if (debug >= DEBUG_SCORES) {
dumpScores(cerr, trainSet);
}
cerr << (errorsDir ? "reading" : "computing") << " error counts...\n";
iter.init();
numRefWords = 0;
/*
* Compute hyp errors
*/
while ((nbest = iter.next(id))) {
unsigned numWords;
VocabIndex *ref = refs.findRef(id);
if (!(ref || ((oneBest && !oneBestFirst) && errorsDir))) {
cerr << "missing reference for " << id << endl;
exit(1);
}
/*
* Remove pauses and noise from nbest hyps since these would
* confuse the inter-hyp alignments.
*/
nbest->removeNoise(nullLM);
/*
 * In 1-best mode we only need the error counts for each hypothesis;
 * in sausage (default) mode we need to construct a multiple
 * alignment of the reference and all N-best hyps.
 */
if (errorsDir) {
/*
* read error counts
*/
if (!readErrorsFile(errorsDir, id, *nbest, numWords)) {
cerr << "couldn't get error counts for " << id << endl;
exit(2);
}
} else {
/*
* need to recompute hyp errors (after removeNoise() above)
*/
unsigned sub, ins, del;
nbest->wordError(ref, sub, ins, del);
numWords = Vocab::length(ref);
}
/*
* compute total length of references for later normalizations
*/
numRefWords += numWords;
}
cerr << numRefWords << " reference words\n";
/*
* preemptive trouble avoidance: prevent division by zero
*/
if (numRefWords == 0) {
numRefWords = 1;
}
#ifndef NO_TIMEOUT
/*
* set up search time-out handler
*/
if (maxTime) {
signal(SIGALRM, catchAlarm);
}
#endif /* !NO_TIMEOUT */
double oldPosteriorScaleStep = posteriorScaleStep;
if (oneBest || oneBestFirst) {
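/*
 * 1-best stage: optimize on top-1 hypothesis error. The
 * posteriorScale step is forced to 0 since a global scaling of
 * the scores cannot change which hypothesis ranks first.
 */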
oneBest = true;
posteriorScaleStep = 0.0;
cerr << "Posterior scale step size set to " << posteriorScaleStep
<< endl;
unsigned errors = (unsigned) computeErrors(trainSet, lambdas.data());
printLambdas(cout, lambdas);
if (initSimplex == 0) {
train(trainSet);
} else {
trainAmoeba(trainSet);
}
cout << "original errors = " << errors
<< " (" << ((double)errors/numRefWords) << "/word)"
<< endl;
cout << "best errors = " << bestError
<< " (" << ((double)bestError/numRefWords) << "/word)"
<< endl;
}
if (oneBestFirst) {
// restart search at best point found in 1-best search
lambdas = bestLambdas;
// scale weights to LMW==1
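// (dividing by lambdas[1] folds the LM weight into posteriorScale,
// leaving the combined score's ranking unchanged)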
if (lambdas[1] != 0.0) {
posteriorScale = lambdas[1];
for (unsigned i = 0; i < numScores; i ++) {
lambdas[i] /= posteriorScale;
}
}
}
if (!oneBest || oneBestFirst) {
oneBest = false;
posteriorScaleStep = oldPosteriorScaleStep;
cerr << "Posterior scale step size set to " << posteriorScaleStep
<< endl;
cerr << "aligning nbest lists...\n";
alignNbest(trainSet, refs, subvocabDistance);
unsigned errors = (unsigned) computeErrors(trainSet, lambdas.data());
printLambdas(cout, lambdas);
if (initSimplex == 0) {
train(trainSet);
} else {
trainAmoeba(trainSet);
}
cout << "original errors = " << errors
<< " (" << ((double)errors/numRefWords) << "/word)"
<< endl;
cout << "best errors = " << bestError
<< " (" << ((double)bestError/numRefWords) << "/word)"
<< endl;
}
printLambdas(cout, bestLambdas, writeRoverControl);
if (printHyps) {
File file(printHyps, "w");
lambdas = bestLambdas;
printTopHyps(file, trainSet);
}
exit(0);
}