📄 nbest.cc
字号:
/*
* NBest.cc --
* N-best hypotheses and lists
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/NBest.cc,v 1.57 2006/01/09 18:08:21 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "NBest.h"
#include "WordAlign.h"
#include "Array.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_ARRAY(NBestHyp);
#endif
/*
 * NOTE(review): DEBUG_PRINT_RANK is not referenced in the part of the file
 * visible here; presumably it enables printing of hypothesis ranks in debug
 * output -- confirm against the rest of the file.
 */
#define DEBUG_PRINT_RANK 1
const char *phoneSeparator = ":"; // used for phones & phoneDurs strings
const NBestTimestamp frameLength = 0.01f; // quantization unit of word timemarks
/*
* N-best word backtrace information
*/
/*
 * Size of the fixed character buffers that hold phone and phone-duration
 * strings while parsing; addPhones() refuses to grow a string beyond it.
 */
const unsigned phoneStringLength = 100;
/*
 * Construct an empty word-info record: no word, no phone strings, and all
 * times, scores and posteriors set to zero.
 */
NBestWordInfo::NBestWordInfo()
    : word(Vocab_None), phones(0), phoneDurs(0),
      wordPosterior(0.0), transPosterior(0.0)
{
    /*
     * Give the remaining members defined values.  They used to be left
     * uninitialized even though valid(), merge() and write() read them.
     * With a zero duration a freshly constructed record is reported as
     * not valid by valid().
     * Assigned in the body rather than the initializer list so that this
     * does not depend on the member declaration order in the header.
     */
    start = 0.0;
    duration = 0.0;
    acousticScore = 0.0;
    languageScore = 0.0;
}
/*
 * Release the phone and phone-duration strings owned by this record.
 */
NBestWordInfo::~NBestWordInfo()
{
    /* free() accepts a null pointer, so no explicit checks are needed */
    free(phones);
    free(phoneDurs);
}
/*
 * Deep-copy assignment: scalar fields are copied, the phone strings are
 * duplicated so that each record owns its own storage.
 */
NBestWordInfo &
NBestWordInfo::operator= (const NBestWordInfo &other)
{
    if (this != &other) {
        /*
         * Discard the strings we currently own (free() ignores null)
         */
        free(phones);
        free(phoneDurs);
        phones = 0;
        phoneDurs = 0;

        /*
         * Plain value members
         */
        word = other.word;
        start = other.start;
        duration = other.duration;
        acousticScore = other.acousticScore;
        languageScore = other.languageScore;
        wordPosterior = other.wordPosterior;
        transPosterior = other.transPosterior;

        /*
         * Private copies of the source's strings, where present
         */
        if (other.phones) {
            phones = strdup(other.phones);
            assert(phones != 0);
        }
        if (other.phoneDurs) {
            phoneDurs = strdup(other.phoneDurs);
            assert(phoneDurs != 0);
        }
    }
    return *this;
}
/*
 * Write this record to file as six space-separated fields:
 *	start duration acousticScore languageScore phones phoneDurs
 * No newline is emitted.  A missing phone or duration string is written
 * as the bare phoneSeparator, which parse() reads back as "no string".
 */
void
NBestWordInfo::write(File &file)
{
    const char *phoneText = phones ? phones : phoneSeparator;
    const char *durText = phoneDurs ? phoneDurs : phoneSeparator;

    fprintf(file, "%lg %lg %lg %lg %s %s",
            (double)start, (double)duration,
            (double)acousticScore, (double)languageScore,
            phoneText, durText);
}
/*
 * Parse a record in the format produced by write():
 *	start duration acousticScore languageScore phones phoneDurs
 * A phones or phoneDurs field consisting of just the phoneSeparator means
 * "no string" and is stored as a null pointer.
 *
 * Returns true on success.  Returns false, leaving the record unmodified,
 * if fewer than six fields could be read.
 */
Boolean
NBestWordInfo::parse(const char *s)
{
    double sTime, dur, aScore, lScore;

    /*
     * "%100s" stores up to 100 characters PLUS a terminating NUL, so the
     * buffers need one byte more than the field width.  The width in the
     * format string must be kept in sync with phoneStringLength.
     */
    char phs[phoneStringLength + 1], phDurs[phoneStringLength + 1];

    if (sscanf(s, "%lg %lg %lg %lg %100s %100s",
               &sTime, &dur, &aScore, &lScore, phs, phDurs) != 6)
    {
        return false;
    }

    start = (NBestTimestamp)sTime;
    duration = (NBestTimestamp)dur;
    acousticScore = (LogP)aScore;
    languageScore = (LogP)lScore;

    /*
     * Release strings left over from earlier use of this record, so that
     * parsing into it again does not leak them.
     */
    if (phones) free(phones);
    if (phoneDurs) free(phoneDurs);

    if (strcmp(phs, phoneSeparator) == 0) {
        phones = 0;
    } else {
        phones = strdup(phs);
        assert(phones != 0);
    }

    if (strcmp(phDurs, phoneSeparator) == 0) {
        phoneDurs = 0;
    } else {
        phoneDurs = strdup(phDurs);
        assert(phoneDurs != 0);
    }

    return true;
}
/*
 * Mark this record as not valid by zeroing its duration; valid() tests
 * for a positive duration.
 */
void
NBestWordInfo::invalidate()
{
    duration = 0;
}
/*
 * A record counts as valid when its duration is strictly positive.
 */
Boolean
NBestWordInfo::valid() const
{
    return duration > 0;
}
/*
 * Merge another record for the same word into this one by keeping
 * whichever of the two has the better acoustic score per unit duration.
 */
void
NBestWordInfo::merge(const NBestWordInfo &other)
{
    /*
     * The "other" word information supersedes our own only if its
     * duration-normalized acoustic likelihood is strictly higher.
     * NOTE(review): there is no guard against a zero duration (as set by
     * invalidate()) on either side -- confirm callers only merge valid
     * records.
     */
    const bool otherScoresHigher =
        other.acousticScore/other.duration > acousticScore/duration;

    if (!otherScoresHigher) {
        return;
    }

    *this = other;
}
/*
* N-Best hypotheses
*/
/*
 * Construct an empty hypothesis: no word or word-info arrays, all scores,
 * counts and the rank set to zero.
 */
NBestHyp::NBestHyp()
{
    /* no arrays allocated yet */
    words = 0;
    wordInfo = 0;

    /* scores */
    totalScore = 0.0;
    languageScore = 0.0;
    acousticScore = 0.0;
    posterior = 0.0;

    /* counts and rank */
    numErrors = 0;
    numWords = 0;
    rank = 0;
}
/*
 * Release the word and word-info arrays owned by this hypothesis.
 * Both pointers may be null (see the constructor); delete [] on a null
 * pointer does nothing.
 */
NBestHyp::~NBestHyp()
{
delete [] words;
delete [] wordInfo;
}
/*
 * Deep-copy assignment: scores and counts are copied, and the word array
 * (including its terminator) plus the optional word-info array are
 * duplicated into freshly allocated storage.
 */
NBestHyp &
NBestHyp::operator= (const NBestHyp &other)
{
    // cerr << "warning: NBestHyp::operator= called\n";
    if (this == &other) {
        return *this;
    }

    /*
     * Give up the arrays we currently own
     */
    delete [] words;
    delete [] wordInfo;
    words = 0;
    wordInfo = 0;

    /*
     * Scalar members
     */
    acousticScore = other.acousticScore;
    languageScore = other.languageScore;
    totalScore = other.totalScore;
    posterior = other.posterior;
    numWords = other.numWords;
    numErrors = other.numErrors;
    rank = other.rank;

    /*
     * Without a word array there is nothing further to copy; word info
     * is only copied alongside words.
     */
    if (!other.words) {
        return *this;
    }

    /* number of entries including the end-of-string marker */
    const unsigned entryCount = Vocab::length(other.words) + 1;

    words = new VocabIndex[entryCount];
    assert(words != 0);
    for (unsigned k = 0; k < entryCount; k++) {
        words[k] = other.words[k];
    }

    if (other.wordInfo) {
        wordInfo = new NBestWordInfo[entryCount];
        assert(wordInfo != 0);
        for (unsigned k = 0; k < entryCount; k++) {
            wordInfo[k] = other.wordInfo[k];
        }
    }

    return *this;
}
/*
* N-Best Hypotheses
*/
/*
 * Character joining the components of a multiword: when multiword handling
 * is enabled, NBestHyp::parse() counts one additional word for every
 * occurrence of this character in a word string.
 */
const char multiwordSeparator = '_';
/*
 * Add the string ph to the phoneSeparator-delimited list held in old,
 * a buffer of phoneStringLength bytes.
 *	reversed == false: ph is appended at the end
 *	reversed == true:  ph is inserted at the front
 * A separator is placed between ph and the existing contents unless the
 * list was empty.
 *
 * Returns false, leaving old untouched, if the result might not fit; the
 * size check always reserves room for a separator.  Returns true otherwise.
 */
static Boolean
addPhones(char *old, const char *ph, Boolean reversed = false)
{
    const unsigned existingLen = strlen(old);
    const unsigned addedLen = strlen(ph);

    /* existing text + separator + new text + terminating NUL */
    if (existingLen + addedLen + 2 > phoneStringLength) {
        return false;
    }

    const Boolean needSeparator = (existingLen > 0);

    if (reversed) {
        if (needSeparator) {
            /* shift existing text (with its NUL) right to make room */
            memmove(old + addedLen + 1, old, existingLen + 1);
            memcpy(old, ph, addedLen);
            old[addedLen] = phoneSeparator[0];
        } else {
            strcpy(old, ph);
        }
    } else {
        char *tail = old + existingLen;

        if (needSeparator) {
            *tail++ = phoneSeparator[0];
        }
        strcpy(tail, ph);
    }

    return true;
}
Boolean
NBestHyp::parse(char *line, Vocab &vocab, unsigned decipherFormat,
LogP acousticOffset, Boolean multiwords, Boolean backtrace)
{
const unsigned maxFieldsPerLine = 11 * maxWordsPerLine + 4;
/* NBestList2.0 format uses 11 fields per word */
static VocabString wstrings[maxFieldsPerLine];
static VocabString justWords[maxFieldsPerLine + 1];
Array<NBestWordInfo> backtraceInfo;
unsigned actualNumWords =
Vocab::parseWords(line, wstrings, maxFieldsPerLine);
if (actualNumWords == maxFieldsPerLine) {
cerr << "more than " << maxFieldsPerLine << " fields per line\n";
return false;
}
/*
* We don't do multiword splitting with backtraces -- that would require
* a dictionary (see external split-nbest-words script).
*/
if (backtrace) {
multiwords = false;
}
/*
* We accept three formats for N-best hyps.
* - The Decipher NBestList1.0 format, which has one combined bytelog score
* in parens preceding the hyp.
* - The Decipher NBestList2.0 format, where each word is followed by
* ( st: <starttime> et: <endtime> g: <grammar_score> a: <ac_score> )
* - Our own format, which has acoustic score, LM score, and number of
* words followed by the hyp.
* If (decipherFormat > 0) only the specified Decipher format is accepted.
*/
if (decipherFormat == 1 ||
decipherFormat == 0 && wstrings[0][0] == '(')
{
/*
* These formats don't support backtrace info
*/
backtrace = false;
actualNumWords --;
if (actualNumWords > maxWordsPerLine) {
cerr << "more than " << maxWordsPerLine << " words in hyp\n";
return false;
}
/*
* Parse the first word as a score (in parens)
*/
double score;
if (sscanf(wstrings[0], "(%lf)", &score) != 1)
{
cerr << "bad Decipher score: " << wstrings[0] << endl;
return false;
}
/*
* Save score
*/
totalScore = acousticScore = BytelogToLogP(score);
languageScore = 0.0;
/*
* Note: numWords includes pauses, consistent with the way the
* recognizer applies word transition weights. Elimination of pauses
* is the job of LM rescoring.
*/
numWords = actualNumWords;
Vocab::copy(justWords, &wstrings[1]);
} else if (decipherFormat == 2) {
if ((actualNumWords - 1) % 11) {
cerr << "badly formatted hyp\n";
return false;
}
unsigned numTokens = (actualNumWords - 1)/11;
if (numTokens > maxWordsPerLine) {
cerr << "more than " << maxWordsPerLine << " tokens in hyp\n";
return false;
}
/*
* Parse the first word as a score (in parens)
*/
double score;
if (sscanf(wstrings[0], "(%lf)", &score) != 1)
{
cerr << "bad Decipher score: " << wstrings[0] << endl;
return false;
}
/*
* Parse remaining line into words and scores
* skip over phone and state backtrace tokens, which can be
* identified by noting that their times are contained within
* the word duration.
*/
Bytelog acScore = 0;
Bytelog lmScore = 0;
NBestTimestamp prevEndTime = -1.0; /* end time of last token */
NBestTimestamp prevPhoneStart = 0.0;
NBestWordInfo *prevWordInfo = 0;
char phones[phoneStringLength];
char phoneDurs[phoneStringLength];
actualNumWords = 0;
for (unsigned i = 0; i < numTokens; i ++) {
const char *token = wstrings[1 + 11 * i];
NBestTimestamp startTime = (NBestTimestamp)atof(wstrings[1 + 11 * i + 3]);
NBestTimestamp endTime = (NBestTimestamp)atof(wstrings[1 + 11 * i + 5]);
/*
* Check if this token refers to an HMM state, i.e., if
* it matches the pattern /-[0-9]$/.
* XXX: because of a bug in Decipher we need to perform this
* check even if we're scanning for word tokens.
*/
const char *hyphen = strrchr(token, '-');
Boolean isStateToken = hyphen != 0 &&
hyphen[1] >= '0' && hyphen[1] <= '9' &&
hyphen[2] == '\0';
if (startTime > prevEndTime && !isStateToken) {
int acWordScore = atol(wstrings[1 + 11 * i + 9]);
int lmWordScore = atol(wstrings[1 + 11 * i + 7]);
justWords[actualNumWords] = token;
if (backtrace) {
/*
* save pronunciation info for previous word
*/
if (prevWordInfo) {
prevWordInfo->phones = strdup(phones);
assert(prevWordInfo->phones != 0);
prevWordInfo->phoneDurs = strdup(phoneDurs);
assert(prevWordInfo->phoneDurs != 0);
}
NBestWordInfo winfo;
winfo.word = Vocab_None;
winfo.start = startTime;
/*
* NB: "et" in nbest backtrace is actually the START time
* of the last frame
*/
winfo.duration = endTime - startTime + frameLength;
winfo.acousticScore = BytelogToLogP(acWordScore);
winfo.languageScore = BytelogToLogP(lmWordScore);
winfo.phones = winfo.phoneDurs = 0;
backtraceInfo[actualNumWords] = winfo;
/*
* prepare for collecting phone backtrace info
*/
prevWordInfo = &backtraceInfo[actualNumWords];
phones[0] = phoneDurs[0] = '\0';
}
acScore += acWordScore;
lmScore += lmWordScore;
actualNumWords ++;
prevEndTime = endTime;
} else {
/*
* check if this token refers to an HMM state, i.e., if
* if matches the pattern /-[0-9]$/
*/
if (isStateToken) {
continue;
}
/*
* a phone token: if we're extracting backtrace information,
* get phone identity and duration and store in word Info
*/
if (prevWordInfo) {
const char *lbracket = strchr(token, '[');
const char *phone = lbracket ? lbracket + 1 : token;
char *rbracket = (char *)strrchr(phone, ']');
if (rbracket) *rbracket = '\0';
addPhones(phones, phone, startTime < prevPhoneStart);
char phoneDur[20];
sprintf(phoneDur, "%d",
(int)((endTime - startTime)/frameLength + 0.5) + 1);
addPhones(phoneDurs, phoneDur, startTime < prevPhoneStart);
prevPhoneStart = startTime;
}
}
}
if (backtrace) {
/*
* save pronunciation info for last word
*/
if (prevWordInfo) {
prevWordInfo->phones = strdup(phones);
assert(prevWordInfo->phones != 0);
prevWordInfo->phoneDurs = strdup(phoneDurs);
assert(prevWordInfo->phoneDurs != 0);
}
}
justWords[actualNumWords] = 0;
/*
* Save scores
*/
totalScore = BytelogToLogP(score);
acousticScore = BytelogToLogP(acScore);
languageScore = BytelogToLogP(lmScore);
numWords = actualNumWords;
/*
if (score != acScore + lmScore) {
cerr << "acoustic and language model scores don't add up ("
<< acScore << " + " << lmScore << " != " << score << ")\n";
}
*/
} else {
actualNumWords -= 3;
if (actualNumWords > maxWordsPerLine) {
cerr << "more than " << maxWordsPerLine << " words in hyp\n";
return false;
}
/*
* Parse the first three columns as numbers
*/
if (!parseLogP(wstrings[0], acousticScore)) {
cerr << "bad acoustic score: " << wstrings[0] << endl;
return false;
}
if (!parseLogP(wstrings[1], languageScore)) {
cerr << "bad LM score: " << wstrings[1] << endl;
return false;
}
if (sscanf(wstrings[2], "%u", &numWords) != 1) {
cerr << "bad word count: " << wstrings[2] << endl;
return false;
}
/*
* Set the total score to the acoustic score so
* decipherFix() with a null language model leaves everything
* unchanged.
*/
totalScore = acousticScore;
Vocab::copy(justWords, &wstrings[3]);
}
/*
* Apply acoustic normalization in effect
*/
acousticScore -= acousticOffset;
totalScore -= acousticOffset;
/*
* Adjust number of words for multiwords if appropriate
*/
if (multiwords) {
for (unsigned j = 0; justWords[j] != 0; j ++) {
const char *cp = justWords[j];
for (cp = strchr(cp, multiwordSeparator);
cp != 0;
cp = strchr(cp + 1, multiwordSeparator))
{
actualNumWords ++;
}
}
}
/*
* Copy words to nbest list
*/
delete [] words;
words = new VocabIndex[actualNumWords + 1];
assert(words != 0);
Boolean unkIsWord = vocab.unkIsWord();
/*
* Map word strings to indices
*/
if (!multiwords) {
if (unkIsWord) {
vocab.getIndices(justWords, words, actualNumWords + 1,
vocab.unkIndex());
} else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -