📄 htklattice.cc
字号:
/*
* HTKLattice.cc --
* HTK Standard Lattice Format support for SRILM lattices
*
* Note: there is no separate HTKLattice class, only I/O methods!
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 2003-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lattice/src/RCS/HTKLattice.cc,v 1.40 2006/01/16 19:34:15 stolcke Exp $";
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <assert.h>
#include "Prob.h"
#include "Array.cc"
#include "LHash.cc"
#include "Lattice.h"
#include "MultiwordVocab.h"
#include "NBest.h" // for phoneSeparator defn
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_ARRAY(HTKWordInfo);
#endif
/* from Lattice.cc */
#define DebugPrintFatalMessages 1
#define DebugPrintFunctionality 1
const char *HTKLattice_Version = "1.1";
const char *HTK_null_word = "!NULL";
const char HTK_single_quote = '\'';
const char HTK_double_quote = '\"';
const char HTK_escape_quote = '\\';
const float HTK_def_tscale = 1.0;
const float HTK_def_acscale = 1.0;
const float HTK_def_lmscale = 1.0;
const float HTK_def_ngscale = 1.0;
const float HTK_def_wdpenalty = 0.0;
const float HTK_def_prscale = 1.0;
const float HTK_def_duscale = 0.0;
const float HTK_def_xscale = 0.0;
/*
 * Default constructor: log base 10, every scale and the word penalty set
 * to its HTK_def_* default, amscale undefined, no vocab/lmname/ngname/hmms
 * strings, and all boolean flags off.
 */
HTKHeader::HTKHeader()
    : logbase(10),
      tscale(HTK_def_tscale),
      acscale(HTK_def_acscale),
      ngscale(HTK_def_ngscale),
      lmscale(HTK_def_lmscale),
      wdpenalty(HTK_def_wdpenalty),
      prscale(HTK_def_prscale),
      duscale(HTK_def_duscale),
      amscale(HTK_undef_float),
      x1scale(HTK_def_xscale),
      x2scale(HTK_def_xscale),
      x3scale(HTK_def_xscale),
      x4scale(HTK_def_xscale),
      x5scale(HTK_def_xscale),
      x6scale(HTK_def_xscale),
      x7scale(HTK_def_xscale),
      x8scale(HTK_def_xscale),
      x9scale(HTK_def_xscale),
      vocab(0),
      lmname(0),
      ngname(0),
      hmms(0),
      wordsOnNodes(false),
      scoresOnNodes(false),
      useQuotes(false)
{
}
/*
 * Construct a header with caller-supplied scales and word penalty.
 *
 * Each scale member is initialized from the parameter of the same name
 * (inside the initializer list, the name in parentheses refers to the
 * parameter).  logbase is 10, tscale takes HTK_def_tscale, amscale is
 * undefined, the string members are null, and all flags are off.
 *
 * Fix: x1scale used to be initialized from the x2scale parameter, so the
 * x1scale argument was silently ignored.
 */
HTKHeader::HTKHeader(double acscale, double lmscale, double ngscale,
		     double prscale, double duscale, double wdpenalty,
		     double x1scale, double x2scale, double x3scale,
		     double x4scale, double x5scale, double x6scale,
		     double x7scale, double x8scale, double x9scale)
    : logbase(10), tscale(HTK_def_tscale), acscale(acscale),
      ngscale(ngscale), lmscale(lmscale),
      wdpenalty(wdpenalty), prscale(prscale),
      duscale(duscale), amscale(HTK_undef_float),
      x1scale(x1scale), x2scale(x2scale), x3scale(x3scale),
      x4scale(x4scale), x5scale(x5scale), x6scale(x6scale),
      x7scale(x7scale), x8scale(x8scale), x9scale(x9scale),
      vocab(0), lmname(0), ngname(0), hmms(0),
      wordsOnNodes(false), scoresOnNodes(false), useQuotes(false)
{
}
/*
 * Destructor: releases the four malloc'ed strings held by the header.
 * free() accepts a null pointer, so unset strings need no special case.
 */
HTKHeader::~HTKHeader()
{
    free(vocab);
    free(lmname);
    free(ngname);
    free(hmms);
}
/*
 * Assignment operator.
 *
 * Releases the strings currently held, copies every scalar member, and
 * duplicates the source's vocab/lmname/ngname/hmms strings with strdup()
 * so that both objects own independent copies.  Self-assignment is a no-op.
 * A failed strdup() trips an assert.
 *
 * Fix: logbase, wordsOnNodes, scoresOnNodes and useQuotes are initialized
 * by both constructors but were never copied here, so an assigned-to
 * header kept its old values for them.
 */
HTKHeader &
HTKHeader::operator= (const HTKHeader &other)
{
    if (&other == this) {
	return *this;
    }

    /* drop the strings we own; free(0) is harmless */
    free(vocab);
    free(lmname);
    free(ngname);
    free(hmms);

    logbase = other.logbase;
    tscale = other.tscale;
    acscale = other.acscale;
    ngscale = other.ngscale;
    lmscale = other.lmscale;
    wdpenalty = other.wdpenalty;
    prscale = other.prscale;
    duscale = other.duscale;
    x1scale = other.x1scale;
    x2scale = other.x2scale;
    x3scale = other.x3scale;
    x4scale = other.x4scale;
    x5scale = other.x5scale;
    x6scale = other.x6scale;
    x7scale = other.x7scale;
    x8scale = other.x8scale;
    x9scale = other.x9scale;
    amscale = other.amscale;

    if (other.vocab == 0) {
	vocab = 0;
    } else {
	vocab = strdup(other.vocab);
	assert(vocab != 0);
    }

    if (other.lmname == 0) {
	lmname = 0;
    } else {
	lmname = strdup(other.lmname);
	assert(lmname != 0);
    }

    if (other.ngname == 0) {
	ngname = 0;
    } else {
	ngname = strdup(other.ngname);
	assert(ngname != 0);
    }

    if (other.hmms == 0) {
	hmms = 0;
    } else {
	hmms = strdup(other.hmms);
	assert(hmms != 0);
    }

    wordsOnNodes = other.wordsOnNodes;
    scoresOnNodes = other.scoresOnNodes;
    useQuotes = other.useQuotes;

    return *this;
}
/*
 * Default constructor: every field starts out "undefined" --
 * HTK_undef_float for the time, all scores and the posterior,
 * Vocab_None for the word, HTK_undef_uint for var, and null pointers
 * for the div and states strings.
 */
HTKWordInfo::HTKWordInfo()
    : time(HTK_undef_float),
      word(Vocab_None),
      var(HTK_undef_uint),
      div(0),
      states(0),
      acoustic(HTK_undef_float),
      ngram(HTK_undef_float),
      language(HTK_undef_float),
      pron(HTK_undef_float),
      duration(HTK_undef_float),
      xscore1(HTK_undef_float),
      xscore2(HTK_undef_float),
      xscore3(HTK_undef_float),
      xscore4(HTK_undef_float),
      xscore5(HTK_undef_float),
      xscore6(HTK_undef_float),
      xscore7(HTK_undef_float),
      xscore8(HTK_undef_float),
      xscore9(HTK_undef_float),
      posterior(HTK_undef_float)
{
}
/*
 * Copy constructor, implemented in terms of the assignment operator.
 * div and states are nulled first because operator= frees whatever
 * they point to before copying.
 */
HTKWordInfo::HTKWordInfo(const HTKWordInfo &other)
    : div(0),
      states(0)
{
    operator=(other);
}
/*
 * Destructor: releases the div and states strings.
 * free() accepts a null pointer, so unset strings need no special case.
 */
HTKWordInfo::~HTKWordInfo()
{
    free(div);
    free(states);
}
/*
 * Assignment operator.
 *
 * Frees the div/states strings currently held, then copies all scalar
 * fields and strdup()s the source's strings so each object owns its own
 * copy.  Assigning an object to itself does nothing.  A failed strdup()
 * trips an assert.
 */
HTKWordInfo &
HTKWordInfo::operator= (const HTKWordInfo &other)
{
    if (this == &other) {
	return *this;
    }

    /* release old strings before taking copies of the new ones */
    free(div);
    free(states);

    time = other.time;
    word = other.word;
    var = other.var;

    div = (other.div != 0) ? strdup(other.div) : 0;
    assert(other.div == 0 || div != 0);

    states = (other.states != 0) ? strdup(other.states) : 0;
    assert(other.states == 0 || states != 0);

    /* scores */
    acoustic = other.acoustic;
    ngram = other.ngram;
    language = other.language;
    pron = other.pron;
    duration = other.duration;

    xscore1 = other.xscore1;
    xscore2 = other.xscore2;
    xscore3 = other.xscore3;
    xscore4 = other.xscore4;
    xscore5 = other.xscore5;
    xscore6 = other.xscore6;
    xscore7 = other.xscore7;
    xscore8 = other.xscore8;
    xscore9 = other.xscore9;

    posterior = other.posterior;

    return *this;
}
/*
 * Debugging output for HTKWordInfo.
 *
 * Writes "[HTKWordInfo", then one " label=value" item for each field that
 * differs from its "undefined" marker (Vocab_None, HTK_undef_float,
 * HTK_undef_uint, or a null pointer), then "]".  Returns the stream.
 */
ostream &
operator<< (ostream &stream, HTKWordInfo &link)
{
    stream << "[HTKWordInfo";

    /* word, time and variant */
    if (link.word != Vocab_None)	stream << " WORD=" << link.word;
    if (link.time != HTK_undef_float)	stream << " time=" << link.time;
    if (link.var != HTK_undef_uint)	stream << " var=" << link.var;

    /* string-valued fields */
    if (link.div != 0)			stream << " div=" << link.div;
    if (link.states != 0)		stream << " s=" << link.states;

    /* standard scores */
    if (link.acoustic != HTK_undef_float) stream << " a=" << link.acoustic;
    if (link.ngram != HTK_undef_float)	stream << " n=" << link.ngram;
    if (link.language != HTK_undef_float) stream << " l=" << link.language;
    if (link.pron != HTK_undef_float)	stream << " r=" << link.pron;
    if (link.duration != HTK_undef_float) stream << " ds=" << link.duration;

    /* extra scores x1 ... x9 */
    if (link.xscore1 != HTK_undef_float) stream << " x1=" << link.xscore1;
    if (link.xscore2 != HTK_undef_float) stream << " x2=" << link.xscore2;
    if (link.xscore3 != HTK_undef_float) stream << " x3=" << link.xscore3;
    if (link.xscore4 != HTK_undef_float) stream << " x4=" << link.xscore4;
    if (link.xscore5 != HTK_undef_float) stream << " x5=" << link.xscore5;
    if (link.xscore6 != HTK_undef_float) stream << " x6=" << link.xscore6;
    if (link.xscore7 != HTK_undef_float) stream << " x7=" << link.xscore7;
    if (link.xscore8 != HTK_undef_float) stream << " x8=" << link.xscore8;
    if (link.xscore9 != HTK_undef_float) stream << " x9=" << link.xscore9;

    if (link.posterior != HTK_undef_float) stream << " p=" << link.posterior;

    return stream << "]";
}
/*
 * Find the next key=value pair in line, return a pointer to the key,
 * store a pointer to the value string in `value`, and advance the line
 * pointer past the pair.
 *
 * The string pointed to by line is modified in the process: the key and
 * the value are null-terminated in place, and quote/escape processing
 * rewrites the value in place.
 *
 * Returns 0 (leaving line and value untouched) when only blanks, tabs or
 * newlines remain before the end of the string or a '#' character.
 * A token that is not followed by '=' is returned as a key with an empty
 * value.
 *
 * When useQuotes is set:
 *   - a value starting with a single or double quote extends to the
 *     matching closing quote rather than to the next white-space;
 *   - a backslash followed by '0' introduces an octal character code;
 *   - a backslash followed by any other character yields that character.
 *
 * Fixes relative to the earlier version:
 *   - isspace() is now passed an unsigned char; passing a negative plain
 *     char (e.g. a high-bit byte in a word label) is undefined behavior;
 *   - the sscanf %n result is stored in an int, as the conversion requires,
 *     instead of an unsigned.
 */
static char *
getHTKField(char *&line, char *&value, Boolean useQuotes)
{
    char *cp = line;
    char *key;

    do {
	switch (*cp) {
	case '\0':
	case '#':
	    /* end of line or start of comment: nothing more to parse */
	    return 0;
	case ' ':
	case '\t':
	case '\n':
	    /* skip leading white-space */
	    cp ++;
	    break;
	default:
	    key = cp;

	    while (*cp != '\0' &&
		   !isspace(static_cast<unsigned char>(*cp)) &&
		   *cp != '=')
	    {
		cp ++;
	    }

	    if (*cp == '=') {
		*(cp++) = '\0';		// terminate key string

		value = cp;		// beginning of value string
		char *cpv = cp;		// target location for copying value
		char inquote = '\0';

		/*
		 * Quotes are only treated specially if they
		 * occur in first position
		 */
		if (useQuotes &&
		    (*cp == HTK_single_quote || *cp == HTK_double_quote))
		{
		    inquote = *(cp++);
		}

		while (*cp != '\0') {
		    if (useQuotes && *cp == HTK_escape_quote) {
			/*
			 * Backslash quote processing
			 */
			cp ++;

			if (*cp == '\0') {
			    /*
			     * Shouldn't happen, we just ignore it
			     */
			    break;
			} else if (*cp == '0') {
			    /*
			     * Octal char code
			     */
			    unsigned charcode = 0;
			    int charlen = 0;	// %n stores an int

			    if (sscanf(cp, "%o%n", &charcode, &charlen) >= 1 &&
				charlen > 0)
			    {
				*(cpv++) = charcode;
				cp += charlen;
			    } else {
				/*
				 * Cannot happen since *cp is an octal digit,
				 * but make sure we always advance
				 */
				*(cpv++) = *(cp++);
			    }
			} else {
			    /*
			     * Other quoted character
			     */
			    *(cpv++) = *(cp++);
			}
		    } else if (!inquote &&
			       isspace(static_cast<unsigned char>(*cp)))
		    {
			/*
			 * String delimited by white-space
			 */
			cp ++;
			break;
		    } else if (inquote && *cp == inquote) {
			/*
			 * String delimited by end quote
			 */
			cp ++;
			break;
		    } else {
			/*
			 * Character in string
			 */
			*(cpv++) = *(cp++);
		    }
		}
		*cpv = '\0';		// terminate value string
	    } else {
		/*
		 * Key without '=': cp sits on the delimiter (or the end of
		 * the string), so the value is the empty string
		 */
		value = cp;		// beginning of value string
		if (*cp != '\0') {
		    *(cp++) = '\0';	// terminate value string
		}
	    }

	    line = cp;
	    return key;
	}
    } while (1);
}
/*
 * Convert a score string from an HTK lattice into a LogP value.
 *
 * If logbase is greater than zero the string is parsed with parseLogP()
 * and the result multiplied by ProbToLogP(logbase) (presumably a change
 * of log base -- confirm against Prob.h).  A string that fails to parse
 * produces a warning at the current file position and yields LogP_Zero.
 *
 * Otherwise the string is read with atof() and passed through
 * ProbToLogP().
 */
static inline LogP
getHTKscore(const char *value, double logbase, File &file)
{
    if (!(logbase > 0.0)) {
	return ProbToLogP(atof(value));
    }

    LogP parsed;

    if (!parseLogP(value, parsed)) {
	file.position() << "warning: malformed HTK log score "
			<< value << endl;
	return LogP_Zero;
    }

    return parsed * ProbToLogP(logbase);
}
/*
* Output quoted version of string
*/
static void
printQuoted(FILE *f, const char *name, Boolean useQuotes)
{
Boolean octalPrinted = false;
if (!useQuotes) {
fputs(name, f);
} else {
for (const char *cp = name; *cp != '\0'; cp ++) {
if (*cp == ' ' || *cp == HTK_escape_quote ||
cp == name &&
(*cp == HTK_single_quote || *cp == HTK_double_quote) ||
octalPrinted && isdigit(*cp))
{
/*
* This character needs to be quoted
*/
putc(HTK_escape_quote, f);
putc(*cp, f);
octalPrinted = false;
} else if (!isprint(*cp) || isspace(*cp)) {
/*
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -