📄 hidden-ngram.cc
字号:
cout << lm.vocab.getWord(wids[i]) << " ";
if (hiddenWids[n][i] != noEventIndex) {
cout << lm.vocab.getWord(hiddenWids[n][i]) << " ";
}
}
cout << endl;
}
}
for (unsigned n = 0; n < numNbest; n++) {
delete [] hiddenWids[n];
}
}
/*
 * Disambiguate a combined text+map file and print the result on stdout.
 *
 * Every input line holds one observed word followed by its candidate
 * hidden events; an event may be followed by a number, which is then used
 * as that event's map value.  Lines are accumulated into a sentence until
 * the sentence-end word, an escape line, or the end of input is seen; each
 * sentence is then handed to disambiguateSentence().
 */
void
disambiguateTextMap(File &file, SubVocab &hiddenVocab, LM &lm,
                    NgramCounts<NgramFractCount> *hiddenCounts,
                    unsigned numNbest)
{
    const unsigned escapeLen = escape ? strlen(escape) : 0;
    char *line;

    while ((line = file.getline())) {
        /*
         * The per-sentence map is keyed by word *position* (stored in the
         * slot where a VocabIndex would normally go), not by word identity.
         */
        PosVocabMap map(hiddenVocab);
        unsigned numWords = 0;
        Array<VocabIndex> wids;
        Boolean haveEscape = false;

        /*
         * Collect the lines belonging to one sentence
         */
        for (;;) {
            /*
             * An escape line ends the current sentence.  It is echoed
             * verbatim, but only after the sentence collected so far
             * has been processed (see the end of the outer loop).
             */
            if (escape && strncmp(line, escape, escapeLen) == 0) {
                haveEscape = true;
                break;
            }

            /*
             * Split the map line into fields
             */
            VocabString mapFields[maxWordsPerLine];
            unsigned numFields =
                Vocab::parseWords(line, mapFields, maxWordsPerLine);

            if (numFields == maxWordsPerLine) {
                file.position() << "text map line has too many fields\n";
                return;
            }

            /*
             * Field 0: the observed word
             */
            wids[numWords] =
                lm.vocab.getIndex(mapFields[0], lm.vocab.unkIndex());

            /*
             * Fields 1..: hidden events, each optionally followed by a
             * number.  addWord is used so that unseen event names get
             * added to the hidden vocabulary on the fly.
             */
            for (unsigned f = 1; f < numFields; ) {
                VocabIndex event = hiddenVocab.addWord(mapFields[f]);
                f ++;

                double prob;
                if (f < numFields && sscanf(mapFields[f], "%lf", &prob)) {
                    /* the next field was numeric: consume it */
                    f ++;
                } else {
                    /* no number given: fall back to the default value */
                    prob = logMap ? LogP_One : 1.0;
                }

                map.put((VocabIndex)numWords, event, prob);
            }

            /*
             * The word just stored is counted even when it is the
             * sentence-end token; only then do we decide whether to go on.
             */
            const Boolean sentenceEnd =
                (wids[numWords] == lm.vocab.seIndex());
            numWords ++;

            if (sentenceEnd) {
                break;
            }

            line = file.getline();
            if (!line) {
                break;
            }
        }

        if (numWords > 0) {
            wids[numWords] = Vocab_None;

            makeArray(VocabIndex *, hiddenWids, numNbest);
            makeArray(LogP, totalProb, numNbest);

            for (unsigned k = 0; k < numNbest; k++) {
                hiddenWids[k] = new VocabIndex[numWords + 1];
                assert(hiddenWids[k] != 0);
            }

            unsigned numHyps =
                disambiguateSentence(&wids[0], hiddenWids, totalProb,
                                     map, lm, hiddenCounts, numNbest, true);

            if (numHyps == 0) {
                file.position() << "Disambiguation failed\n";
            } else if (totals) {
                cout << totalProb[0] << endl;
            } else if (!posteriors) {
                for (unsigned k = 0; k < numHyps; k++) {
                    if (numNbest > 1) {
                        cout << "NBEST_" << k << " " << totalProb[k] << " ";
                    }

                    /*
                     * Print each observed word, followed by its hidden
                     * event unless that event is the no-event token
                     */
                    VocabIndex *hyp = hiddenWids[k];
                    for (unsigned pos = 0; hyp[pos] != Vocab_None; pos ++) {
                        cout << lm.vocab.getWord(wids[pos]) << " ";
                        if (hyp[pos] != noEventIndex) {
                            cout << lm.vocab.getWord(hyp[pos]) << " ";
                        }
                    }
                    cout << endl;
                }
            }

            for (unsigned k = 0; k < numNbest; k++) {
                delete [] hiddenWids[k];
            }
        }

        /*
         * Now that the sentence is done, emit the deferred escape line
         */
        if (haveEscape) {
            cout << line;
        }
    }
}
/*
 * Load one mixture-component LM from `filename' and, if `oldLM' is given,
 * wrap both in a BayesMix.
 *
 *  filename    LM file to read; a format error is fatal (exit status 1)
 *  vocab       vocabulary shared by all models
 *  classVocab  class vocabulary, or 0 if no class-based model is wanted
 *  order       N-gram order of the model to create
 *  oldLM       model to interpolate with, or 0 to return the new LM as is
 *  lambda1,
 *  lambda2     the value lambda1/lambda2 (0 when lambda1 is 0) is passed
 *              to BayesMix as the mixture weight
 *
 * Returns the BayesMix when oldLM is non-null, otherwise the newly read LM.
 */
LM *
makeMixLM(const char *filename, Vocab &vocab, SubVocab *classVocab,
          unsigned order, LM *oldLM, double lambda1, double lambda2)
{
    File file(filename, "r");

    /*
     * Pick the model type: factored takes precedence, then class-based
     * (simple or general) if a class vocabulary was supplied, otherwise
     * a regular ngram
     */
    Ngram *lm;
    if (factored) {
        lm = new ProductNgram((ProductVocab &)vocab, order);
    } else if (classVocab != 0) {
        if (simpleClasses) {
            lm = new SimpleClassNgram(vocab, *classVocab, order);
        } else {
            lm = new ClassNgram(vocab, *classVocab, order);
        }
    } else {
        lm = new Ngram(vocab, order);
    }
    assert(lm != 0);

    lm->debugme(debug);

    if (!lm->read(file)) {
        cerr << "format error in mix-lm file " << filename << endl;
        exit(1);
    }

    /*
     * Each class LM needs to read the class definitions.
     * Only do this when a class-based model was actually constructed
     * above; testing classesFile alone would downcast a plain Ngram (or
     * ProductNgram) to ClassNgram whenever the caller passed no class
     * vocabulary.
     */
    if (!factored && classVocab != 0 && classesFile != 0) {
        File file(classesFile, "r");
        ((ClassNgram *)lm)->readClasses(file);
    }

    if (oldLM) {
        /*
         * Compute mixture lambda (make sure 0/0 = 0)
         */
        Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;

        LM *newLM = new BayesMix(vocab, *lm, *oldLM, 0, lambda);
        assert(newLM != 0);

        newLM->debugme(debug);

        return newLM;
    } else {
        return lm;
    }
}
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
if (factored && classesFile) {
cerr << "factored and class N-gram models are mutually exclusive\n";
exit(2);
}
if (numNbest <= 0) numNbest = 1;
// Silent fix. Ought to say something here.
/*
* Construct language model
*/
Vocab *vocab;
vocab = factored ? new ProductVocab : new Vocab;
assert(vocab != 0);
vocab->unkIsWord() = keepUnk ? true : false;
vocab->toLower() = toLower ? true : false;
if (factored) {
((ProductVocab *)vocab)->nullIsWord() = keepnull ? true : false;
}
SubVocab hiddenVocab(*vocab);
SubVocab *classVocab = 0;
LM *hiddenLM = 0;
NgramCounts<NgramFractCount> *hiddenCounts = 0;
if (lmFile) {
File file(lmFile, "r");
/*
* create based N-gram model (either factored, word or class-based)
*/
if (factored) {
hiddenLM = new ProductNgram(*(ProductVocab *)vocab, order);
} else if (classesFile) {
classVocab = new SubVocab(*vocab);
assert(classVocab != 0);
if (simpleClasses) {
hiddenLM = new SimpleClassNgram(*vocab, *classVocab, order);
} else {
cerr << "warning: state space will get very large; consider using -simple-classes\n";
hiddenLM = new ClassNgram(*vocab, *classVocab, order);
}
} else {
hiddenLM = new Ngram(*vocab, order);
}
assert(hiddenLM != 0);
hiddenLM->debugme(debug);
hiddenLM->read(file);
if (classesFile) {
File file(classesFile, "r");
((ClassNgram *)hiddenLM)->readClasses(file);
}
} else {
hiddenLM = new NullLM(*vocab);
assert(hiddenLM != 0);
hiddenLM->debugme(debug);
}
/*
* Build the full LM used for hidden event decoding
*/
LM *useLM = hiddenLM;
if (mixFile) {
/*
* create a Bayes mixture LM
*/
double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
mixLambda4 - mixLambda5 - mixLambda6 -
mixLambda7 - mixLambda8 - mixLambda9;
useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
mixLambda1,
mixLambda + mixLambda1);
if (mixFile2) {
useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
mixLambda2,
mixLambda + mixLambda1 + mixLambda2);
}
if (mixFile3) {
useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
mixLambda3,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3);
}
if (mixFile4) {
useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
mixLambda4,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4);
}
if (mixFile5) {
useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
mixLambda5,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5);
}
if (mixFile6) {
useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
mixLambda6,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6);
}
if (mixFile7) {
useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
mixLambda7,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7);
}
if (mixFile8) {
useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
mixLambda8,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7 + mixLambda8);
}
if (mixFile9) {
useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
mixLambda9, 1.0);
}
}
/*
* Make sure noevent token is not used in LM
*/
if (hiddenVocab.getIndex(noHiddenEvent) != Vocab_None) {
cerr << "LM must not contain " << noHiddenEvent << endl;
exit(1);
}
/*
* Allocate fractional counts tree
*/
if (countsFile) {
hiddenCounts = new NgramCounts<NgramFractCount>(*vocab, order);
assert(hiddenCounts);
hiddenCounts->debugme(debug);
}
/*
* Read event vocabulary
*/
if (hiddenVocabFile) {
File file(hiddenVocabFile, "r");
hiddenVocab.read(file);
}
if (forceEvent) {
/*
* Omit the noevent token from hidden vocabulary.
* We still have to assign an index to it, so just use the regular
* vocabulary.
*/
noEventIndex = vocab->addWord(noHiddenEvent);
} else {
/*
* Add noevent token to hidden vocabulary
*/
noEventIndex = hiddenVocab.addWord(noHiddenEvent);
}
if (textFile) {
File file(textFile, "r");
if (continuous) {
disambiguateFileContinuous(file, hiddenVocab, *useLM,
hiddenCounts, numNbest);
} else {
disambiguateFile(file, hiddenVocab, *useLM, hiddenCounts, numNbest);
}
}
if (textMapFile) {
File file(textMapFile, "r");
disambiguateTextMap(file, hiddenVocab, *useLM, hiddenCounts, numNbest);
}
if (countsFile) {
File file(countsFile, "w");
hiddenCounts->write(file, 0, true);
}
#ifdef DEBUG
delete hiddenLM;
delete hiddenCounts;
return 0;
#endif /* DEBUG */
exit(0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -