📄 hidden-ngram.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
		cout << lm.vocab.getWord(wids[i]) << " ";

		if (hiddenWids[n][i] != noEventIndex) {
		    cout << lm.vocab.getWord(hiddenWids[n][i]) << " ";
		}
	    }
	    cout << endl;
	}
    }
    for (unsigned n = 0; n < numNbest; n++) {
	delete [] hiddenWids[n];
    }
}

/*
 * Disambiguate a combined text+map file and print the result.
 *
 * Each input line holds an observed word followed by hidden event names,
 * each optionally followed by a number that is used as that event's score.
 * Lines are collected into a sentence until the end-of-sentence word,
 * an escape line, or the end of the file is reached.
 *
 *	file		input text+map file
 *	hiddenVocab	vocabulary of hidden events (new events are added)
 *	lm		language model used for decoding
 *	hiddenCounts	passed through to disambiguateSentence() (may be 0)
 *	numNbest	number of hypotheses to allocate/request per sentence
 */
void
disambiguateTextMap(File &file, SubVocab &hiddenVocab, LM &lm,
		    NgramCounts<NgramFractCount> *hiddenCounts,
		    unsigned numNbest)
{
    const unsigned escapeLen = escape ? strlen(escape) : 0;

    char *line;

    while ((line = file.getline()) != 0) {

	/*
	 * Hack alert! The map entries for the word instances are stored in
	 * a VocabMap, but the first VocabIndex of each entry is the word's
	 * position in the sentence, not its identity.
	 */
	PosVocabMap map(hiddenVocab);

	Array<VocabIndex> wids;
	unsigned numWords = 0;

	/*
	 * Set when the sentence was cut short by an escape line; that line
	 * is echoed only after the sentence has been processed.
	 */
	Boolean haveEscape = false;

	/*
	 * Collect one sentence, one map line per word
	 */
	for (;;) {
	    /*
	     * Escaped lines are passed through unprocessed and also
	     * terminate the current sentence.
	     */
	    if (escape && strncmp(line, escape, escapeLen) == 0) {
		haveEscape = true;
		break;
	    }

	    /*
	     * Split the map line into fields
	     */
	    VocabString mapFields[maxWordsPerLine];

	    unsigned numFields =
			Vocab::parseWords(line, mapFields, maxWordsPerLine);

	    if (numFields == maxWordsPerLine) {
		file.position() << "text map line has too many fields\n";
		return;
	    }

	    /*
	     * Field 0 is the observed word
	     */
	    wids[numWords] =
			lm.vocab.getIndex(mapFields[0], lm.vocab.unkIndex());

	    /*
	     * The remaining fields are hidden events, each optionally
	     * followed by a score.
	     */
	    for (unsigned field = 1; field < numFields; ) {
		/*
		 * addWord creates new event names as needed
		 * (so the -hidden-vocab option becomes optional).
		 */
		VocabIndex event = hiddenVocab.addWord(mapFields[field]);
		field ++;

		double prob;

		if (field < numFields &&
		    sscanf(mapFields[field], "%lf", &prob) != 0)
		{
		    field ++;
		} else {
		    prob = logMap ? LogP_One : 1.0;
		}

		map.put((VocabIndex)numWords, event, prob);
	    }

	    /*
	     * The word just read counts towards the sentence; stop after
	     * the end-of-sentence word or when the input is exhausted.
	     */
	    VocabIndex lastWord = wids[numWords];
	    numWords ++;

	    if (lastWord == lm.vocab.seIndex()) {
		break;
	    }

	    line = file.getline();
	    if (line == 0) {
		break;
	    }
	}

	if (numWords > 0) {
	    wids[numWords] = Vocab_None;

	    makeArray(VocabIndex *, hiddenWids, numNbest);
	    makeArray(LogP, totalProb, numNbest);

	    for (unsigned h = 0; h < numNbest; h++) {
		hiddenWids[h] = new VocabIndex[numWords + 1];
		assert(hiddenWids[h] != 0);
	    }

	    unsigned numHyps =
		    disambiguateSentence(&wids[0], hiddenWids, totalProb,
					 map, lm, hiddenCounts, numNbest, true);

	    if (!numHyps) {
		file.position() << "Disambiguation failed\n";
	    } else if (totals) {
		cout << totalProb[0] << endl;
	    } else if (!posteriors) {
		/*
		 * Print each hypothesis as the observed words interleaved
		 * with the hidden events chosen for them.
		 */
		for (unsigned h = 0; h < numHyps; h++) {
		    if (numNbest > 1) {
		      cout << "NBEST_" << h << " " << totalProb[h] << " ";
		    }

		    VocabIndex *events = hiddenWids[h];

		    for (unsigned pos = 0; events[pos] != Vocab_None; pos ++) {
			cout << lm.vocab.getWord(wids[pos]) << " ";

			if (events[pos] != noEventIndex) {
			    cout << lm.vocab.getWord(events[pos]) << " ";
			}
		    }
		    cout << endl;
		}
	    }

	    for (unsigned h = 0; h < numNbest; h++) {
		delete [] hiddenWids[h];
	    }
	}

	/*
	 * Deferred output of the escape line that ended the sentence
	 */
	if (haveEscape) {
	    cout << line;
	}
    }
}

/*
 * Read an N-gram LM from a file and optionally mix it with an existing LM.
 *
 *	filename	LM file to read; a format error terminates the program
 *	vocab		vocabulary shared by the LMs
 *	classVocab	class vocabulary; if non-null (and not in factored
 *			mode) a class-based N-gram is created
 *	order		N-gram order
 *	oldLM		LM to mix with, or 0 to return the new LM unmixed
 *	lambda1,
 *	lambda2		the weight handed to BayesMix is lambda1/lambda2
 *			(taken as 0 when lambda1 is 0)
 *
 * Returns the new LM if oldLM is 0, otherwise a BayesMix of the new LM
 * and oldLM.
 */
LM *
makeMixLM(const char *filename, Vocab &vocab, SubVocab *classVocab,
		    unsigned order, LM *oldLM, double lambda1, double lambda2)
{
    File file(filename, "r");

    /*
     * create class-ngram if -classes were specified, otherwise a regular ngram
     * (factored mode always gets a ProductNgram)
     */
    Boolean isClassLM = !factored && classVocab != 0;

    Ngram *lm;

    if (factored) {
	lm = new ProductNgram((ProductVocab &)vocab, order);
    } else if (isClassLM) {
	if (simpleClasses) {
	    lm = new SimpleClassNgram(vocab, *classVocab, order);
	} else {
	    lm = new ClassNgram(vocab, *classVocab, order);
	}
    } else {
	lm = new Ngram(vocab, order);
    }
    assert(lm != 0);

    lm->debugme(debug);

    if (!lm->read(file)) {
	cerr << "format error in mix-lm file " << filename << endl;
	exit(1);
    }

    /*
     * Each class LM needs to read the class definitions.
     * Only do so when a class N-gram was actually constructed above:
     * casting a plain Ngram or ProductNgram to ClassNgram is invalid.
     */
    if (classesFile != 0) {
	if (isClassLM) {
	    File classDefs(classesFile, "r");
	    ((ClassNgram *)lm)->readClasses(classDefs);
	} else {
	    cerr << "warning: class definitions not read for mix-lm file "
		 << filename << endl;
	}
    }

    if (oldLM) {
	/*
	 * Compute mixture lambda (make sure 0/0 = 0)
	 */
	Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;

	LM *newLM = new BayesMix(vocab, *lm, *oldLM, 0, lambda);
	assert(newLM != 0);

	newLM->debugme(debug);

	return newLM;
    } else {
	return lm;
    }
}

int
main(int argc, char **argv)
{
    setlocale(LC_CTYPE, "");
    setlocale(LC_COLLATE, "");

    Opt_Parse(argc, argv, options, Opt_Number(options), 0);

    if (version) {
	printVersion(RcsId);
	exit(0);
    }

    if (factored && classesFile) {
	cerr << "factored and class N-gram models are mutually exclusive\n";
	exit(2);
    }

    if (numNbest <= 0) numNbest = 1;		  
				// Silent fix.  Ought to say something here.

    /*
     * Construct language model
     */
    Vocab *vocab;

    vocab = factored ? new ProductVocab : new Vocab;
    assert(vocab != 0);

    vocab->unkIsWord() = keepUnk ? true : false;
    vocab->toLower() = toLower ? true : false;

    if (factored) {
	((ProductVocab *)vocab)->nullIsWord() = keepnull ? true : false;
    }

    SubVocab hiddenVocab(*vocab);
    SubVocab *classVocab = 0;

    LM    *hiddenLM = 0;
    NgramCounts<NgramFractCount> *hiddenCounts = 0;

    if (lmFile) {
	File file(lmFile, "r");

	/*
	 * create based N-gram model (either factored,  word or class-based)
	 */
	if (factored) {
	    hiddenLM = new ProductNgram(*(ProductVocab *)vocab, order);
	} else if (classesFile) {
	    classVocab = new SubVocab(*vocab);
	    assert(classVocab != 0);

	    if (simpleClasses) {
		hiddenLM = new SimpleClassNgram(*vocab, *classVocab, order);
	    } else {
		cerr << "warning: state space will get very large; consider using -simple-classes\n";
		hiddenLM = new ClassNgram(*vocab, *classVocab, order);
	    }
	} else {
	    hiddenLM = new Ngram(*vocab, order);
	}
	assert(hiddenLM != 0);

	hiddenLM->debugme(debug);
	hiddenLM->read(file);

	if (classesFile) {
	    File file(classesFile, "r");
	    ((ClassNgram *)hiddenLM)->readClasses(file);
	}
    } else {
	hiddenLM = new NullLM(*vocab);
	assert(hiddenLM != 0);
	hiddenLM->debugme(debug);
    }

    /*
     * Build the full LM used for hidden event decoding
     */
    LM *useLM = hiddenLM;

    if (mixFile) {
	/*
	 * create a Bayes mixture LM 
	 */
	double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
				mixLambda4 - mixLambda5 - mixLambda6 -
				mixLambda7 - mixLambda8 - mixLambda9;

	useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
				mixLambda1,
				mixLambda + mixLambda1);

	if (mixFile2) {
	    useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
				mixLambda2,
				mixLambda + mixLambda1 + mixLambda2);
	}
	if (mixFile3) {
	    useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
				mixLambda3,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3);
	}
	if (mixFile4) {
	    useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
				mixLambda4,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3 + mixLambda4);
	}
	if (mixFile5) {
	    useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
				mixLambda5,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3 + mixLambda4 + mixLambda5);
	}
	if (mixFile6) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
				mixLambda6,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3 + mixLambda4 + mixLambda5 +
				mixLambda6);
	}
	if (mixFile7) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
				mixLambda7,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3 + mixLambda4 + mixLambda5 +
				mixLambda6 + mixLambda7);
	}
	if (mixFile8) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
				mixLambda8,
				mixLambda + mixLambda1 + mixLambda2 +
				mixLambda3 + mixLambda4 + mixLambda5 +
				mixLambda6 + mixLambda7 + mixLambda8);
	}
	if (mixFile9) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
				mixLambda9, 1.0);
	}
    }

    /*
     * Make sure noevent token is not used in LM
     */
    if (hiddenVocab.getIndex(noHiddenEvent) != Vocab_None) {
	cerr << "LM must not contain " << noHiddenEvent << endl;
	exit(1);
    }

    /*
     * Allocate fractional counts tree
     */
    if (countsFile) {
	hiddenCounts = new NgramCounts<NgramFractCount>(*vocab, order);
	assert(hiddenCounts);
	hiddenCounts->debugme(debug);
    }

    /*
     * Read event vocabulary
     */
    if (hiddenVocabFile) {
	File file(hiddenVocabFile, "r");

	hiddenVocab.read(file);
    }

    if (forceEvent) {
	/*
	 * Omit the noevent token from hidden vocabulary.
	 * We still have to assign an index to it, so just use the regular
	 * vocabulary.
	 */
	noEventIndex = vocab->addWord(noHiddenEvent);
    } else {
	/*
	 * Add noevent token to hidden vocabulary
	 */
	noEventIndex = hiddenVocab.addWord(noHiddenEvent);
    }

    if (textFile) {
	File file(textFile, "r");

	if (continuous) {
	    disambiguateFileContinuous(file, hiddenVocab, *useLM,
							hiddenCounts, numNbest);
	} else {
	    disambiguateFile(file, hiddenVocab, *useLM, hiddenCounts, numNbest);
	}
    }

    if (textMapFile) {
	File file(textMapFile, "r");

	disambiguateTextMap(file, hiddenVocab, *useLM, hiddenCounts, numNbest);
    }

    if (countsFile) {
	File file(countsFile, "w");

	hiddenCounts->write(file, 0, true);
    }

#ifdef DEBUG
    delete hiddenLM;
    delete hiddenCounts;

    return 0;
#endif /* DEBUG */

    exit(0);
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -