📄 vocabmap.cc
字号:
/*
* VocabMap.cc --
* Probabilistic mappings between vocabularies
*
*/
/*
 * Copyright notice and RCS revision header, kept as file-local strings.
 * Both definitions are left out when `lint' is defined.
 */
#ifndef lint
static char Copyright[] = "Copyright (c) 1995,1998,2003 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/VocabMap.cc,v 1.13 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "VocabMap.h"
#include "Map2.cc"
/*
 * When INSTANTIATE_TEMPLATES is defined, invoke INSTANTIATE_MAP2 for the
 * <VocabIndex,VocabIndex,Prob> combination that the iterators in this file
 * use (presumably an explicit instantiation of the Map2 templates pulled in
 * from Map2.cc -- confirm against the macro definition).
 */
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_MAP2(VocabIndex,VocabIndex,Prob);
#endif
/*
 * Construct a map from words of v1 to words of v2.
 *
 * v1, v2	source and target vocabularies (held by reference)
 * logmap	if true, default entries are stored as LogP_One instead of 1.0
 *
 * The special items (as returned by ssIndex(), seIndex() and unkIndex())
 * are mapped onto their counterparts, but only when both vocabularies
 * define the item in question.
 */
VocabMap::VocabMap(Vocab &v1, Vocab &v2, Boolean logmap)
    : vocab1(v1), vocab2(v2), logmap(logmap)
{
    /* value stored for each default mapping */
    const Prob certain = logmap ? LogP_One : 1.0;

    if (v1.ssIndex() != Vocab_None && v2.ssIndex() != Vocab_None)
    {
        Prob *slot = map.insert(v1.ssIndex(), v2.ssIndex());
        *slot = certain;
    }

    if (v1.seIndex() != Vocab_None && v2.seIndex() != Vocab_None)
    {
        Prob *slot = map.insert(v1.seIndex(), v2.seIndex());
        *slot = certain;
    }

    if (v1.unkIndex() != Vocab_None && v2.unkIndex() != Vocab_None)
    {
        Prob *slot = map.insert(v1.unkIndex(), v2.unkIndex());
        *slot = certain;
    }
}
/*
 * Look up the value stored for the pair (w1, w2).
 * Returns the stored value, or 0.0 when the pair has no entry
 * (0.0 is returned whether or not logmap is set).
 */
Prob
VocabMap::get(VocabIndex w1, VocabIndex w2)
{
    const Prob *entry = map.find(w1, w2);

    return entry ? *entry : 0.0;
}
/*
 * Store prob for the pair (w1, w2), creating the entry if needed
 * and overwriting any previous value.
 */
void
VocabMap::put(VocabIndex w1, VocabIndex w2, Prob prob)
{
    Prob *slot = map.insert(w1, w2);

    *slot = prob;
}
/*
 * Delete the entry for the pair (w1, w2).
 * The result reported by the underlying map is deliberately discarded.
 */
void
VocabMap::remove(VocabIndex w1, VocabIndex w2)
{
    static_cast<void>(map.remove(w1, w2));
}
/*
 * Delete everything stored under first key w1.
 * The result reported by the underlying map is deliberately discarded.
 */
void
VocabMap::remove(VocabIndex w1)
{
    static_cast<void>(map.remove(w1));
}
/*
 * Read map entries from file.
 *
 * Each line has the form
 *	w1 w2 [prob] w2' [prob'] ...
 * where w1 is added to vocab1 and every w2 to vocab2.  A target word that
 * is not followed by a number gets the default value (LogP_One for log
 * maps, 1.0 otherwise).  If w1 already has entries they are discarded and
 * a warning is printed.
 *
 * A field counts as a probability only if it is numeric in its entirety,
 * so target words that merely start with digits (e.g. "3rd") are kept as
 * words instead of being swallowed as the preceding word's probability.
 *
 * Returns false (after printing a message) if a line has maxWordsPerLine
 * or more fields, true otherwise.
 */
Boolean
VocabMap::read(File &file)
{
    char *line;

    while ((line = file.getline())) {
        VocabString words[maxWordsPerLine];
        unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);

        if (howmany == maxWordsPerLine) {
            file.position() << "map line has too many fields\n";
            return false;
        }

        /*
         * Nothing to do for a line without fields; words[0] must not
         * be used in that case.
         */
        if (howmany == 0) {
            continue;
        }

        /*
         * The first word is always the source of the map
         */
        VocabIndex w1 = vocab1.addWord(words[0]);

        if (map.numEntries(w1) > 0) {
            file.position() << "warning: map redefining entry "
                            << words[0] << endl;
            map.remove(w1);
        }

        /*
         * Parse the remaining words as either probs or target words
         */
        unsigned i = 1;

        while (i < howmany) {
            VocabIndex w2 = vocab2.addWord(words[i++]);
            double prob = logmap ? LogP_One : 1.0;

            if (i < howmany) {
                /*
                 * Accept the next field as a probability only if strtod
                 * consumes all of it.
                 */
                char *end;
                double value = strtod(words[i], &end);

                if (end != words[i] && *end == '\0') {
                    prob = value;
                    i ++;
                }
            }

            *(map.insert(w1, w2)) = prob;
        }
    }

    return true;
}
/*
 * Read classes(5) format file, interpreted as VocabMap
 * (mostly borrowed from ClassNgram::readClasses())
 *
 * Each line has the form
 *	class [prob] word
 * The class name is added to vocab2, the word to vocab1, and the entry
 * (word, class) is set to prob.  Without a prob field the default value
 * is used (LogP_One for log maps, 1.0 otherwise).
 *
 * The second field counts as a probability only if it is numeric in its
 * entirety, so an expansion word that merely starts with digits
 * (e.g. "3rd") is still accepted as the word.
 *
 * Returns false (after printing a message) if a line has maxWordsPerLine
 * or more fields, or does not contain exactly one expansion word;
 * true otherwise.
 */
Boolean
VocabMap::readClasses(File &file)
{
    char *line;

    while ((line = file.getline())) {
        VocabString words[maxWordsPerLine];
        unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);

        if (howmany == maxWordsPerLine) {
            file.position() << "class definition has too many fields\n";
            return false;
        }

        /*
         * A line without fields has neither class name nor word;
         * reject it before words[0] is used.
         */
        if (howmany == 0) {
            file.position() << "class definition must have exactly one word\n";
            return false;
        }

        /*
         * First word contains class name
         */
        VocabIndex clasz = vocab2.addWord(words[0]);

        double prob = logmap ? LogP_One : 1.0;
        unsigned numExpansionWords = howmany - 1;

        /*
         * If second word is numeral, assume it's the class expansion prob
         */
        if (howmany > 1) {
            char *end;
            double value = strtod(words[1], &end);

            if (end != words[1] && *end == '\0') {
                prob = value;
                numExpansionWords = howmany - 2;
            }
        }

        if (numExpansionWords != 1) {
            file.position() << "class definition must have exactly one word\n";
            return false;
        }

        /* the single expansion word is always the last field */
        VocabIndex expansionWord = vocab1.addWord(words[howmany - 1]);

        *(map.insert(expansionWord, clasz)) = prob;
    }

    return true;
}
/*
 * Write the map to file, one line per source word:
 *	word1<TAB>word2 [prob] word2' [prob'] ...
 * The probability is left out when it equals the default value
 * (LogP_One for log maps, 1.0 otherwise).
 *
 * Always returns true.
 */
Boolean
VocabMap::write(File &file)
{
    Map2Iter<VocabIndex,VocabIndex,Prob> iter1(map);
    VocabIndex w1;

    while (iter1.next(w1)) {
        VocabString word1 = vocab1.getWord(w1);
        assert(word1 != 0);

        fprintf(file, "%s", word1);

        Map2Iter2<VocabIndex,VocabIndex,Prob> iter2(map, w1);
        VocabIndex w2;
        Prob *prob;
        unsigned i = 0;

        while ((prob = iter2.next(w2))) {
            VocabString word2 = vocab2.getWord(w2);
            /* check the target word, which is what gets printed below */
            assert(word2 != 0);

            /* a tab separates the source word from the first target */
            char sep = (i++ == 0) ? '\t' : ' ';

            if (*prob == (logmap ? LogP_One : 1.0)) {
                fprintf(file, "%c%s", sep, word2);
            } else {
                fprintf(file, "%c%s %lg", sep, word2, *prob);
            }
        }
        fprintf(file, "\n");
    }

    return true;
}
/*
 * Write VocabMap in bigram count file format
 *
 * One line is written per map entry:
 *	word2 word1<TAB>prob
 * Always returns true.
 */
Boolean
VocabMap::writeBigrams(File &file)
{
    Map2Iter<VocabIndex,VocabIndex,Prob> iter1(map);
    VocabIndex w1;

    while (iter1.next(w1)) {
        VocabString word1 = vocab1.getWord(w1);
        assert(word1 != 0);

        Map2Iter2<VocabIndex,VocabIndex,Prob> iter2(map, w1);
        VocabIndex w2;
        Prob *prob;

        while ((prob = iter2.next(w2))) {
            VocabString word2 = vocab2.getWord(w2);
            /* check the target word, which is what gets printed below */
            assert(word2 != 0);

            // prob = P(word1|word2), hence the bigram word order
            fprintf(file, "%s %s\t%lg\n", word2, word1, *prob);
        }
    }

    return true;
}
/*
 * Iteration over map entries
 *
 * The iterator is built on the map held by vmap, restricted to the
 * first key w (the underlying iterator is constructed from vmap.map and w).
 */
VocabMapIter::VocabMapIter(VocabMap &vmap, VocabIndex w) :
mapIter(vmap.map, w)
{
}
/*
 * Re-initialize the underlying map iterator (delegates to mapIter.init();
 * presumably this restarts the enumeration -- confirm in Map2).
 */
void
VocabMapIter::init()
{
mapIter.init();
}
/*
 * Advance to the next entry.
 *
 * w	receives the key reported by the underlying iterator
 * prob	receives the value stored for that key; left untouched when the
 *	iteration is exhausted
 *
 * Returns true if an entry was delivered, false otherwise.
 */
Boolean
VocabMapIter::next(VocabIndex &w, Prob &prob)
{
    Prob *entry = mapIter.next(w);

    if (!entry) {
        return false;
    }

    prob = *entry;
    return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -