📄 vocabmap.cc

📁 这是一款很好用的工具包
💻 CC
字号:
/*
 * VocabMap.cc --
 *	Probabilistic mappings between vocabularies
 *
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1995,1998,2003 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/VocabMap.cc,v 1.13 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <iostream>
using namespace std;
#include <string.h>
#include <stdlib.h>
#include <assert.h>

#include "VocabMap.h"

#include "Map2.cc"
#ifdef INSTANTIATE_TEMPLATES
INSTANTIATE_MAP2(VocabIndex,VocabIndex,Prob);
#endif

/*
 * Build a map from items of v1 to items of v2.
 * For each of the special items reported by ssIndex(), seIndex() and
 * unkIndex(), if both vocabularies define it, the v1 item is mapped to the
 * corresponding v2 item with unit probability (LogP_One when logmap is set,
 * 1.0 otherwise).
 */
VocabMap::VocabMap(Vocab &v1, Vocab &v2, Boolean logmap)
    : vocab1(v1), vocab2(v2), logmap(logmap)
{
    /* value stored for every default mapping below */
    const Prob unitProb = logmap ? LogP_One : 1.0;

    if (v1.ssIndex() != Vocab_None && v2.ssIndex() != Vocab_None) {
	*map.insert(v1.ssIndex(), v2.ssIndex()) = unitProb;
    }

    if (v1.seIndex() != Vocab_None && v2.seIndex() != Vocab_None) {
	*map.insert(v1.seIndex(), v2.seIndex()) = unitProb;
    }

    if (v1.unkIndex() != Vocab_None && v2.unkIndex() != Vocab_None) {
	*map.insert(v1.unkIndex(), v2.unkIndex()) = unitProb;
    }
}

/*
 * Return the value stored for the pair (w1, w2).
 * A pair without an entry has probability zero.  For a log-domain map that
 * is LogP_Zero; returning 0.0 there would be read as LogP_One, i.e., as
 * probability one.
 */
Prob
VocabMap::get(VocabIndex w1, VocabIndex w2)
{
    Prob *prob = map.find(w1, w2);

    if (prob == 0) {
	return logmap ? LogP_Zero : 0.0;
    }
    return *prob;
}

/*
 * Assign prob to the entry that map.insert() returns for the pair (w1, w2)
 */
void
VocabMap::put(VocabIndex w1, VocabIndex w2, Prob prob)
{
    Prob *entry = map.insert(w1, w2);

    *entry = prob;
}

/*
 * Remove the entry for the pair (w1, w2) from the underlying map
 */
void
VocabMap::remove(VocabIndex w1, VocabIndex w2)
{
    /* the result of the removal is deliberately not used */
    map.remove(w1, w2);
}

/*
 * Remove what the underlying map holds under the first-level key w1
 */
void
VocabMap::remove(VocabIndex w1)
{
    /* the result of the removal is deliberately not used */
    map.remove(w1);
}

/*
 * Read map entries from file, one source word per line:
 *	w1 w2 [prob] w2 [prob] ...
 * The first field is added to vocab1.  Each following word is added to
 * vocab2 and may be followed by a number, which is stored as its value;
 * a target without a number gets the unit probability (LogP_One for log
 * maps, 1.0 otherwise).  If the source word already has entries they are
 * removed first and a warning is sent to file.position().
 * Returns false as soon as a line fills all maxWordsPerLine fields,
 * true once getline() is exhausted.
 */
Boolean
VocabMap::read(File &file)
{
    char *line;

    while ((line = file.getline()) != 0) {
	VocabString words[maxWordsPerLine];

	unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);

	if (howmany == maxWordsPerLine) {
	    file.position() << "map line has too many fields\n";
	    return false;
	}

	/*
	 * Nothing was parsed, so there is no source word to use
	 * (words[] is an uninitialized local); skip the line
	 */
	if (howmany == 0) {
	    continue;
	}

	/*
	 * The first word is always the source of the map
	 */
	VocabIndex w1 = vocab1.addWord(words[0]);

	if (map.numEntries(w1) > 0) {
	    file.position() << "warning: map redefining entry "
			    << words[0] << endl;
	    map.remove(w1);
	}

	/*
	 * Parse the remaining words as either probs or target words
	 */
	unsigned i = 1;

	while (i < howmany) {
	    double prob;

	    VocabIndex w2 = vocab2.addWord(words[i++]);

	    /*
	     * sscanf() returns the number of conversions, or EOF (which is
	     * non-zero); only a count of 1 means prob was actually set
	     */
	    if (i < howmany && sscanf(words[i], "%lf", &prob) == 1) {
		i ++;
	    } else {
		prob = logmap ? LogP_One : 1.0;
	    }

	    *(map.insert(w1, w2)) = prob;
	}
    }

    return true;
}

/*
 * Read classes(5) format file, interpreted as VocabMap
 * (mostly borrowed from ClassNgram::readClasses())
 *
 * Each line is
 *	class [prob] word
 * The class name is added to vocab2, the word to vocab1, and the entry
 * (word, class) is set to prob, or to the unit probability (LogP_One for
 * log maps, 1.0 otherwise) when no number is given.
 * Returns false if a line fills all maxWordsPerLine fields or does not
 * have exactly one expansion word, true once getline() is exhausted.
 */
Boolean
VocabMap::readClasses(File &file)
{
    char *line;

    while ((line = file.getline()) != 0) {
	VocabString words[maxWordsPerLine];

	unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);

	if (howmany == maxWordsPerLine) {
	    file.position() << "class definition has too many fields\n";
	    return false;
	}

	/*
	 * A line without any field has no class name and no expansion word.
	 * Reject it here, before words[0] (an uninitialized local in that
	 * case) is used.
	 */
	if (howmany == 0) {
	    file.position() << "class definition must have exactly one word\n";
	    return false;
	}

	/*
	 * First word contains class name
	 */
	VocabIndex clasz = vocab2.addWord(words[0]);

	double prob = logmap ? LogP_One : 1.0;
	unsigned numExpansionWords;

	/*
	 * If second word is numeral, assume it's the class expansion prob.
	 * sscanf() returns the number of conversions, or EOF (which is
	 * non-zero), so compare against 1 rather than testing for non-zero.
	 */
	if (howmany > 1 && sscanf(words[1], "%lf", &prob) == 1) {
	    numExpansionWords = howmany - 2;
	} else {
	    numExpansionWords = howmany - 1;
	}

	if (numExpansionWords != 1) {
	    file.position() << "class definition must have exactly one word\n";
	    return false;
	}

	/*
	 * With exactly one expansion word it is the last field on the line
	 */
	VocabIndex expansionWord = vocab1.addWord(words[howmany - 1]);

	*(map.insert(expansionWord, clasz)) = prob;
    }

    return true;
}

/*
 * Write the map to file, one line per source word:
 *	w1<TAB>w2 [prob] w2 [prob] ...
 * The first target is preceded by a tab, later ones by a space.  A
 * target's value is printed after it unless it equals the unit probability
 * (LogP_One for log maps, 1.0 otherwise).
 * Always returns true.
 */
Boolean
VocabMap::write(File &file)
{
    Map2Iter<VocabIndex,VocabIndex,Prob> iter1(map);

    VocabIndex w1;

    while (iter1.next(w1)) {

	VocabString word1 = vocab1.getWord(w1);
	assert(word1 != 0);

	fprintf(file, "%s", word1);

	Map2Iter2<VocabIndex,VocabIndex,Prob> iter2(map, w1);

	VocabIndex w2;
	Prob *prob;

	unsigned i = 0;
	while ((prob = iter2.next(w2)) != 0) {
	    VocabString word2 = vocab2.getWord(w2);
	    /* word2 is passed to a %s conversion below, so it must be set */
	    assert(word2 != 0);

	    char sep = (i++ == 0)  ? '\t' : ' ';

	    if (*prob == (logmap ? LogP_One : 1.0)) {
		fprintf(file, "%c%s", sep, word2);
	    } else {
		fprintf(file, "%c%s %lg", sep, word2, *prob);
	    }
	}
	fprintf(file, "\n");
    }
    return true;
}

/*
 * Write VocabMap in bigram count file format
 *
 * Every map entry (w1, w2) produces one line
 *	word2 word1<TAB>prob
 * Always returns true.
 */
Boolean
VocabMap::writeBigrams(File &file)
{
    Map2Iter<VocabIndex,VocabIndex,Prob> iter1(map);

    VocabIndex w1;

    while (iter1.next(w1)) {
	VocabString word1 = vocab1.getWord(w1);
	assert(word1 != 0);

	Map2Iter2<VocabIndex,VocabIndex,Prob> iter2(map, w1);

	VocabIndex w2;
	Prob *prob;

	while ((prob = iter2.next(w2)) != 0) {
	    VocabString word2 = vocab2.getWord(w2);
	    /* word2 is passed to a %s conversion below, so it must be set */
	    assert(word2 != 0);

	    // prob = P(word1|word2), hence the bigram word order
	    fprintf(file, "%s %s\t%lg\n", word2, word1, *prob);
	}
    }
    return true;
}

/*
 * Iteration over map entries
 */

/*
 * Bind the underlying iterator to vmap's map and the first-level key w
 */
VocabMapIter::VocabMapIter(VocabMap &vmap, VocabIndex w)
    : mapIter(vmap.map, w)
{
}

void
VocabMapIter::init()
{
    mapIter.init();
}

/*
 * Advance the underlying iterator.
 * If it yields an entry, w receives its key (set by mapIter.next()), prob
 * receives a copy of its value, and true is returned.  Otherwise false is
 * returned and prob is left unchanged.
 */
Boolean
VocabMapIter::next(VocabIndex &w, Prob &prob)
{
    Prob *entry = mapIter.next(w);

    if (entry == 0) {
	return false;
    }

    prob = *entry;
    return true;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -