⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vocab.cpp

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 CPP
字号:
#include "Vocab.h"

// Definition of the static counter Vocab::ID.
// senToIDs() reads it and post-decrements it each time it meets a word that
// is not in wordIndex, so the fake ids it hands out are -1, -2, -3, ...
int Vocab::ID = -1;

using namespace std;

// Creates a vocabulary whose index->word table already holds one empty
// string at position 0, so the first word appended afterwards lands at
// position 1.
Vocab::Vocab()
{
	const string placeholder;
	indexWord.push_back(placeholder);
}

// Loads a tab-separated vocabulary file and then registers the sentence
// markers "<s>" and "</s>".
//
// Line format: "<id>\t<word>[\t...]". The id is the text before the first
// tab; the word is the text between the first and the last tab, or the rest
// of the line when the line contains a single tab.
//
// Each word is stored at indexWord[id] and wordIndex maps the word back to
// the same id, so the two tables stay consistent even when the ids in the
// file are not consecutive or do not start at 1. Lines without a tab, and
// lines whose id parses to a negative number (negative ids are used for the
// fake unknown-word ids kept in unkTMP), are skipped.
//
// @param vcb  path of the vocabulary file.
// @return     true on success; false (after printing a message to cout) when
//             the file cannot be opened.
bool Vocab::load(string vcb)
{
	const string sentStart = "<s>";
	const string sentEnd = "</s>";

	vcbFileName.open(vcb.c_str(), std::ios::in);
	if (!vcbFileName)
	{
		cout << "open vocab " << vcb << " error!" << endl;
		return false;
	}

	string line;
	while (getline(vcbFileName, line))
	{
		const string::size_type firstTab = line.find('\t');
		if (firstTab == string::npos)
		{
			// Not an "<id>\t<word>" record (e.g. a blank line).
			continue;
		}
		const string::size_type lastTab = line.rfind('\t');

		const string id(line, 0, firstTab);
		const string word = (lastTab == firstTab)
			? line.substr(firstTab + 1)
			: line.substr(firstTab + 1, lastTab - firstTab - 1);

		const int index = atoi(id.c_str());
		if (index < 0)
		{
			// Cannot be used as a position in indexWord.
			continue;
		}

		// Place the word at the slot named by its id rather than at the
		// end of the table, growing the table with empty strings if needed.
		const string::size_type slot = static_cast<string::size_type>(index);
		if (slot >= indexWord.size())
		{
			indexWord.resize(slot + 1);
		}
		indexWord[slot] = word;
		wordIndex.insert(make_pair(word, index));
	}
	// Reset the EOF/fail flags left by the read loop before closing.
	vcbFileName.clear();
	vcbFileName.close();

	// The sentence markers take the two ids that follow the largest slot in use.
	const int len = indexWord.size();
	indexWord.push_back(sentStart);
	wordIndex.insert(make_pair(sentStart, len));
	indexWord.push_back(sentEnd);
	wordIndex.insert(make_pair(sentEnd, len + 1));
	return true;
}

int Vocab::getIndex(string word)
{
	WordIndex::iterator pos = wordIndex.find(word);
	if (pos != wordIndex.end()) {
		return pos->second;
	}
	return 1; //UNK
}

// Translates one id back into its word.
//
// Positive ids are looked up in indexWord; ids <= 0 are looked up in unkTMP,
// the table senToIDs() fills with fake ids for unknown words.
//
// @param index  id to translate.
// @return       the word, or an empty string when the id is not known
//               (a positive id past the end of indexWord, or an id <= 0
//               with no entry in unkTMP).
string Vocab::getWord(int index)
{
	if (index > 0)
	{
		// Guard the upper bound: indexing past the end is undefined behaviour.
		if (static_cast<string::size_type>(index) < indexWord.size())
		{
			return indexWord[index];
		}
		return string();
	}

	// Check with count() first so a miss does not insert an empty entry.
	if (unkTMP.count(index) != 0)
	{
		return unkTMP[index];
	}
	return string();
}

int Vocab::getIndices(string words, vector<int>& indices)
{
	vector<string> vectmp;
	split(words, vectmp);
	int len = vectmp.size();
	for(int i = 0; i < len; i++)
	{
		int fdsa = wordIndex.size();
		WordIndex::iterator pos = wordIndex.find(vectmp[i]);
		if (pos != wordIndex.end()) {
			indices.push_back(pos->second);
		}
		else {
			int idex = indexWord.size();
			indexWord.push_back(vectmp[i]);
			wordIndex.insert(make_pair(vectmp[i], idex));
			indices.push_back(idex);
		}
	}
	return len;
}

// Translates a sequence of ids into a string of words joined by single spaces.
//
// Positive ids are looked up in indexWord; ids <= 0 are looked up in unkTMP.
// An id that is not known (a positive id past the end of indexWord, or an
// id <= 0 with no unkTMP entry) contributes an empty word.
//
// @param indices  ids to translate.
// @return         the joined words; an empty string when `indices` is empty.
string Vocab::getWords(vector<int> indices)
{
	string str;
	const string::size_type count = indices.size();
	for (string::size_type i = 0; i < count; i++)
	{
		// Write the separator before every word but the first, so there is
		// no trailing space to strip and an empty input is handled safely.
		if (i != 0)
		{
			str += " ";
		}

		const int id = indices[i];
		if (id > 0)
		{
			if (static_cast<string::size_type>(id) < indexWord.size())
			{
				str += indexWord[id];
			}
		}
		else if (unkTMP.count(id) != 0)
		{
			// count() first: operator[] alone would insert on a miss.
			str += unkTMP[id];
		}
	}
	return str;
}

int Vocab::senToIDs(string sen, vector<int>& senids)//将输入句子转化为ID
{
	vector<string> vectmp;
	split(sen, vectmp);
	int len = vectmp.size();
	for(int i = 0; i < len; i++)
	{
		string str = vectmp[i];
		WordIndex::iterator pos = wordIndex.find(vectmp[i]);
		if (pos != wordIndex.end()) 
		{
			senids.push_back(wordIndex[vectmp[i]]);
		}
		else
		{
			int idfake = ID--;
			senids.push_back(idfake);
			unkTMP.insert(make_pair(idfake, vectmp[i]));
		}
	}
	return len;
}

// Converts a sequence of ids back into a sentence: the words joined by
// single spaces.
//
// Positive ids are looked up in indexWord; ids <= 0 are looked up in unkTMP.
// An id that is not known (a positive id past the end of indexWord, or an
// id <= 0 with no unkTMP entry) contributes an empty word.
//
// @param senids  ids to translate.
// @return        the joined words; an empty string when `senids` is empty.
string Vocab::IDsTosen(deque<int> senids)
{
	string str;
	const string::size_type count = senids.size();
	for (string::size_type i = 0; i < count; i++)
	{
		// Write the separator before every word but the first, so there is
		// no trailing space to strip and an empty input is handled safely.
		if (i != 0)
		{
			str += " ";
		}

		const int id = senids[i];
		if (id > 0)
		{
			if (static_cast<string::size_type>(id) < indexWord.size())
			{
				str += indexWord[id];
			}
		}
		else if (unkTMP.count(id) != 0)
		{
			// count() first: operator[] alone would insert on a miss.
			str += unkTMP[id];
		}
	}
	return str;
}

// Empties unkTMP, the table that senToIDs() fills with (fake id, word)
// entries for words it could not find in wordIndex.
// NOTE(review): the static counter Vocab::ID is not reset here, so fake ids
// keep decreasing across calls — confirm this is intended.
void Vocab::mapClear()
{
	unkTMP.clear();
}







⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -