⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 decoder.cpp

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/** Decoder.cpp  -  Decoder class definition** Copyright (C) 2006 by Zhongjun He <zjhe@ict.ac.cn>                         Yajuan Lv <lvyajuan@ict.ac.cn>Multilingual Interaction Technology and Evaluation Laboratory, ICT, CAS* Begin       : 04/13/2006* Last Change : 04/13/2006** This program is free software; you can redistribute it and/or* modify it under the terms of the GNU Lesser General Public* License as published by the Free Software Foundation; either* version 2.1 of the License, or (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU General Public License for more details.** You should have received a copy of the GNU Lesser General Public* License along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.*/#include "Decoder.h"/************************************************************************/
/* Constructor                                                          */
/************************************************************************/
// Installs the built-in default for each tunable that this constructor
// covers; ReadConfigure() and SetArg() may overwrite them later.
// NOTE(review): lmngram receives no default here -- in the code visible in
// this file it is only assigned when the configuration contains [lm-ngram].
Decoder::Decoder()
{
	// Output destinations.
	nbestfile = "nbest.xml";
	resultfile = "result.txt";

	// Diagnostics are switched off unless requested.
	INFO = 0;
	IS_PRINT_NBEST = 0;

	// Search limits.
	NBEST_LIST = 200;
	HP_STACK_SIZE = 100;
	MAX_DISTORTION = 5;
	BP_TABLE_LIMIT = 10;
}
/************************************************************************/
/* Read configuration file                                              */
/************************************************************************/void Decoder::ReadConfigure(const char *conf)
{
	ifstream in(conf);
	if (!in)
	{
		cerr<<"open file error in Decode::Initialize()"<<endl;
		exit(1);
	}

	string line;
	while (getline(in,line))
	{
		string f1,f2;
		istrstream buffer(line.c_str());
		buffer >> f1;
		buffer >> f2;
    	
		if (f1 == "[ttable-limit]")//bp limit
			BP_TABLE_LIMIT = atoi(f2.c_str());

		if (f1 == "[stack]")//stack limit
			HP_STACK_SIZE = atoi(f2.c_str());

		if (f1 == "[nbest-list]")//Nbest size
			NBEST_LIST = atoi(f2.c_str());

		if (f1 == "[distortion]") //distortion limit: 0 for monotone search
			MAX_DISTORTION = atoi(f2.c_str());

		if (f1 == "[ttable-file]")
			bpfile = f2;  //bilingual phrase   

		if (f1 == "[lm-file]") 
			lmfile = f2;  //language model

		if(f1 == "[lm-ngram]")
			lmngram = atoi(f2.c_str());

		if (f1 == "[para]") //weight
		{
			string feat,value;
			while (getline(in,line))
			{
				istrstream b(line.c_str());
				b >> feat >> value;

				if(feat=="[end]")
					break;

				double v = atof(value.c_str());
				lambda.push_back(v);
			}
		}
		
		if (f1 == "[print-info]")
            INFO = atoi(f2.c_str());

		if (f1 == "[print-nbest]")
			IS_PRINT_NBEST = atoi(f2.c_str());

	}

}

/************************************************************************/
/* Set arguments                                                        */
/************************************************************************/
void Decoder::SetArg(int bplimit, int dis, int stack, int nbestlist, int print_nbest, int print_info,
					 const string &tf, const string &rf, const string &nf)
{
	if (bplimit > 0)
	   BP_TABLE_LIMIT = bplimit; 
	if (dis > -1)
	   MAX_DISTORTION = dis;
	if (stack > 0)
	   HP_STACK_SIZE = stack;
	if (nbestlist > 0)
	   NBEST_LIST = nbestlist;
	if (print_nbest != -1)
	   IS_PRINT_NBEST = print_nbest;
    if (print_info != -1)
	   INFO = print_info;
    if (tf.size() > 0)
	   testfile = tf;
    if (rf.size() > 0)
	   resultfile = rf;
    if (nf.size() > 0)
 	   nbestfile = nf;
	
}
/************************************************************************/
/* initialize                                                           */
/************************************************************************/
// Loads the models named by the current settings and, when INFO is set,
// opens the search log and writes its XML prologue.
void Decoder::Initialize()
{
	// Language model: file name and n-gram order come from the settings.
	lm.init(lmfile.c_str(), lmngram);

	// Phrase table: pass the weights and the table limit, then load bpfile.
	bptable.Set(lambda, BP_TABLE_LIMIT);
	bptable.ReadFromFile(bpfile.c_str());

	if (!INFO)
		return;

	// The closing </translog> tag is written by the destructor.
	logs.open("search_info.xml");
	logs << "<?xml version=\"1.0\" encoding=\"gbk\" ?> " << endl
	     << "<translog>" << endl;
}
/************************************************************************/
/* ~Decoder                                                             */
/************************************************************************/
// Closes the root element of the search log when logging (INFO) is enabled.
Decoder::~Decoder()
{
	if (!INFO)
		return;

	logs << "</translog>" << endl;
}
/************************************************************************/
/* Translate File                                                       */
/************************************************************************/
void Decoder::TranslateFile()
{
	vector<string> sents;

	ReadChinese(testfile.c_str(),sents);

	ofstream nbest;
	if(IS_PRINT_NBEST)
	{
		nbest.open(nbestfile.c_str());
		nbest<<"<?xml version=\"1.0\" encoding=\"GB2312\"?>"<<endl;
		nbest<<"<text>"<<endl;
	}

	string temp_rslt("temp.rslt");
	ofstream temp(temp_rslt.c_str());
	int i;
	for (i=0; i<sents.size(); i++)
	{
		cout << "No. " << i+1 << endl;
		cout << sents[i] << endl;

		vector<CandTrans> candidate = TranslateSent(sents[i]);
		
		vector<CandTrans>::iterator it = max_element(candidate.begin(),candidate.end());

		TrueCase((*it).english);
		temp << (*it).english << endl;
		cout << (*it).english << endl<<endl;

		if(IS_PRINT_NBEST)
		{
			sort(candidate.begin(),candidate.end(),greater<CandTrans>());

			nbest << "<sent No=\"" << i+1 << "\" nbest=\"" << candidate.size() <<"\">" << endl;
			nbest << "<chinese>" << sents[i] << "</chinese>" << endl;
			for (int j=0; j<candidate.size(); j++)
			{
				nbest << "<candidate No=\"" << j+1 << "\">" << endl;
				TrueCase(candidate[j].english);
				candidate[j].Show(nbest, 0);
				nbest << "</candidate>" << endl;
			}
			nbest << "</sent>" << endl;
		}
	}

	if (IS_PRINT_NBEST)
	{
		nbest<<"</text>"<<endl;
	}

	ChangeFormatTo863(testfile.c_str(), temp_rslt.c_str(), resultfile.c_str());

	temp.close();
	unlink(temp_rslt.c_str());
}
/************************************************************************/
/* Translate a sentence                                                 */
/************************************************************************/
// Translates one sentence and returns its candidate translations.
//
// Steps: (1) collect translation options for every source span accepted by
// the PHRASE_LEN check, (2) compute future costs, (3) run the beam search,
// (4) extract the 1-best or n-best list depending on NBEST_LIST. When
// NBEST_LIST is below 1 no extraction takes place and the candidate list of
// the sentence pair is returned as it stands.
// With INFO set, the source words, the translation options and the future
// costs are written to the search log.
vector<CandTrans> Decoder::TranslateSent(const string &chinese)
{
	SentPair sp(chinese);
	const int wordCount = (int)sp.ChineseWord.size();

	if (INFO)
	{
		logs << "<srcsent>";
		for (int w = 0; w < wordCount; w++)
			logs << sp.ChineseWord[w] << " ";
		logs << "</srcsent>" << endl;
	}

	//step1: search translation options for chinese sentence
	for (int first = 0; first < wordCount; first++)
	{
		// Grow the phrase one word at a time: after appending word `last`
		// it holds exactly the words first..last.
		vector<string> phrase;
		for (int last = first; last < wordCount; last++)
		{
			if ( (last - first) > PHRASE_LEN )
				break;

			phrase.push_back(sp.ChineseWord[last]);

			TransMap options;
			SearchPhrase(phrase, options);

			// Only spans that have at least one option are recorded.
			if (!options.empty())
				sp.transoption[make_pair(first, last)] = options;
		}
	}
	if (INFO)
		sp.ShowTransOption(logs);

	//step2: compute future cost
	ComputeFutureCost(sp.ChineseWord.size(), sp.transoption, sp.FutureCost);
	if (INFO)
		sp.ShowFutureCost(logs);

	//step3: beam search
	vector<vector<Hypotheses> > HpStack;
	BeamSearch(sp.ChineseWord.size(), sp.transoption, HpStack, sp.FutureCost);

	//step4: generate Nbest_list
	if (NBEST_LIST > 1)
		GenerateNbest(HpStack, sp.CandidateTranslation);
	else if (NBEST_LIST == 1)
		Generate1best(HpStack, sp.CandidateTranslation);

	return sp.CandidateTranslation;
}

/************************************************************************/
/* Search possible translations for a Chinese phrase                    */
/************************************************************************/
// Collects the translation options of one source phrase into `transoption`.
//
// The words of `phrase` are concatenated without separators and looked up in
// the phrase table. Two single-word cases get a smoothed entry keyed by the
// word itself instead:
//   * no byte of the word except the last has its high bit set (the original
//     code labels this case "English"): the table is not consulted at all;
//   * the table has no entry for the word.
// The smoothed entry consists of prob_num * log(PROB_SMOOTH), then prob_num
// copies of log(PROB_SMOOTH), then 1.0 (commented "word length" in the
// original code).
//
// phrase      : words of the source phrase, in order.
// transoption : receives the options; entries already present are kept unless
//               the same key is written again.
void Decoder::SearchPhrase(const vector<string> &phrase, TransMap &transoption)
{
	//Chinese phrase (without blank between words)
	string s;
	for (size_t w = 0; w < phrase.size(); w++)
		s += phrase[w];

	const bool singleWord = (phrase.size() == 1);

	// The last byte is deliberately left out of the scan, as in the original
	// loop bound. Writing the bound as `k + 1 < size` keeps it correct for an
	// empty token, where `size() - 1` would wrap around to a huge unsigned
	// value and the scan would run off the end of the string.
	bool hasHighBit = false;
	if (singleWord)
	{
		for (size_t k = 0; k + 1 < s.size(); k++)
		{
			if (static_cast<unsigned char>(s[k]) & 0x80)
			{
				hasHighBit = true;
				break;
			}
		}
	}

	bool needSmoothing;
	if (singleWord && !hasHighBit)
		needSmoothing = true;	// passed through without a table lookup
	else
		needSmoothing = !bptable.GetTranslations(s,transoption) && singleWord;

	if (needSmoothing)
	{
		const double logSmooth = log(PROB_SMOOTH);

		vector<double> smooth;
		smooth.push_back(bptable.prob_num * logSmooth);
		for (int f = 1; f < bptable.prob_num + 1; f++)
			smooth.push_back(logSmooth);
		smooth.push_back(1.0);// word length

		transoption[s] = smooth;
	}
}

/************************************************************************/
/* Compute Future Cost                                                  */
/************************************************************************/
void Decoder::ComputeFutureCost(int WordLen, const map<pair<int,int>, TransMap> &TransOption, 
								map<pair<int,int> ,double> &FutureCost)
{
	map<pair<int,int>, TransMap>::const_iterator it;

	int i,j,len;

	//initialization
	for (i = 0; i < WordLen; i++)
		for (j = i; j < WordLen; j++)
		{
			pair<int,int> pp = make_pair(i,j);
			FutureCost[pp] = INT_MIN;
		}

	for (it=TransOption.begin(); it!=TransOption.end(); it++)
	{
		TransMap::const_iterator it2;
		double MaxP = INT_MIN;  

		for (it2=(*it).second.begin(); it2!=(*it).second.end(); it2++)
		{
			double TransProb = 0.0;
			double lmprob = lm.getLMProb("", (*it2).first, lmngram); //Language Model

			for (i=1; i < bptable.prob_num + 2 ; i++) //translation probability and length
			{
				TransProb += (*it2).second[i] * lambda[i-1];  
			}

			TransProb += lmprob * lambda[bptable.prob_num+1]; //Language Model

			if (TransProb > MaxP)
				 MaxP = TransProb;
		}

		FutureCost[(*it).first]  = MaxP;
	}
	
	for (len = 1; len < WordLen ; len ++)
	{
		for (i = 0; i < WordLen - len ; i++)
		{
			double MaxP = INT_MIN;
			pair<int,int> pp1,pp2;
			for ( j = i; j < i + len; j++)
			{			
				pp1 = make_pair(i,j);
				pp2 = make_pair(j+1,i + len);
			           
				double TransProb = FutureCost[pp1] + FutureCost[pp2];
				if(TransProb > MaxP)
					MaxP = TransProb;
			}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -