📄 decoder.cpp
字号:
/** Decoder.cpp - Decoder class definition** Copyright (C) 2006 by Zhongjun He <zjhe@ict.ac.cn> Yajuan Lv <lvyajuan@ict.ac.cn>Multilingual Interaction Technology and Evaluation Laboratory, ICT, CAS* Begin : 04/13/2006* Last Change : 04/13/2006** This program is free software; you can redistribute it and/or* modify it under the terms of the GNU Lesser General Public* License as published by the Free Software Foundation; either* version 2.1 of the License, or (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the* GNU General Public License for more details.** You should have received a copy of the GNU Lesser General Public* License along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include "Decoder.h"/************************************************************************/
/* construction function */
/************************************************************************/
// Installs the built-in defaults for the search limits, the output
// switches and the output file names. ReadConfigure() and SetArg() can
// replace these values afterwards.
Decoder::Decoder()
{
    // Output destinations.
    nbestfile = "nbest.xml";
    resultfile = "result.txt";

    // Output switches; both are tested as booleans elsewhere, 0 = off.
    INFO = 0;
    IS_PRINT_NBEST = 0;

    // Search limits.
    NBEST_LIST = 200;
    HP_STACK_SIZE = 100;
    MAX_DISTORTION = 5;
    BP_TABLE_LIMIT = 10;
}
/************************************************************************/
/* Read Configure file */
/************************************************************************/
// Reads decoder settings from the configuration file `conf`.
//
// Every line is split on whitespace into a key and a value. Recognised keys:
//   [ttable-limit] [stack] [nbest-list] [distortion]   integer limits
//   [ttable-file]  [lm-file]                           file names
//   [lm-ngram]                                         integer stored in lmngram
//   [print-info]   [print-nbest]                       integer switches
//   [para]         opens a block of "<name> <value>" lines; each value is
//                  appended to `lambda` in file order until a line whose
//                  first token is [end] (or the end of the file).
// Lines with any other key are ignored.
//
// Terminates the process with exit(1) when `conf` is null or the file
// cannot be opened.
void Decoder::ReadConfigure(const char *conf)
{
    ifstream in;
    if (conf)
        in.open(conf);
    if (!in.is_open())
    {
        cerr << "open file error in Decoder::ReadConfigure(): "
             << (conf ? conf : "(null)") << endl;
        exit(1);
    }

    string line;
    while (getline(in, line))
    {
        string key, val;
        istrstream buffer(line.c_str());
        buffer >> key;
        buffer >> val;

        if (key == "[ttable-limit]")        // bp limit
            BP_TABLE_LIMIT = atoi(val.c_str());
        else if (key == "[stack]")          // stack limit
            HP_STACK_SIZE = atoi(val.c_str());
        else if (key == "[nbest-list]")     // Nbest size
            NBEST_LIST = atoi(val.c_str());
        else if (key == "[distortion]")     // distortion limit: 0 for monotone search
            MAX_DISTORTION = atoi(val.c_str());
        else if (key == "[ttable-file]")    // bilingual phrase
            bpfile = val;
        else if (key == "[lm-file]")        // language model
            lmfile = val;
        else if (key == "[lm-ngram]")
            lmngram = atoi(val.c_str());
        else if (key == "[print-info]")
            INFO = atoi(val.c_str());
        else if (key == "[print-nbest]")
            IS_PRINT_NBEST = atoi(val.c_str());
        else if (key == "[para]")           // weight
        {
            while (getline(in, line))
            {
                // Fresh strings for every line: a failed extraction must not
                // leave the previous line's tokens behind, otherwise a blank
                // line would push the preceding weight a second time.
                string feat, value;
                istrstream b(line.c_str());
                b >> feat >> value;
                if (feat == "[end]")
                    break;
                if (feat.empty())   // blank line inside the block
                    continue;
                // A name without a value yields atof("") == 0.0, which keeps
                // the positions of the following weights unchanged.
                lambda.push_back(atof(value.c_str()));
            }
        }
    }
}
/************************************************************************/
/* Set arguments */
/************************************************************************/
// Overrides the current settings with the supplied arguments. An argument
// only takes effect when it carries a usable value:
//   bplimit, stack, nbestlist   must be greater than 0
//   dis                         must be 0 or greater
//   print_nbest, print_info     any value except -1
//   tf, rf, nf                  any non-empty string
// Everything else keeps the value it had before the call.
void Decoder::SetArg(int bplimit, int dis, int stack, int nbestlist, int print_nbest, int print_info,
                     const string &tf, const string &rf, const string &nf)
{
    // File names: test input, result output, n-best output.
    if (!nf.empty())
        nbestfile = nf;
    if (!rf.empty())
        resultfile = rf;
    if (!tf.empty())
        testfile = tf;

    // Output switches; -1 marks "not given".
    if (print_info != -1)
        INFO = print_info;
    if (print_nbest != -1)
        IS_PRINT_NBEST = print_nbest;

    // Search limits.
    if (nbestlist >= 1)
        NBEST_LIST = nbestlist;
    if (stack >= 1)
        HP_STACK_SIZE = stack;
    if (dis >= 0)
        MAX_DISTORTION = dis;
    if (bplimit >= 1)
        BP_TABLE_LIMIT = bplimit;
}
/************************************************************************/
/* initialize */
/************************************************************************/
// Prepares the decoder for translation: initialises the language model
// from lmfile/lmngram, hands the weights and the table limit to the phrase
// table and loads it from bpfile. When INFO is set, the search log
// "search_info.xml" is opened and its XML header written.
void Decoder::Initialize()
{
    // Language model.
    lm.init(lmfile.c_str(), lmngram);

    // Bilingual phrase table.
    bptable.Set(lambda, BP_TABLE_LIMIT);
    bptable.ReadFromFile(bpfile.c_str());

    if (!INFO)
        return;

    // Search log; the root element is closed again in ~Decoder().
    logs.open("search_info.xml");
    logs << "<?xml version=\"1.0\" encoding=\"gbk\" ?> " << endl
         << "<translog>" << endl;
}
/************************************************************************/
/* ~Decoder */
/************************************************************************/
// Writes the closing </translog> tag to the search log when INFO is set;
// otherwise does nothing.
Decoder::~Decoder()
{
    if (!INFO)
        return;

    logs << "</translog>" << endl;
}
/************************************************************************/
/* Translate File */
/************************************************************************/
void Decoder::TranslateFile()
{
vector<string> sents;
ReadChinese(testfile.c_str(),sents);
ofstream nbest;
if(IS_PRINT_NBEST)
{
nbest.open(nbestfile.c_str());
nbest<<"<?xml version=\"1.0\" encoding=\"GB2312\"?>"<<endl;
nbest<<"<text>"<<endl;
}
string temp_rslt("temp.rslt");
ofstream temp(temp_rslt.c_str());
int i;
for (i=0; i<sents.size(); i++)
{
cout << "No. " << i+1 << endl;
cout << sents[i] << endl;
vector<CandTrans> candidate = TranslateSent(sents[i]);
vector<CandTrans>::iterator it = max_element(candidate.begin(),candidate.end());
TrueCase((*it).english);
temp << (*it).english << endl;
cout << (*it).english << endl<<endl;
if(IS_PRINT_NBEST)
{
sort(candidate.begin(),candidate.end(),greater<CandTrans>());
nbest << "<sent No=\"" << i+1 << "\" nbest=\"" << candidate.size() <<"\">" << endl;
nbest << "<chinese>" << sents[i] << "</chinese>" << endl;
for (int j=0; j<candidate.size(); j++)
{
nbest << "<candidate No=\"" << j+1 << "\">" << endl;
TrueCase(candidate[j].english);
candidate[j].Show(nbest, 0);
nbest << "</candidate>" << endl;
}
nbest << "</sent>" << endl;
}
}
if (IS_PRINT_NBEST)
{
nbest<<"</text>"<<endl;
}
ChangeFormatTo863(testfile.c_str(), temp_rslt.c_str(), resultfile.c_str());
temp.close();
unlink(temp_rslt.c_str());
}
/************************************************************************/
/* Translate a sentence */
/************************************************************************/
vector<CandTrans> Decoder::TranslateSent(const string &chinese)
{
SentPair sp(chinese);
if (INFO)
{
logs<<"<srcsent>";
copy(sp.ChineseWord.begin(),sp.ChineseWord.end(),ostream_iterator<string>(logs," "));
logs<<"</srcsent>"<<endl;
}
//step1: search translation options for chinese sentence
int i=0, j=0;
for (i=0; i<sp.ChineseWord.size(); i++)
{
for (j=i; j<sp.ChineseWord.size(); j++)
{
if ( (j-i) > PHRASE_LEN )
break;
vector<string> phrase;
for(int k=i; k<=j; k++)
phrase.push_back(sp.ChineseWord[k]);
TransMap transoption;
SearchPhrase(phrase, transoption);
if (transoption.size() > 0)
{
pair<int,int> pp=make_pair(i,j);
sp.transoption[pp] = transoption;
}
}
}
if (INFO)
{
sp.ShowTransOption(logs);
}
//step2: compute future cost
ComputeFutureCost(sp.ChineseWord.size(),sp.transoption,sp.FutureCost);
if(INFO)
{
sp.ShowFutureCost(logs);
}
//step3: beam search
vector<vector<Hypotheses> > HpStack;
BeamSearch(sp.ChineseWord.size(), sp.transoption, HpStack, sp.FutureCost);
//step4: generate Nbest_list
if (NBEST_LIST == 1)
Generate1best(HpStack,sp.CandidateTranslation);
else if(NBEST_LIST > 1)
GenerateNbest(HpStack,sp.CandidateTranslation);
return sp.CandidateTranslation;
}
/************************************************************************/
/* Search possible translations for a Chinese phrase */
/************************************************************************/
// Builds the score vector assigned to a single word that is kept as its own
// translation: slot 0 holds probNum * log(PROB_SMOOTH), the next probNum
// slots each hold log(PROB_SMOOTH), and the last slot holds 1.0.
static vector<double> MakeSmoothedScores(int probNum)
{
    const double logSmooth = log(PROB_SMOOTH);

    vector<double> scores;
    scores.push_back(probNum * logSmooth);
    for (int k = 0; k < probNum; k++)
    {
        scores.push_back(logSmooth);
    }
    scores.push_back(1.0); // word length
    return scores;
}

// Collects the translation options for a source phrase into `transoption`.
//
// The words of `phrase` are concatenated without separators and looked up
// in the phrase table. Single-word phrases get special treatment:
//   - a word with no high-bit byte (checked on all bytes but the last) is
//     not looked up at all; it is entered as its own translation with
//     smoothed scores;
//   - a word the phrase table does not know is entered the same way.
// Multi-word phrases without a table entry leave `transoption` untouched.
void Decoder::SearchPhrase(const vector<string> &phrase, TransMap &transoption)
{
    // Chinese phrase (without blank between words)
    string s;
    for (size_t i = 0; i < phrase.size(); i++)
    {
        s += phrase[i];
    }

    const bool singleWord = (phrase.size() == 1);

    if (singleWord) // if is English or not
    {
        bool ishan = false;
        // Written as `i + 1 < size()` rather than `i < size() - 1`: the
        // latter wraps around for an empty string and reads out of bounds.
        // The last byte is left out of the scan exactly as before.
        // NOTE(review): presumably because the lead byte of a double-byte
        // character is never the last byte - confirm.
        for (size_t i = 0; i + 1 < s.size(); i++)
        {
            if (static_cast<unsigned char>(s[i]) & 0x80)
            {
                ishan = true;
                break;
            }
        }
        if (!ishan)
        {
            transoption[s] = MakeSmoothedScores(bptable.prob_num);
            return;
        }
    }

    const bool istrans = bptable.GetTranslations(s, transoption);
    if (!istrans && singleWord) // Chinese word
    {
        transoption[s] = MakeSmoothedScores(bptable.prob_num);
    }
}
/************************************************************************/
/* Compute Future Cost */
/************************************************************************/
void Decoder::ComputeFutureCost(int WordLen, const map<pair<int,int>, TransMap> &TransOption,
map<pair<int,int> ,double> &FutureCost)
{
map<pair<int,int>, TransMap>::const_iterator it;
int i,j,len;
//initialization
for (i = 0; i < WordLen; i++)
for (j = i; j < WordLen; j++)
{
pair<int,int> pp = make_pair(i,j);
FutureCost[pp] = INT_MIN;
}
for (it=TransOption.begin(); it!=TransOption.end(); it++)
{
TransMap::const_iterator it2;
double MaxP = INT_MIN;
for (it2=(*it).second.begin(); it2!=(*it).second.end(); it2++)
{
double TransProb = 0.0;
double lmprob = lm.getLMProb("", (*it2).first, lmngram); //Language Model
for (i=1; i < bptable.prob_num + 2 ; i++) //translation probability and length
{
TransProb += (*it2).second[i] * lambda[i-1];
}
TransProb += lmprob * lambda[bptable.prob_num+1]; //Language Model
if (TransProb > MaxP)
MaxP = TransProb;
}
FutureCost[(*it).first] = MaxP;
}
for (len = 1; len < WordLen ; len ++)
{
for (i = 0; i < WordLen - len ; i++)
{
double MaxP = INT_MIN;
pair<int,int> pp1,pp2;
for ( j = i; j < i + len; j++)
{
pp1 = make_pair(i,j);
pp2 = make_pair(j+1,i + len);
double TransProb = FutureCost[pp1] + FutureCost[pp2];
if(TransProb > MaxP)
MaxP = TransProb;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -