// File: merge_lm.cpp
// (Source-viewer residue: "Font size:" control; not part of the program.)
// merge_lm.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "MyNgram.h"
#include <string>
#include <vector>
using namespace std;
// Raw option values, filled in by parse_option_file() from "key=value" lines.
// "in_arpa=" value: comma-separated list of input LM files (split in _tmain).
string g_in_arpa;
// "out_arpa=" value: path the merged LM is written to.
string g_out_arpa;
// "out_vocab=" value: path the merged LM's vocabulary is written to.
string g_out_vocab;
// "order=" value: n-gram order as text; _tmain accepts only 2 or 3.
string g_order;
// "lambda=" value: comma-separated weights, one per entry of g_in_arpa.
string g_lambda;
// Returns the directory part of a backslash-separated path (static buffer).
const char* GetDirFromPath(const char* szPathName);
// Reads the LM in `filename` and mixes it into oldLM (or returns it if oldLM is null).
LM * makeMixLM(const char *filename, Vocab &vocab, unsigned order, LM *oldLM, double lambda1, double lambda2);
// Loads the option file into the g_* globals; false if it cannot be opened or a value is missing.
bool parse_option_file(const char* option_file);
// Splits a comma-separated list into its non-empty items.
//
// Every space and tab in `str` is dropped (not just those around the commas),
// and empty items produced by leading, trailing or doubled commas are skipped.
// Implemented with the standard library only, so it no longer depends on
// CString.
//
// str : the comma-separated list.
// out : cleared first, then receives the items in their original order.
// Returns true in all cases (kept for compatibility with existing callers).
bool split(std::string str, std::vector<std::string>& out)
{
    out.clear();

    std::string item;
    for (std::string::size_type pos = 0; pos < str.size(); ++pos)
    {
        const char ch = str[pos];
        if (ch == ' ' || ch == '\t')
        {
            continue;   // blanks never belong to an item
        }
        if (ch != ',')
        {
            item += ch;
            continue;
        }
        // A comma closes the current item; empty items are not reported.
        if (!item.empty())
        {
            out.push_back(item);
            item.clear();
        }
    }

    // The last item has no comma after it.
    if (!item.empty())
    {
        out.push_back(item);
    }
    return true;
}
int _tmain(int argc, _TCHAR* argv[])
{
if (argc != 2)
{
printf("Usage: merge_lm cfg_file!\n");
exit(-1);
}
parse_option_file( argv[1] );
vector<string> in_lms;
vector<string> in_lambdas;
unsigned order = (unsigned)atoi(g_order.c_str());
split(g_in_arpa, in_lms);
split(g_lambda, in_lambdas);
if ( in_lambdas.size() != in_lms.size() || (order != 2 && order != 3 ))
{
printf("config file %s format error!", argv[1]);
exit(-1);
}
for(int i = 0 ; i < (int)in_lms.size() ; i++)
{
printf("%s\t%s\n", in_lms[i].c_str(), in_lambdas[i].c_str());
}
const char *lmFile1 = in_lms[0].c_str() ;
double lambda = atof(in_lambdas[0].c_str());
Vocab *vocab ;
Ngram *ngramLM ;
vocab = new Vocab; assert(vocab != 0); vocab->remove(vocab->unkIndex()); vocab->remove(vocab->pauseIndex()); vocab->unkIndex() = vocab->addWord("<UNK>");
ngramLM = new MyNgram(*vocab, order); assert(ngramLM != 0); File file(lmFile1, "r"); if (!ngramLM->read(file)) { cerr << "format error in lm file\n"; exit(1); }
const char *lmFile2 ;
double mixLambda = lambda; for(int i = 1 ; i < (int)in_lms.size(); i++) { lmFile2 = in_lms[i].c_str(); double d = atof(in_lambdas[i].c_str()); mixLambda += d; ngramLM = (Ngram *)makeMixLM(lmFile2, *vocab, order, ngramLM, d, mixLambda); }
ngramLM->write(File(g_out_arpa.c_str(), "w"));
ngramLM->vocab.write(File(g_out_vocab.c_str(), "w"));
delete vocab;
delete ngramLM;
return 0;
}
// Returns the directory part of a backslash-separated path.
//
// The result is everything before the last '\\' in szPathName, or "." when the
// path contains no backslash (or is null). A path whose only backslash is its
// first character yields an empty string.
//
// The result lives in a static buffer: it is overwritten by the next call and
// must not be freed. Directory parts that do not fit are truncated to the
// buffer capacity instead of overrunning it.
const char* GetDirFromPath(const char* szPathName)
{
    static char szPath[1024];

    const char* lastSep = (szPathName != 0) ? strrchr(szPathName, '\\') : 0;
    if (lastSep == 0)
    {
        strcpy(szPath, ".");
        return szPath;
    }

    // Clamp so that both the copy and the terminator stay inside szPath.
    size_t dirLen = (size_t)(lastSep - szPathName);
    if (dirLen > sizeof(szPath) - 1)
    {
        dirLen = sizeof(szPath) - 1;
    }
    strncpy(szPath, szPathName, dirLen);
    szPath[dirLen] = 0;
    return szPath;
}
//lambda1为要mix的filename的权重,lambda2为2个lm的权重和LM * makeMixLM(const char *filename, Vocab &vocab, unsigned order, LM *oldLM, double lambda1, double lambda2){ File file(filename, "r"); /* * create factored LM if -factored was specified, * class-ngram if -classes were specified, * and otherwise a regular ngram */ MyNgram *lm = new MyNgram(vocab, order); assert(lm != 0); if (!lm->read(file)) { cerr << "format error in second-lm file " << filename << endl; exit(1); } /* * Compute mixture lambda (make sure 0/0 = 0) */ Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2; if (oldLM == 0) { return lm; } else { /* * static mixture */ ((MyNgram *)oldLM)->mixProbs(*lm, 1-lambda,true); //((MyNgram *)oldLM)->mixProbs(*lm, 1-lambda,false); delete lm; return oldLM; }}bool parse_option_file(const char* option_file)
{
FILE* fp = fopen(option_file, "rt");
if ( !fp )
{
cout << option_file << " open for error!" << endl ;
return false;
}
CString csTemp;
char szBuffer[1024];
while ( fgets(szBuffer, 1024, fp) )
{
csTemp = szBuffer;
csTemp.TrimRight();
csTemp.TrimLeft();
csTemp.Replace(" ", "");
csTemp.Replace("\t", "");
if ( csTemp.Mid(0, 8) == "in_arpa=" )
{
g_in_arpa = csTemp.Mid(8);
continue;
}
if ( csTemp.Mid(0, 9) == "out_arpa=" )
{
g_out_arpa = csTemp.Mid(9);
continue;
}
if ( csTemp.Mid(0, 6) == "order=" )
{
g_order = csTemp.Mid(6);
continue;
}
if ( csTemp.Mid(0, 10) == "out_vocab=" )
{
g_out_vocab = csTemp.Mid(10);
continue;
}
if ( csTemp.Mid(0, 7) == "lambda=" )
{
g_lambda = csTemp.Mid(7);
continue;
}
}
fclose(fp);
if ( g_in_arpa.empty() || g_order.empty() || g_out_arpa.empty() || g_out_vocab.empty() || g_lambda.empty() )
{
printf("some values in %s is empty", option_file);
return false;
}
return true;
}
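// --- Source-viewer residue (keyboard shortcut help); not part of the program ---
// Copy code:        Ctrl + C
// Search code:      Ctrl + F
// Full-screen mode: F11
// Switch theme:     Ctrl + Shift + D
// Show shortcuts:   ?
// Larger font:      Ctrl + =
// Smaller font:     Ctrl + -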