📄 hikerwbfunc.cpp
字号:
tp += itemtp.tp;
wordcnum++;
}
return tp;
}
// Decides whether `word` is acceptable when output is restricted to GB2312.
//
// Returns TRUE when:
//   - `word` is empty, or
//   - the CPT entry located by CPFind(word) has a code beginning with 'z', or
//   - the first two bytes of `word` fall in the lead range 0xB0..0xF7 and the
//     trail range 0xA1..0xFE (the GB2312 hanzi area in its EUC encoding).
// Returns FALSE otherwise.
static BOOL IsGB2312(CString word)
{
    if (word.GetLength() < 1)
        return TRUE;

    // Only dereference the table position when the lookup actually found an
    // entry; a word missing from the table falls through to the byte test.
    // NOTE(review): assumes CPFind reports "not found" with a null POSITION,
    // as CFind does elsewhere in this file -- confirm.
    POSITION pos = CPFind(word);
    if (pos != NULL)
    {
        CString code = CPT.GetAt(pos).code;
        if (!code.IsEmpty() && code[0] == 'z')
            return TRUE;
    }

    // The range test needs a lead byte and a trail byte; a one-byte word can
    // never match, so do not index past its end.
    if (word.GetLength() < 2)
        return FALSE;

    unsigned char c1 = word[0];
    unsigned char c2 = word[1];
    if (c1 >= 0xB0 && c1 <= 0xF7 && c2 >= 0xA1 && c2 <= 0xFE)
        return TRUE;
    return FALSE;
}
// Splits the raw key stream `pcs` (pcslen bytes) into separate codes.
//
//   pcs / pcslen - input bytes; a byte value of 32 (space) separates codes.
//   pcode        - output: pcode[k] receives the k-th code as a zero-padded
//                  byte string; each slot is cleared with CODELEN zero bytes
//                  before it is filled.
//   numc         - output: number of codes written.
//
// A code is closed either by a space or as soon as it reaches its maximum
// length: 4 bytes when its first byte is 'z', otherwise `maxcodelen`.
// Always returns TRUE.
//
// NOTE(review): the slot following the last code is also cleared, and nothing
// here bounds numc against the capacity of `pcode` -- callers must provide one
// spare slot beyond the largest expected code count.
BOOL SegmentCode(unsigned char* pcs, int pcslen, unsigned char** pcode, int& numc)
{
    int fill = 0;             // bytes stored so far in the code being built
    BOOL startsWithZ = FALSE; // re-evaluated at the first byte of every code

    numc = 0;
    memset(pcode[0], 0, CODELEN);

    for (int pos = 0; pos < pcslen; ++pos)
    {
        const unsigned char ch = pcs[pos];

        if (ch == 32)
        {
            // Separator: close the code in progress, if any, and clear the
            // slot the next code will use.
            if (fill > 0)
            {
                fill = 0;
                memset(pcode[numc], 0, CODELEN);
            }
            continue;
        }

        if (fill == 0)
        {
            // First byte of a new code.
            ++numc;
            startsWithZ = (ch == 'z') ? TRUE : FALSE;
        }

        pcode[numc - 1][fill++] = ch;

        const int limit = startsWithZ ? 4 : maxcodelen;
        if (fill == limit)
        {
            // Code is full: the next byte starts a fresh one.
            fill = 0;
            memset(pcode[numc], 0, CODELEN);
        }
    }
    return TRUE;
}
// Replaces every code that CFind cannot locate with a same-length run of 'z'
// characters, written in place into pcode[i].
//
//   pcode - array of zero-terminated codes, modified in place.
//   numc  - number of entries in pcode to examine.
// Always returns TRUE.
static BOOL CheckCodes(unsigned char** pcode, int numc)
{
    for (int idx = 0; idx < numc; ++idx)
    {
        CString current;
        current = pcode[idx];

        // Codes present in the table are left untouched.
        if (CFind(current) != 0)
            continue;

        // Unknown code: overwrite it with 'z' repeated to the same length.
        strcpy((char*)(pcode[idx]), (const char*)CString('z', current.GetLength()));
    }
    return TRUE;
}
// Reports whether pos2 is reached from pos1 within `nw` forward steps through
// the CWT list, i.e. whether the entry at pos1 comes before the entry at pos2.
//
//   pos1, pos2 - positions in CWT.
//   nw         - maximum number of steps to walk from pos1.
// Returns TRUE if pos2 is met within nw steps, FALSE if the walk runs off the
// end of the list, exhausts nw steps, or pos1 is null.
static BOOL CWIsBefore(POSITION pos1, POSITION pos2, int nw)
{
    // GetNext must not be called on a null position.
    if (pos1 == NULL)
        return FALSE;

    POSITION pos = pos1;
    for (int i = 0; i < nw; i++)
    {
        CWT.GetNext(pos);   // advances pos; becomes null past the tail
        if (pos == NULL)
            break;
        if (pos == pos2)
            return TRUE;
    }
    return FALSE;
}
// Returns the transition value from `word` to `wordc`.
//
//   word, wordc - the preceding word and the following word.
//   tpsum       - sum of the transition values recorded for `word`.
//   transwdnum  - number of recorded transitions for `word`.
//
// If TPFind has an entry for the pair, its stored tp is returned. Otherwise
// the value is derived from the prior of `wordc` (prior / sumwordfre): used
// as-is when `word` has no recorded transitions, or scaled by the leftover
// mass (1 - tpsum) when it has some.
static double GetTransPorb(CString word, CString wordc, double tpsum, int transwdnum)
{
    const POSITION hit = TPFind(word, wordc);
    if (hit != 0)
        return TPT.GetAt(hit).tp;

    // No recorded transition for this pair: fall back to the word prior.
    if (transwdnum == 0)
        return WT.GetAt(WFind(wordc)).prior/sumwordfre;

    // Some transitions exist for `word`; share out what they left over.
    return (1.0-tpsum)*WT.GetAt(WFind(wordc)).prior/sumwordfre;
}
// Fills the table b[i][iw] for every code i in [0, numc) and every candidate
// word iw of that code.
//
//   pcode - array of numc zero-terminated codes.
//   b     - output table, indexed [code index][candidate index].
//   numc  - number of codes.
//
// Pass 1 stores, per candidate, the cp (or cp2 when maxcodelen != 4) value of
// the CPT entry found for (word, code).
// Pass 2 walks i from numc-2 down to 0 and multiplies b[i][iw] by
//   sum over candidates iwc of code i+1 of  b[i+1][iwc] * tp(word -> wordc).
//
// Returns FALSE as soon as CFind fails for a code, TRUE otherwise.
// NOTE(review): presumably the backward half of a forward-backward pass over
// a hidden-Markov-style model -- confirm against the caller's intent.
static BOOL Backward(unsigned char** pcode, double** b, int numc)
{
// unsigned char code[CODELEN];
// unsigned char codec[CODELEN];
// unsigned char word[WORDLEN];
// unsigned char wordc[WORDLEN];
CString code;
CString codec;
CString word;
CString wordc;
POSITION ic, icc, icw, icwc, icp;
int nw, nwc;
double tp, tpsum;
int i, iw, iwc, transwdnum;
POSITION k, kc;
double sumiwb;
// Pass 1: per-candidate value taken from CPT (original note: p(yi/si)).
for (i = 0; i < numc; i++)
{
code = pcode[i];
ic = CFind(code); // position of this code's entry in CT
if (ic == 0)
return FALSE;
// Candidate count for this code: wnum in 4-key mode, wnum2 otherwise.
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
icw = CWFind(code);
// if (icw >= CWCArr.GetSize()-1)
// code = "";
// else
// code = CWCArr[icw];
// Walk nw consecutive CWT entries starting at the one CWFind returned.
k = icw;
for (iw = 0; iw < nw; iw++)
{
word = CWT.GetNext(k).word;
icp = CPFind(word, code); // NOTE(review): result is not checked before GetAt
b[i][iw] = ((maxcodelen == 4) ? CPT.GetAt(icp).cp : CPT.GetAt(icp).cp2);
}
}
// Pass 2: fold in the contribution of the following code, last-but-one first.
for (i = numc-2; i >= 0; i--)
{
code = pcode[i];
ic = CFind(code); // position of this code's entry in CT
if (ic == 0)
return FALSE;
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
icw = CWFind(code);
k = icw;
for (iw = 0; iw < nw; iw++)
{
word = CWT.GetNext(k).word;
// TPSum yields the summed transition value for `word`; transwdnum is passed
// as an argument and is presumably filled in by TPSum -- confirm.
tpsum = TPSum(word, transwdnum);
ASSERT(tpsum < 1.0);
// The following code's lookups depend only on i, yet are repeated for each iw.
// icc needs no null check here: pass 1 already returned FALSE for any code
// CFind could not locate.
codec = pcode[i+1];
icc = CFind(codec); // position of the following code's entry in CT
nwc = ((maxcodelen == 4) ? CT.GetAt(icc).wnum : CT.GetAt(icc).wnum2);
icwc = CWFind(codec);
kc = icwc;
sumiwb = 0;
for (iwc = 0; iwc < nwc; iwc++)
{
wordc = CWT.GetNext(kc).word;
tp = GetTransPorb(word, wordc, tpsum, transwdnum);
ASSERT(tp >= -1e-5); // tolerate tiny negative rounding error only
sumiwb += b[i+1][iwc] * tp;
}
b[i][iw] *= sumiwb;
}
}
return TRUE;
}
// Fills the table a[i][iw] for every code i in [0, numc) and every candidate
// word iw of that code, using the table b produced beforehand.
//
//   pcode - array of numc zero-terminated codes.
//   a     - output table, indexed [code index][candidate index].
//   b     - input table with the same indexing.
//   numc  - number of codes.
//
// For the first code: a[0][m] = b[0][m] * prior(word m) / sumwordfre.
// For each later code i and each candidate iwp of code i-1, the products
// b[i][iw] * tp(wordp -> word) are normalised by their sum over iw, weighted
// by a[i-1][iwp], and accumulated; the totals become a[i][iw].
//
// Returns FALSE if CFind fails for pcode[0] or for any pcode[i], else TRUE.
static BOOL Foreward(unsigned char** pcode, double** a, double** b, int numc)
{
POSITION ic, icp, ipp, icw, icwp;
int iw, nw, iwp, nwp, transwdnum;
double pp, tp, tpsum;
double sumaiw[DUPWORD]; // per previous candidate: sum of the products over iw
double sumaiwp[DUPWORD]; // per current candidate: accumulated result for a[i][iw]
// unsigned char *code, *codep;
// unsigned char *word, *wordp;
CString code, codep, word, wordp;
POSITION k, kp;
// First code: seed a[0][] from b[0][] and the word priors.
code = pcode[0];
ic = CFind(code); // position of this code's entry in CT
if (ic == 0)
return FALSE;
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
icw = CWFind(code);
for (int m = 0; m < nw; m++)
{
ipp = WFind(CWT.GetNext(icw).word); // position of the word's entry in WT
pp = WT.GetAt(ipp).prior/sumwordfre;
a[0][m] = b[0][m] * pp;
}
for (int i = 1; i < numc; i++)
{
code = pcode[i];
ic = CFind(code); // position of this code's entry in CT
if (ic == 0)
return FALSE;
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
icw = CWFind(code);
// Previous code. icp needs no null check: pcode[i-1] already passed the CFind
// test above, either before the loop or in the previous iteration.
codep = pcode[i-1];
icp = CFind(codep); // position of the previous code's entry in CT
nwp = ((maxcodelen == 4) ? CT.GetAt(icp).wnum : CT.GetAt(icp).wnum2);
icwp = CWFind(codep);
for (iw = 0; iw < nw; iw++)
sumaiwp[iw] = 0;
kp = icwp;
for (iwp = 0; iwp < nwp; iwp++)
{
wordp = CWT.GetNext(kp).word;
tpsum = TPSum(wordp, transwdnum);
ASSERT(tpsum < 1.0);
sumaiw[iwp] = 0;
k = icw;
for (iw = 0; iw < nw; iw++)
{
word = CWT.GetNext(k).word;
tp = GetTransPorb(wordp, word, tpsum, transwdnum);
// Sleep(0) does not affect the result.
// NOTE(review): looks like a spot to park a debugger breakpoint -- confirm.
if (tp <= 0)
Sleep(0);
// a[i][iw] is used as scratch here; its final value is assigned after the
// iwp loop below.
a[i][iw] = b[i][iw] * tp;
sumaiw[iwp] += a[i][iw];
}
for (iw = 0; iw < nw; iw++)
{
// NOTE(review): divides by sumaiw[iwp] without the zero check that is
// commented out on the next line; a zero sum would yield NaN or infinity.
//if (sumaiw[iwp] > 0) //always > 0 if tp always > 0
sumaiwp[iw] += a[i][iw]/sumaiw[iwp] * a[i-1][iwp];
}
}
// Publish the accumulated totals as the final a[i][].
for (iw = 0; iw < nw; iw++)
{
a[i][iw] = sumaiwp[iw];
}
}
return TRUE;
}
// Picks, for every code, the candidate word with the largest value in a[i][].
//
//   pcode - array of numc zero-terminated codes.
//   a     - table indexed [code index][candidate index].
//   c     - output: c[i] is the chosen candidate index for code i, or -1 when
//           the code is unknown or no candidate was admissible.
//   numc  - number of codes.
//
// A candidate is admissible when charset == 1, or when charset == 0 and
// IsGB2312(word) is TRUE. On an exact tie with the current best, the
// candidate replaces it if CWIsBefore says its CWT entry comes first.
// Always returns TRUE.
static BOOL MpmEstm(unsigned char** pcode, double** a, int* c, int numc)
{
    for (int i = 0; i < numc; i++)
    {
        c[i] = -1;

        // A POSITION is a handle, so "found" must be tested against null; the
        // previous ">= 0" test was always true and let a failed lookup reach
        // CT.GetAt.
        POSITION ic = CFind(pcode[i]);
        if (ic == NULL)
            continue;

        // Number of candidate words for this code.
        int nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
        POSITION iw = CWFind(pcode[i]);
        double maxp = -0.001;   // below any non-negative value, so the first admissible candidate wins

        for (int m = 0; m < nw; m++)
        {
            CString word = CWT.GetNext(iw).word;
            if (a[i][m] > maxp)
            {
                if (1 == charset || (0 == charset && IsGB2312(word)))
                {
                    maxp = a[i][m];
                    c[i] = m;
                }
            }
            else if (a[i][m] == maxp && c[i] >= 0)
            {
                // Exact tie with the current best. Only meaningful once a best
                // exists; otherwise c[i] is -1 and must not be used as an index.
                POSITION iw1 = CWFind(pcode[i], m);
                POSITION iw2 = CWFind(pcode[i], c[i]);
                if (CWIsBefore(iw1, iw2, nw))
                    c[i] = m;
            }
        }
    }
    return TRUE;
}
// Converts a sequence of codes into a sequence of words.
//
//   pcode - array of numc zero-terminated codes; unknown codes are rewritten
//           in place by CheckCodes.
//   numc  - number of codes; must not exceed SENTLEN.
//   pword - output: pword[i] receives the zero-terminated word chosen for
//           code i.
//
// Runs Backward, Foreward and MpmEstm over the file-level tables `a` and `b`,
// then copies the chosen word for each code. Returns FALSE if numc exceeds
// SENTLEN or any of the three stages fails, TRUE otherwise.
static BOOL Code2Word(unsigned char** pcode, int numc, unsigned char** pword)
{
    int i;
    int c[SENTLEN];   // chosen candidate index per code, -1 for "none"

    // c[] has SENTLEN slots; a longer sentence would overrun it.
    if (numc > SENTLEN)
        return FALSE;

    CheckCodes(pcode, numc);
    if (!Backward(pcode, (double**)b, numc))
        return FALSE;
    if (!Foreward(pcode, (double**)a, (double**)b, numc))
        return FALSE;
    if (!MpmEstm(pcode, (double**)a, c, numc))
        return FALSE;

    // Convert each chosen candidate index into its word.
    for (i = 0; i < numc; i++)
    {
        if (c[i] >= 0)
        {
            POSITION icw = CWFind(pcode[i], c[i]);
            strcpy((char*)pword[i], (const char*)CWT.GetAt(icw).word);
        }
        else
        {
            // No candidate chosen: emit a placeholder as long as the code.
            // NOTE(review): presumably character value 15 repeated
            // strlen(pcode[i]) times -- confirm which CString constructor is
            // selected. The original author also questioned whether this
            // branch is needed, suggesting a retry with the code set to "zz".
            strcpy((char*)pword[i], CString(15, strlen((char*)pcode[i])));
        }
    }
    return TRUE;
}
//BOOL WordID2Word(int* pwordid, int numw, unsigned char* pword, int& pwlen)
//{
// pwlen = 0;
// unsigned char word[9];
// memset(word, 0, 9);
//
// for (int i = 0; i < numw; i++)
// {
// FindWord(pwordid[i], word);
//
// for (int j = 0; word[j] != 0; j++)
// pword[pwlen++] = word[j];
// }
//
// return TRUE;
//}
// Allocates the 256 per-slot working buffers used by the conversion routines:
// a CODELEN-byte code buffer and a WORDLEN-byte word buffer per slot in
// pcode/pword, and a DUPWORD-element double row per slot in b/a.
// The buffers are released by DeleteData. Always returns TRUE.
static BOOL CreateData()
{
    // Text buffers first.
    for (int slot = 0; slot < 256; ++slot)
    {
        pcode[slot] = new unsigned char[CODELEN];
        pword[slot] = new unsigned char[WORDLEN];
    }

    // Then the numeric rows.
    for (int row = 0; row < 256; ++row)
    {
        b[row] = new double[DUPWORD];
        a[row] = new double[DUPWORD];
    }
    return TRUE;
}
// Releases the 256 per-slot buffers allocated by CreateData in pcode, pword,
// b and a. Each slot is reset to NULL after it is freed, so calling this
// function again (or before CreateData, if the slots start out null) is
// harmless: delete[] on a null pointer does nothing. Always returns TRUE.
static BOOL DeleteData()
{
    int i;
    for (i = 0; i < 256; i++)
    {
        delete [] pcode[i];
        pcode[i] = NULL;
        delete [] pword[i];
        pword[i] = NULL;
    }
    for (i = 0; i < 256; i++)
    {
        delete [] b[i];
        b[i] = NULL;
        delete [] a[i];
        a[i] = NULL;
    }
    return TRUE;
}
// Reads one 4-byte int at the cursor `lpf` into `val` and moves the cursor
// past it.
//   lpf - in/out byte cursor; advanced by 4.
//   val - receives the value, in host byte order.
// Returns the number of bytes consumed (always 4).
static int ReadInt(LPBYTE& lpf, int& val)
{
    // Fields follow variable-length strings in the data files, so the cursor
    // is not guaranteed to be int-aligned; copy the bytes rather than
    // dereferencing a cast pointer.
    memcpy(&val, lpf, 4);
    lpf += 4;
    return 4;
}
// Reads one 4-byte float at the cursor `lpf` into `val` and moves the cursor
// past it.
//   lpf - in/out byte cursor; advanced by 4.
//   val - receives the value, in host representation.
// Returns the number of bytes consumed (always 4).
static int ReadFloat(LPBYTE& lpf, float& val)
{
    // The cursor may sit at any byte offset, so copy the bytes instead of
    // dereferencing a cast pointer.
    memcpy(&val, lpf, 4);
    lpf += 4;
    return 4;
}
// Copies up to `len` bytes from the cursor `lpf` into `str` and moves the
// cursor forward by `len`.
//   lpf - in/out byte cursor; advanced by len.
//   len - number of bytes the field occupies; values <= 0 read nothing.
//   str - destination; must have room for at least len bytes.
// Returns the number of bytes consumed (len, or 0 when len <= 0).
//
// The copy uses strncpy: it stops at the first zero byte in the source and
// zero-fills the remainder, and it does not append a terminator when all len
// bytes are non-zero. Callers that need a C string must supply a buffer that
// is larger than len and already zeroed.
static int ReadStr(LPBYTE& lpf, int len, unsigned char* str)
{
    // A negative length would turn into an enormous size_t inside strncpy.
    if (len <= 0)
        return 0;

    strncpy((char*)str, (char*)lpf, len);
    lpf += len;
    return len;
}
// Writes `val` as a 4-byte int at the cursor `lpf` and moves the cursor past
// it.
//   lpf - in/out byte cursor; advanced by 4.
//   val - value to store, in host byte order.
// Returns the number of bytes produced (always 4).
static int WriteInt(LPBYTE& lpf, int val)
{
    // The cursor may sit at any byte offset, so copy the bytes instead of
    // storing through a cast pointer.
    memcpy(lpf, &val, 4);
    lpf += 4;
    return 4;
}
// Writes `val` as a 4-byte float at the cursor `lpf` and moves the cursor
// past it.
//   lpf - in/out byte cursor; advanced by 4.
//   val - value to store, in host representation.
// Returns the number of bytes produced (always 4).
static int WriteFloat(LPBYTE& lpf, float val)
{
    // The cursor may sit at any byte offset, so copy the bytes instead of
    // storing through a cast pointer.
    memcpy(lpf, &val, 4);
    lpf += 4;
    return 4;
}
// Writes a `len`-byte string field at the cursor `lpf` and moves the cursor
// forward by `len`.
//   lpf - in/out byte cursor; advanced by len.
//   str - zero-terminated source text.
//   len - size of the field in bytes; values <= 0 write nothing.
// Returns the number of bytes produced (len, or 0 when len <= 0).
//
// The copy uses strncpy: a source shorter than len is zero-padded to fill the
// field, and a source of len bytes or more is cut to len bytes with no
// terminator stored.
static int WriteStr(LPBYTE& lpf, char* str, int len)
{
    // A negative length would turn into an enormous size_t inside strncpy.
    if (len <= 0)
        return 0;

    strncpy((char*)lpf, str, len);
    lpf += len;
    return len;
}
//read mb file to arrays or lists, as well as generating indexs
static BOOL InitMB()
{
CFile filec;
CFile filew;
CFile filecw;
CFile filewc;
CFile fileww;
CFile filepy;
//CFileException e;
CString csCap, csDir;
AfxGetMainWnd()->GetWindowText(csCap);
GetModuleFileName(AfxGetInstanceHandle(), csDir.GetBuffer(MAX_PATH), MAX_PATH);
csDir.ReleaseBuffer();
csDir = csDir.Left(csDir.ReverseFind('\\'));
SetCurrentDirectory(csDir);
if (!filec.Open("..\\mb\\code.dat", CFile::modeRead) ||
!filew.Open("..\\mb\\word.dat", CFile::modeRead) ||
!filecw.Open("..\\mb\\cwmap.dat", CFile::modeRead) ||
!filewc.Open("..\\mb\\wcmap.dat", CFile::modeRead) ||
!fileww.Open("..\\mb\\wwmap.dat", CFile::modeRead) ||
!filepy.Open("..\\mb\\pinyin.dat", CFile::modeRead) )
MessageBox(NULL, "打开码表文件失败!", csCap, MB_OK|MB_ICONSTOP);
unsigned char strc[CODELEN], strw[WORDLEN], strw1[WORDLEN];
int lenc, lenw, nw, nw2, lenw1;
float cp, cp2, tp, pp; //!must be float type.
int i;
POSITION pos;
unsigned int id;
int len, filelen;
unsigned char last_strc[CODELEN], last_strw[WORDLEN];
struct _timeb time0, time1;
time_t elptime;
_ftime(&time0);
HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
HANDLE hfilepy = CreateFileMapping((HANDLE)filepy.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
LPBYTE lpfc = (LPBYTE)MapViewOfFile(hfilec, FILE_MAP_READ, 0, 0, 0);
LPBYTE lpfw = (LPBYTE)MapViewOfFile(hfilew, FILE_MAP_READ, 0, 0, 0);
LPBYTE lpfcw = (LPBYTE)MapViewOfFile(hfilecw, FILE_MAP_READ, 0, 0, 0);
LPBYTE lpfwc = (LPBYTE)MapViewOfFile(hfilewc, FILE_MAP_READ, 0, 0, 0);
LPBYTE lpfww = (LPBYTE)MapViewOfFile(hfileww, FILE_MAP_READ, 0, 0, 0);
LPBYTE lpfpy = (LPBYTE)MapViewOfFile(hfilepy, FILE_MAP_READ, 0, 0, 0);
//allocate memory for arrays and index table
CIdx.SetSize( 0x100000, 1000);
WIdx.SetSize( 0x10000, 1000);
CWIdx.SetSize(0x100000, 1000);
CPIdx.SetSize(0x10000, 1000);
TPIdx.SetSize(0x10000, 1000);
PYIdx.SetSize(0x100000, 1000);
//initialize all items in index tables with -1
for (i = 0; i < 0x100000; i++)
{
CIdx[i] = 0;
CWIdx[i] = 0;
}
for (i = 0; i < 0x10000; i++)
{
WIdx[i] = 0;
CPIdx[i] = 0;
TPIdx[i] = 0;
}
//read code file
filelen = filec.GetLength();
for (len = 0; len < filelen; )
{
memset(strc, 0, CODELEN) ;
len += ReadInt(lpfc, lenc);
if (lenc == 0)
break;
len += ReadStr(lpfc, lenc, strc);
len += ReadInt(lpfc, nw); //read number of word for this code
len += ReadInt(lpfc, nw2); //read number of word for this code
pos = CT.AddTail(CTItem(strc, nw, nw2));
//set code index
id = GetCodeId((char*)strc);
//if (CIdx[id] == 0)
CIdx[id] = pos;
}
//read word file
sumwordfre = 0.0;
//wordnum = 0;
filelen = filew.GetLength();
for (len = 0; len < filelen; )
{
memset(strw, 0, WORDLEN) ;
len += ReadInt(lpfw, lenw);
if (lenw == 0)
break;
len += ReadStr(lpfw, lenw, strw);
len += ReadFloat(lpfw, pp); //read word prior-probabilities
sumwordfre += pp;
pos = WT.AddTail(WTItem(strw, pp));
//set word index
if (lenw <= 2)
{
id = GetWordId((char*)strw);
//if (WIdx[id] == 0)
WIdx[id] = pos;
}
}
//read cwmap file
filelen = filecw.GetLength();
for (len = 0; len < filelen; )
{
memset(strc, 0, CODELEN) ;
memset(strw, 0, WORDLEN) ;
len += ReadInt(lpfcw, lenc);
if (lenc == 0)
break;
len += ReadStr(lpfcw, lenc, strc);
len += ReadInt(lpfcw, lenw);
len += ReadStr(lpfcw, lenw, strw);
pos = CWT.AddTail(CWTItem(strc, strw));
//set cwmap index
if (strcmp((char*)strc, (char*)last_strc) != 0)
{
id = GetCodeId((char*)strc);
//if (CWIdx[id] == 0)
CWIdx[id] = pos;
}
strcpy((char*)last_strc, (char*)strc);
}
//read wcmap file
filelen = filewc.GetLength();
for (len = 0; len < filelen; )
{
memset(strc, 0, CODELEN) ;
memset(strw, 0, WORDLEN) ;
len += ReadInt(lpfwc, lenw);
if (lenw == 0)
break;
len += ReadStr(lpfwc, lenw, strw);
len += ReadInt(lpfwc, lenc);
len += ReadStr(lpfwc, lenc, strc);
len += ReadFloat(lpfwc, cp); //read conditional probabilities
len += ReadFloat(lpfwc, cp2); //read conditional probabilities
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -