📄 hikerwbfunc.cpp
字号:
pos = CPT.AddTail(CPTItem(strw, strc, cp, cp2));
//set wcmap index
if (strncmp((char*)strw, (char*)last_strw, 2) != 0)
{
id = GetWordId((char*)strw);
//if (CPIdx[id] == 0)
CPIdx[id] = pos;
}
strcpy((char*)last_strw, (char*)strw);
}
//read wwmap file
filelen = fileww.GetLength();
for (len = 0; len < filelen; )
{
memset(strw, 0, CODELEN) ;
memset(strw1, 0, WORDLEN) ;
len += ReadInt(lpfww, lenw);
if (lenw == 0)
break;
len += ReadStr(lpfww, lenw, strw);
len += ReadInt(lpfww, lenw1);
len += ReadStr(lpfww, lenw1, strw1);
len += ReadFloat(lpfww, tp); //read transition probabilities
pos = TPT.AddTail(TPTItem(strw, strw1, tp));
if (strncmp((char*)strw, (char*)last_strw, 2) != 0)
{
id = GetWordId((char*)strw);
//if (TPIdx[id] == 0)
TPIdx[id] = pos;
}
strcpy((char*)last_strw, (char*)strw);
}
//read pinyin file
filelen = filepy.GetLength();
for (len = 0; len < filelen; )
{
memset(strc, 0, CODELEN) ;
memset(strw, 0, WORDLEN) ;
len += ReadInt(lpfpy, lenc);
if (lenc == 0)
break;
len += ReadStr(lpfpy, lenc, strc);
len += ReadStr(lpfpy, 2, strw);
pos = PYT.AddTail(PYTItem(strc, strw));
//set cwmap index
if (strcmp((char*)strc, (char*)last_strc) != 0)
{
id = GetCodeId((char*)strc);
//if (PYIdx[id] == 0)
PYIdx[id] = pos;
}
strcpy((char*)last_strc, (char*)strc);
}
UnmapViewOfFile(lpfc);
UnmapViewOfFile(lpfw);
UnmapViewOfFile(lpfcw);
UnmapViewOfFile(lpfwc);
UnmapViewOfFile(lpfww);
UnmapViewOfFile(lpfpy);
CloseHandle(hfilec);
CloseHandle(hfilew);
CloseHandle(hfilecw);
CloseHandle(hfilewc);
CloseHandle(hfileww);
CloseHandle(hfilepy);
filec.Close();
filew.Close();
filecw.Close();
filewc.Close();
fileww.Close();
filepy.Close();
_ftime(&time1);
elptime = time1.time*1000+time1.millitm - time0.time*1000+time0.millitm;
return TRUE;
}
//read mb file to arrays or lists, as well as generating indexs
static BOOL SaveMB()
{
CFile filec;
CFile filew;
CFile filecw;
CFile filewc;
CFile fileww;
//CFileException e;
CString csCap;
AfxGetMainWnd()->GetWindowText(csCap);
if (!filec.Open("..\\mb\\code.dat", CFile::modeReadWrite) ||
!filew.Open("..\\mb\\word.dat", CFile::modeReadWrite) ||
!filecw.Open("..\\mb\\cwmap.dat", CFile::modeReadWrite) ||
!filewc.Open("..\\mb\\wcmap.dat", CFile::modeReadWrite) ||
!fileww.Open("..\\mb\\wwmap.dat", CFile::modeReadWrite))
MessageBox(NULL, "打开码表文件失败!", csCap, MB_OK|MB_ICONSTOP);
CString strc, strw, strw1;
int lenc, lenw, nw, nw2, lenw1;
float cp, cp2, tp, pp; //!must be float type.
POSITION pos;
CTItem itemc;
WTItem itemw;
CWTItem itemcw;
CPTItem itemcp;
TPTItem itemtp;
DWORD lenfl, lenfh, lenfc, lenfw, lenfcw, lenfwc, lenfww;
// lenfl = GetFileSize((HANDLE)(filec.m_hFile), &lenfh);
// HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+1000, 0);
// lenfl = GetFileSize((HANDLE)(filew.m_hFile), &lenfh);
// HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
// lenfl = GetFileSize((HANDLE)(filecw.m_hFile), &lenfh);
// HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
// lenfl = GetFileSize((HANDLE)(filewc.m_hFile), &lenfh);
// HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
// lenfl = GetFileSize((HANDLE)(fileww.m_hFile), &lenfh);
// HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+1000000, 0);
//calculate new file length
lenfc = 0;
for (pos = CT.GetHeadPosition(); pos != NULL; )
{
lenfc += CT.GetNext(pos).code.GetLength();
lenfc += 12;
}
lenfw = 0;
for (pos = WT.GetHeadPosition(); pos != NULL; )
{
lenfw += WT.GetNext(pos).word.GetLength();
lenfw += 8;
}
lenfcw = 0;
for (pos = CWT.GetHeadPosition(); pos != NULL; )
{
itemcw = CWT.GetNext(pos);
lenfcw += itemcw.code.GetLength();
lenfcw += itemcw.word.GetLength();
lenfcw += 8;
}
lenfwc = 0;
for (pos = CPT.GetHeadPosition(); pos != NULL; )
{
itemcp = CPT.GetNext(pos);
lenfwc += itemcp.word.GetLength();
lenfwc += itemcp.code.GetLength();
lenfwc += 16;
}
lenfww = 0;
for (pos = TPT.GetHeadPosition(); pos != NULL; )
{
itemtp = TPT.GetNext(pos);
lenfww += itemtp.word.GetLength();
lenfww += itemtp.wordc.GetLength();
lenfww += 12;
}
HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READWRITE, 0, lenfc, 0);
HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READWRITE, 0, lenfw, 0);
HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READWRITE, 0, lenfcw, 0);
HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READWRITE, 0, lenfwc, 0);
HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READWRITE, 0, lenfww, 0);
LPBYTE lpfc = (LPBYTE)MapViewOfFile(hfilec, FILE_MAP_WRITE, 0, 0, 0);
LPBYTE lpfw = (LPBYTE)MapViewOfFile(hfilew, FILE_MAP_WRITE, 0, 0, 0);
LPBYTE lpfcw = (LPBYTE)MapViewOfFile(hfilecw, FILE_MAP_WRITE, 0, 0, 0);
LPBYTE lpfwc = (LPBYTE)MapViewOfFile(hfilewc, FILE_MAP_WRITE, 0, 0, 0);
LPBYTE lpfww = (LPBYTE)MapViewOfFile(hfileww, FILE_MAP_WRITE, 0, 0, 0);
struct _timeb time0, time1;
time_t elptime;
_ftime(&time0);
try
{
//write Code table
for (pos = CT.GetHeadPosition(); pos != NULL; )
{
itemc = CT.GetNext(pos);
strc = itemc.code;
nw = itemc.wnum;
nw2 = itemc.wnum2;
lenc = strc.GetLength();
WriteInt(lpfc, lenc);
WriteStr(lpfc, (LPSTR)(LPCSTR)strc, lenc);
WriteInt(lpfc, nw);
WriteInt(lpfc, nw2);
//
// filec.Write(&lenc, 4);
// filec.Write(strc, lenc);
// filec.Write(&nw, 4);
// filec.Write(&nw2, 4);
}
}
catch(...)
{
MessageBox(0, "保存码表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
}
try
{
//write word table
for (pos = WT.GetHeadPosition(); pos != NULL; )
{
itemw = WT.GetNext(pos);
strw = itemw.word;
pp = itemw.prior;
lenw = strw.GetLength();
WriteInt(lpfw, lenw);
WriteStr(lpfw, (LPSTR)(LPCSTR)strw, lenw);
WriteFloat(lpfw, pp);
// filew.Write(&lenw, 4);
// filew.Write(strw, lenw);
// filew.Write(&pp, 4);
}
}
catch(...)
{
MessageBox(0, "保存词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
}
try
{
//write cwmap file
for (pos = CWT.GetHeadPosition(); pos != NULL; )
{
itemcw = CWT.GetNext(pos);
strc = itemcw.code;
strw = itemcw.word;
lenc = strc.GetLength();
lenw = strw.GetLength();
WriteInt(lpfcw, lenc);
WriteStr(lpfcw, (LPSTR)(LPCSTR)strc, lenc);
WriteInt(lpfcw, lenw);
WriteStr(lpfcw, (LPSTR)(LPCSTR)strw, lenw);
// filecw.Write(&lenc, 4);
// filecw.Write(strc, lenc);
// filecw.Write(&lenw, 4);
// filecw.Write(strw, lenw);
}
}
catch(...)
{
MessageBox(0, "保存码词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
}
try
{
//write wcmap file and word file
for (pos = CPT.GetHeadPosition(); pos != NULL; )
{
itemcp = CPT.GetNext(pos);
strw = itemcp.word;
strc = itemcp.code;
cp = itemcp.cp;
cp2 = itemcp.cp2;
lenw = strw.GetLength();
lenc = strc.GetLength();
WriteInt(lpfwc, lenw);
WriteStr(lpfwc, (LPSTR)(LPCSTR)strw, lenw);
WriteInt(lpfwc, lenc);
WriteStr(lpfwc, (LPSTR)(LPCSTR)strc, lenc);
WriteFloat(lpfwc, cp);
WriteFloat(lpfwc, cp2);
}
}
catch(...)
{
MessageBox(0, "保存词码表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
}
try
{
//write transition probability table
for (pos = TPT.GetHeadPosition(); pos != NULL; )
{
itemtp = TPT.GetNext(pos);
strw = itemtp.word;
strw1 = itemtp.wordc;
tp = itemtp.tp;
lenw = strw.GetLength();
lenw1 = strw1.GetLength();
WriteInt(lpfww, lenw);
WriteStr(lpfww, (LPSTR)(LPCSTR)strw, lenw);
WriteInt(lpfww, lenw1);
WriteStr(lpfww, (LPSTR)(LPCSTR)strw1, lenw1);
WriteFloat(lpfww, tp);
}
}
catch(...)
{
MessageBox(0, "保存词词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
}
UnmapViewOfFile(lpfc);
UnmapViewOfFile(lpfw);
UnmapViewOfFile(lpfcw);
UnmapViewOfFile(lpfwc);
UnmapViewOfFile(lpfww);
CloseHandle(hfilec);
CloseHandle(hfilew);
CloseHandle(hfilecw);
CloseHandle(hfilewc);
CloseHandle(hfileww);
filec.SetLength(lenfc);
filew.SetLength(lenfw);
filecw.SetLength(lenfcw);
filewc.SetLength(lenfwc);
fileww.SetLength(lenfww);
filec.Close();
filew.Close();
filecw.Close();
filewc.Close();
fileww.Close();
_ftime(&time1);
elptime = time1.time*1000+time1.millitm - time0.time*1000+time0.millitm;
return TRUE;
}
// Translates a stream of input codes into a stream of words.
//   pcstream : NUL-terminated code stream (its length is taken with strlen).
//   pwstream : output buffer that receives the chosen words back to back,
//              NUL-terminated after the last one; the caller provides the space.
//   cdnum    : receives the number of codes found by SegmentCode (written even
//              when the function fails because that number is < 1).
//   wdnum    : receives the number of words; only written on success.
//   pcdlen   : receives strlen of every segmented code.
//   pwdlen   : receives strlen of every produced word; only written on success.
// Returns FALSE when no code was segmented or when Code2Word fails, otherwise TRUE.
// The segmented codes and the chosen words live in the shared pcode / pword tables.
HKWB_API BOOL Translate(unsigned char* pcstream, unsigned char* pwstream,
                        int& cdnum, int& wdnum, int* pcdlen, int* pwdlen)
{
    int k;
    int codeCount;
    int streamLen = strlen((char*)pcstream);

    // Split the stream into separate codes.
    SegmentCode(pcstream, streamLen, (unsigned char**)pcode, codeCount);
    cdnum = codeCount;
    if (codeCount < 1)
        return FALSE;

    // Report the length of each code to the caller.
    k = 0;
    while (k < codeCount)
    {
        pcdlen[k] = strlen((char*)pcode[k]);
        k++;
    }

    // Pick a word for every code.
    if (!Code2Word(pcode, codeCount, pword))
        return FALSE;

    // Concatenate the words into the output stream and record their lengths.
    unsigned char* dest = pwstream;
    for (k = 0; k < codeCount; k++)
    {
        const char* chosen = (const char*)pword[k];
        int chosenLen = strlen(chosen);
        strcpy((char*)dest, chosen);
        pwdlen[k] = chosenLen;
        dest += chosenLen;
    }

    // On the failure paths above wdnum keeps the caller's original value.
    wdnum = cdnum;
    return TRUE;
}
// Updates the probability tables from one confirmed code-stream / word-stream pair.
//   pcstream : NUL-terminated code stream; it is segmented into the shared pcode table.
//   pwstream : the confirmed words stored back to back.
//   pwdlen   : byte length of every word in pwstream, terminated by a 0 entry.
// For each word, in order, three updates are applied:
//   1. (first word only) its prior in WT is raised and the priors of all words
//      sharing the same code are scaled down proportionally;
//   2. (every word but the first) the transition probability previous-word -> word
//      in TPT is raised, scaling the other transitions of the previous word down
//      when their sum would exceed TPSUMLMT;
//   3. the conditional probabilities word -> code in CPT are decayed by CPINC and
//      the code actually typed is reinforced (cp2 when maxcodelen == 2, else cp).
// Returns TRUE after all words were processed. Returns FALSE as soon as a length
// in pwdlen is negative or does not fit the internal word buffer; words handled
// before that point have already been applied to the tables.
// NOTE(review): n is never compared against the code count returned by
// SegmentCode, so pwdlen is assumed to hold no more words than there are
// codes -- confirm against callers.
HKWB_API BOOL Train(unsigned char* pcstream, unsigned char* pwstream, int* pwdlen)
{
    unsigned char buffw[WORDLEN];
    CString code, word, wordc;
    unsigned char* pws = pwstream;
    WTItem itemw;
    CWTItem itemwc;
    TPTItem itemtp, itemtp0;
    CPTItem itemcp;
    int wdlen, numc, transwdnum;
    POSITION pos, pos0, posw;
    double delta, tpsum, dlt, ppsum;

    SegmentCode(pcstream, strlen((char*)pcstream), pcode, numc);
    word = ""; // previous word; empty while the first word is being processed
    for (int n = 0; ; n++)
    {
        wdlen = pwdlen[n];
        if (0 == wdlen)
            break;
        // The word must fit buffw with room for its terminating NUL; strncpy
        // would otherwise overrun the buffer or leave it unterminated.
        if (wdlen < 0 || wdlen >= (int)sizeof(buffw))
            return FALSE;
        memset(buffw, 0, sizeof(buffw));
        strncpy((char*)buffw, (char*)pws, wdlen);
        pws += wdlen;
        wordc = CString(buffw);

        //update prior probabilities
        if (n == 0)
        {//only the first word of the sentence updates its prior
            pos0 = WFind(wordc);
            //calculate the prior sum over all words of this code
            ppsum = 0;
            code = pcode[n];
            for (pos = CWFind(code); pos; CWT.GetNext(pos))
            {
                itemwc = CWT.GetAt(pos);
                if (itemwc.code != code)
                    break;
                // Skip map entries whose word is missing from the word table
                // instead of dereferencing a NULL position.
                posw = WFind(itemwc.word);
                if (posw)
                    ppsum += WT.GetAt(posw).prior;
            }
            // A zero sum would turn the proportional scaling below into 0/0.
            if (pos0 && ppsum != 0)
            {
                // One third of the gap to the group's sum; per the original
                // author's notes, one half was too sensitive and an even share
                // per word too insensitive.
                delta = (ppsum - WT.GetAt(pos0).prior)/3;
                //scale down the priors of all words of this code
                for (pos = CWFind(pcode[n]); pos; CWT.GetNext(pos))
                {
                    itemwc = CWT.GetAt(pos);
                    if (itemwc.code != code)
                        break;
                    posw = WFind(itemwc.word);
                    if (!posw)
                        continue;
                    itemw = WT.GetAt(posw);
                    itemw.prior -= itemw.prior/ppsum*delta;
                    ASSERT(itemw.prior > 0);
                    WT.SetAt(posw, itemw);
                }
                //then give the removed mass to the confirmed word
                itemw = WT.GetAt(pos0);
                itemw.prior += delta;
                ASSERT(itemw.prior > 0);
                WT.SetAt(pos0, itemw);
            }
        }

        //update transition probabilities
        if (word != "")
        {
            if (TPFind(word) != 0) //the previous word already has entries in the transition table
            {
                pos0 = TPFind(word, wordc);
                if (pos0)
                    itemtp0 = TPT.GetAt(pos0);
                else
                {
                    itemtp0.word = word;
                    itemtp0.wordc = wordc;
                    itemtp0.tp = 0.0;
                }
                tpsum = TPSum(word, transwdnum);
                // Raise by TPINC, but never push this entry beyond TPSUMLMT.
                delta = (((itemtp0.tp+TPINC) < TPSUMLMT) ? TPINC : TPSUMLMT-itemtp0.tp);
                // Amount by which the sum of the previous word's transitions
                // would exceed TPSUMLMT.
                dlt = tpsum+delta-TPSUMLMT;
                //scale down all existing entries of the previous word by the excess
                if (dlt > 0.0)
                {
                    pos = TPFind(word);
                    for (; pos; TPT.GetNext(pos))
                    {
                        itemtp = TPT.GetAt(pos);
                        if (itemtp.word != word)
                            break;
                        itemtp.tp -= itemtp.tp/tpsum*dlt;
                        ASSERT(itemtp.tp > 0);
                        TPT.SetAt(pos, itemtp);
                    }
                }
                if (pos0) //re-read: the scaling above may have changed the stored entry
                    itemtp0 = TPT.GetAt(pos0);
                itemtp0.tp += delta;
                ASSERT(itemtp0.tp > 0);
                if (pos0)
                    TPT.SetAt(pos0, itemtp0);
                else
                {
                    //find the end of the previous word's run of entries
                    pos = TPFind(word);
                    for (; pos; TPT.GetNext(pos))
                    {
                        if (TPT.GetAt(pos).word != word)
                            break;
                    }
                    if (pos) //the loop stopped on another word: insert in front of it
                        TPT.InsertBefore(pos, itemtp0);
                    else     //the run reaches the end of the table
                        TPT.AddTail(itemtp0);
                }
                ASSERT(TPSum(word, transwdnum) <= TPSUMLMT+1e-5);
            }
            else
            {
                //first transition recorded for the previous word: append and index it
                itemtp0.word = word;
                itemtp0.wordc = wordc;
                itemtp0.tp = TPINC;
                pos = TPT.AddTail(itemtp0);
                TPIdx[GetWordId((LPSTR)(LPCSTR)word)] = pos;
            }
        }
        word = wordc;

        //update conditional probabilities
        pos = CPFind(word);
        for (; pos; CPT.GetNext(pos))
        {
            itemcp = CPT.GetAt(pos);
            if (itemcp.word != word)
                break;
            //decay every code of this word ...
            if (maxcodelen == 2)
            {
                itemcp.cp2 -= CPINC*itemcp.cp2;
                ASSERT(itemcp.cp2 > 0 || itemcp.code.GetLength() > 2);
            }
            else
            {
                itemcp.cp -= CPINC*itemcp.cp;
            }
            //... and reinforce the code that was actually typed
            if (itemcp.code == pcode[n])
            {
                if (maxcodelen == 2)
                    itemcp.cp2 += CPINC;
                else
                    itemcp.cp += CPINC;
            }
            CPT.SetAt(pos, itemcp);
        }
    }
    return TRUE;
}
//generate select words table
HKWB_API BOOL GetWords(unsigned char* nofullcode, unsigned char* pcstream, int codeno, unsigned char* pwords, int& wdnum)
{
unsigned char code[CODELEN];
double ai[DUPWORD];
int wi[DUPWORD];
double dTmp;
int c, d, numc, nw, len;
CString word;
POSITION ic;
wdnum = 0;
len = strlen((char*)nofullcode);
if ( nofullcode[len-1] != 32 && len < maxcodelen)
{//generate select word by prior
strcpy((char*)code, (char*)nofullcode);
len = strlen((char*)code);
ic = CFind(code);
if (0 == ic)
return FALSE;
//transfer a[codeno][.] to ai
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2); //get duplicate word number for this code
ic = CWFind(code);
for (int m=0; m < nw; m++)
{
ai[m] = WT.GetAt(WFind(CWT.GetNext(ic).word)).prior/sumwordfre;
wi[m] = m;
}
}
else
{//generate select word by post
if (codeno < 0)
return FALSE;
SegmentCode(pcstream, strlen((char*)pcstream), pcode, numc);
strcpy((char*)code, (char*)pcode[codeno]);
len = strlen((char*)code);
if (32 == code[len-1])
code[len-1] = 0;
ic = CFind(code);
if (0 == ic)
return FALSE;
//transfer a[codeno][.] to ai
nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2); //get duplicate word number for this code
for (int m=0; m < nw; m++)
{
ai[m] = a[codeno][m];
wi[m] = m;
}
}
//sort ai and send the index to wi
for (int n = 0; n < nw-1; n++)
{
c = n;
for (int m=n+1; m < nw; m++)
{
if (ai[m] > ai[c]) //swap wi[n] and wi[m]
{
c = m;
}
}
//swap ai[n] and ai[wi[n]]
if (c != n)
{
dTmp = ai[n];
ai[n] = ai[c];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -