📄 pageinit.cpp
字号:
return TRUE;
}
return FALSE;
}
// Scan the word-code list forward from poswc, looking for the (strw, strc) pair.
//
//   strw      word to look for.
//   strc      code to look for.
//   listwc    word-code list that is scanned.
//   wcidx     not used by this function (kept for the callers' signature).
//   poswc     in:  position at which the scan starts.
//             out: position of the item where the scan stopped, or 0 when the
//                  scan ran past the last item of the list.
//   bWC2Find  out: TRUE when, before the scan stopped, an item for strw was
//                  seen whose code equals the first two characters of strc.
//
// Returns TRUE when an item with word == strw and code == strc is found
// (poswc is then that item's position), otherwise FALSE.
static BOOL WCLocate(CString strw, CString strc,
CList<WCLItem, WCLItem&> *listwc,
CMap<int, int&, POSITION, POSITION&> *wcidx, POSITION& poswc, BOOL& bWC2Find)
{
    bWC2Find = FALSE;

    while (poswc != 0)
    {
        WCLItem& cur = listwc->GetAt(poswc);
        const CString itemWord(cur.word);
        const CString itemCode(cur.code);

        if (strw != itemWord)
        {
            // Different word: stop when its first two bytes differ from strw's
            // or when strw compares less than it.
            if (strw.Left(2) != itemWord.Left(2) || strw < itemWord)
                return FALSE;
        }
        else if (strc == itemCode)
        {
            return TRUE;
        }
        else if (strc.GetLength() < itemCode.GetLength())
        {
            // Compare by code length (not strc < code) so first-level words
            // are arranged at the beginning.
            return FALSE;
        }
        else if (strc.Left(2) == itemCode)
        {
            bWC2Find = TRUE;
        }

        listwc->GetNext(poswc);
    }

    return FALSE;
}
// Look for the (word, wordc) item in the word-word list.
//
//   word    first word of the pair.
//   wordc   second word of the pair.
//   listww  word-word list that is scanned.
//   wwidx   index map; the scan starts at the position stored under
//           GetWordId(word).
//   posloc  out: see below.
//
// Returns TRUE when an item with matching word and wordc is found; posloc is
// then that item's position. Otherwise returns FALSE and posloc is:
//   - 0 when wwidx has no entry for GetWordId(word);
//   - the position just before the first item with a different word that
//     follows an item for `word` (i.e. the last item seen for `word`);
//   - the list's tail position when the scan reaches the end of the list.
static BOOL WWLocate(CString word, CString wordc,
CList<WWLItem, WWLItem&> *listww,
CMap<int, int&, POSITION, POSITION&> *wwidx, POSITION& posloc)
{
    posloc = 0;

    int wordId = GetWordId(word);
    POSITION cur;
    if (!wwidx->Lookup(wordId, cur))
        return FALSE;

    BOOL seenWord = FALSE;   // becomes TRUE once an item for `word` has been met
    while (cur)
    {
        WWLItem& entry = listww->GetAt(cur);

        if (entry.word == word)
        {
            seenWord = TRUE;
            if (entry.wordc == wordc)
            {
                posloc = cur;
                return TRUE;
            }
        }
        else if (seenWord)
        {
            // Went past the items for `word`: step back to the previous item.
            posloc = cur;
            listww->GetPrev(posloc);
            return FALSE;
        }

        listww->GetNext(cur);
    }

    posloc = listww->GetTailPosition();
    return FALSE;
}
static BOOL Convert(CString csMBFileName, CMessWnd* cMessWnd)
{
CFile filemb;
CFile filec;
CFile filew;
CFile filecw;
CFile filewc;
CFile fileww;
CFileException e;
if (!filemb.Open(csMBFileName, CFile::modeRead, &e))
{
MessageBox(NULL, "打开码表文件失败!", "错误", MB_ICONSTOP|MB_OK);
return FALSE;
}
filec.Open("..\\mb\\code.dat", CFile::modeCreate|CFile::modeWrite, &e);
filew.Open("..\\mb\\word.dat", CFile::modeCreate|CFile::modeWrite, &e);
filecw.Open("..\\mb\\cwmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
filewc.Open("..\\mb\\wcmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
fileww.Open("..\\mb\\wwmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
//unsigned char buffc[WORD_NUM][CODE_LEN];
//unsigned char buffw[WORD_NUM][WORD_LEN];
// CStringList listc1;
// CList<int, int&> listc2; //duplicate word number for a code
CMap<int, int&, POSITION, POSITION&> cidx; //index for wcmap word
CList<CLItem, CLItem&> listc;
// CStringList listcw1;
// CStringList listcw2;
// CList<float, float&> listcw3; //for computer transition probabilities
CMap<int, int&, POSITION, POSITION&> cwidx; //index for wcmap word
CList<CWLItem, CWLItem&> listcw;
// CStringList listwc1;
// CStringList listwc2;
// CList<float, float&> listwc3; //transition probabilities
// CMap<POSITION, POSITION&, POSITION, POSITION&> wcmap;
CMap<int, int&, POSITION, POSITION&> wcidx; //index for wcmap word
CList<WCLItem, WCLItem&> listwc;
CMap<int, int&, POSITION, POSITION&> widx; //index for wcmap word
CList<WLItem, WLItem&> listw;
CMap<int, int&, POSITION, POSITION&> wwidx; //index for wwmap word
CList<WWLItem, WWLItem&> listww;
unsigned char buffl[LINE_LEN];
unsigned char buffc[CODE_LEN];
unsigned char buffw[WORD_LEN];
POSITION posc, posw, poscw, poswc, poswc0, posww, posww0;
CString strw, strc, last_strw, last_strc, str1, str2;
unsigned char newl[2] = {13, 10};
int stop = 0;
int numl; //line number of origin mb-file
int lenl, lenc, lenw;
int nw, nw2; //numw is not used now
float fcp, fcp2, ftp, fpp, fpp2; //conditional probability and prior probability
int idc, idw;
BOOL bUpdateIdx;
int n;
CLItem itemc;
WLItem itemw;
CWLItem itemcw;
WCLItem itemwc;
WWLItem itemww;
float area, area2, sump, sump2;
int state;
int wordnum, chcharnum;
BOOL bWCFind, bWC2Find;
numl = CalcLineNum(&filemb);
cMessWnd->SetMessage("生成码表");
cMessWnd->m_prog.SetRange32(0, numl);
cMessWnd->m_prog.SetStep(1);
//ic = 1;
for ( ;; )
{
memset(buffl, 0, 256);
lenl = ReadLine(&filemb, buffl);
if (lenl == 0)
break;
unsigned char* blp = buffl;
memset(buffc, 0, CODE_LEN);
lenc = ReadCode(blp, buffc);
if (lenc == 0)
break;
idc = GetCodeId(CString(buffc));
BOOL bAutoAdd = (AUTOADD2CCODE && (strlen((char*)buffc) > 2));
nw = nw2 = 0; //number of duplicate word for the same code
for( ; ; )
{
memset(buffw, 0, WORD_LEN);
lenw = ReadWord(blp, buffw);
if (CString(buffw) == "你")
Sleep(0);
if (lenw == 0)
break;
if (lenw%2 == 1) // a word' length must be integral times of 2
{
MessageBox(NULL, CString("转换码表时发生错误,错误码")+CString(buffc), "错误", MB_ICONSTOP|MB_OK);
stop = 1;
break;
}
//add code-word item into CWT
BOOL bAutoAddCW = bAutoAdd && (strlen((char*)buffw) == 2);
if (bAutoAddCW)
{
strc = CString(buffc).Left(2);
strw = CString(buffw);
int id = GetCodeId(strc);
POSITION pos;
if (!CWLocate(strc, strw, &listcw, &cwidx, pos))
{//if the strc-strw item not exists, add it
if (pos)
listcw.InsertBefore(pos, CWLItem(strc, strw, 0.0, 0.0, TRUE));
else
{
pos = listcw.AddTail(CWLItem(strc, strw, 0.0, 0.0, TRUE));
if (cwidx[id] == 0) //no 1 item of the code in code-word table
cwidx[id] = pos;
}
if (cidx[id] == 0) //code does not exist
{
cidx[id] = listc.AddTail(CLItem(strc, 0, 1));
}
else
{
itemc = listc.GetAt(cidx[id]);
itemc.numw2++;
listc.SetAt(cidx[id], itemc);
}
}
}
if (cwidx[idc] == 0)
cwidx[idc] = listcw.AddTail(CWLItem(buffc, buffw, 0.0, 0.0, FALSE));
else
listcw.InsertAfter(cwidx[idc], CWLItem(buffc, buffw, 0.0, 0.0, FALSE));
nw++;
nw2++;
//add word and it's corresponding code index into listwc;
BOOL bAutoAddWC = bAutoAddCW;
idw = GetWordId(CString(buffw));
poswc = wcidx[idw];
if (!poswc) //the word's fist chinese character doesn't exist
{
if (bAutoAddWC)
{
poswc = listwc.AddTail(WCLItem(buffw, CString(buffc).Left(2), 0.0f, 0.0f, TRUE));
listwc.AddTail(WCLItem(buffw, buffc, 0.0f, 0.0f, FALSE));
wcidx.SetAt(idw, poswc);
}
else
{
poswc = listwc.AddTail(WCLItem(buffw, buffc, 0.0f, 0.0f, FALSE));
wcidx.SetAt(idw, poswc);
}
}
else
{
strw = CString(buffw);
strc = CString(buffc);
bWCFind = WCLocate(strw, strc, &listwc, &wcidx, poswc, bWC2Find);
if (!bWCFind) //item not exist
{
//find the apporiate position to insert buffw-buffc item, note all items are sort by word and code's length
if (!poswc) //insert position is at the tail
{
if (bAutoAddWC) //the buffc.left(2)-buffw item does not exist
{
int id = GetWordId(strw.Left(2));
if (wcidx[id] == 0)
wcidx[id] = listwc.AddTail(WCLItem(strw, strc.Left(2), 0.0f, 0.0f, TRUE));
}
listwc.AddTail(WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
}
else
{
if (poswc == wcidx[idw]) //the insert position is the word-two-first-bytes's first appearing position
bUpdateIdx = TRUE; //update index table
else
bUpdateIdx = FALSE;
if (bAutoAddWC) //the buffc.left(2)-buffw item does not exist
{
poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
if (!bWC2Find)
poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc.Left(2), 0.0f, 0.0f, TRUE));
}
else
{
poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
}
if (bUpdateIdx)
wcidx[idw] = poswc;
}
}
}
}
//add code into CT table
posc = listc.AddTail(CLItem(buffc, nw, nw2));
cidx.SetAt(idc, posc);
cMessWnd->m_prog.StepIt();
if (stop)
break;
}
if (stop)
return FALSE;
//calculate conditional probability
cMessWnd->SetMessage("计算条件概率");
cMessWnd->m_prog.SetRange32(0, listcw.GetCount()+listwc.GetCount()*2);
cMessWnd->m_prog.SetStep(1);
cMessWnd->m_prog.SetPos(0);
last_strc = "";
for (poscw = listcw.GetHeadPosition(); poscw; )
{
itemcw = listcw.GetAt(poscw);
strc = itemcw.code;
if (strc != last_strc)
{
idc = GetCodeId(strc);
nw = listc.GetAt(cidx[idc]).numw;
area = float(nw*nw);
if (AUTOADD2CCODE)
{
area2 = float(nw2*nw2);
nw2 = listc.GetAt(cidx[idc]).numw2;
}
n = 0;
}
if (0 == n)
itemcw.post = (2*(nw-n)-1)/area*POST_SUM + (1-POST_SUM);
else
{
if (n < nw) //note! n may be more than nw, if AUTOADD2CCODE set
itemcw.post = (2*(nw-n)-1)/area*POST_SUM; //按直角三角形面积分配后验概率, 但有
}
if (AUTOADD2CCODE && strc.GetLength() <= 2)
{
if (0 == n)
itemcw.post2 = (2*(nw2-n)-1)/area2*POST_SUM + (1-POST_SUM);
else
itemcw.post2 = (2*(nw2-n)-1)/area2*POST_SUM;
}
listcw.SetAt(poscw, itemcw);
last_strc = strc;
n++;
listcw.GetNext(poscw);
cMessWnd->m_prog.StepIt();
}
poswc = listwc.GetHeadPosition();
last_strw = listwc.GetAt(poswc).word;
last_strc = listwc.GetAt(poswc).code;
poswc0 = poswc;
sump = 0.0f;
sump2 = 0.0f;
state = 0;
for (; poswc; listwc.GetNext(poswc))
{
itemwc = listwc.GetAt(poswc);
strw = itemwc.word;
strc = itemwc.code;
if (strw == "你")
Sleep(0);
if (strw != last_strw)
{
if (state == 0)
{
poswc = poswc0; //if state == 0, return to the first item of this word and calc post prob.
state = 1;
itemwc = listwc.GetAt(poswc);
strw = itemwc.word;
strc = itemwc.code;
}
else
{
poswc0 = poswc;
sump = 0.0f;
sump2 = 0.0f;
state = 0;
}
}
if (state == 0)
{
poscw = CWFind(strc, strw, &listcw, &cwidx);
itemwc.cp = listcw.GetAt(poscw).post;
itemwc.cp2 = listcw.GetAt(poscw).post2;
listwc.SetAt(poswc, itemwc);
sump += itemwc.cp;
sump2 += itemwc.cp2;
}
else
{
itemwc.cp /= sump;
itemwc.cp2 /= sump2;
listwc.SetAt(poswc, itemwc);
}
last_strw = strw;
cMessWnd->m_prog.StepIt();
}
// if (nc == 1)
// {
// fcp = 1.0f;
// listwc3.AddTail(fcp);
// }
// else
// {
//// if (Is1CWord(last_strw)) //一级字优先
//// {
//// fcp = 0.9f;
//// listwc3.AddTail(fcp);
//// fcp = 0.1f/(nc-1);
//// }
//// else
// {
// fcp = 0.9f;
// listwc3.AddTail(fcp);
// fcp = 0.1f/(nc-1);
// }
//
// for (int n = 1; n < nc; n++)
// listwc3.AddTail(fcp);
// }
//
// numw++;
// nc = 1;
// last_strw = strw;
// }
// else
// {
// nc++;
// }
// }
//generate word table
cMessWnd->SetMessage("生成词表");
cMessWnd->m_prog.SetRange32(0, listwc.GetCount() + listw.GetCount());
cMessWnd->m_prog.SetStep(1);
cMessWnd->m_prog.SetPos(0);
wordnum = 0;
chcharnum = 0;
last_strw = "";
fpp = DEFAULT_PRIOR; //initialize the word prior probabilities with uniform distribution
fpp2 = DEFAULT_PRIOR;
for (poswc = listwc.GetHeadPosition(); poswc != NULL; )
{
itemwc = listwc.GetNext(poswc);
strw = itemwc.word;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -