⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pageinit.cpp

📁 自己用Markov模型做的一个整句五笔输入法的原型
💻 CPP
📖 第 1 页 / 共 3 页
字号:
			return TRUE;
	}

	return FALSE;
}

// Scans the word-code list forward, starting at poswc, for the (strw, strc) pair.
//
// Returns TRUE when an entry whose word equals strw and whose code equals strc
// is reached; poswc is left on that entry. Returns FALSE otherwise.
//
// On a FALSE return poswc is either 0 (it was 0 on entry, or the scan walked
// off the end of the list) or the entry at which the scan gave up, i.e.:
//   - an entry for strw whose code is longer than strc,
//   - an entry whose word differs from strw in its Left(2) prefix
//     (presumably the first double-byte Chinese character -- confirm), or
//   - an entry whose word compares greater than strw.
//
// bWC2Find is reset to FALSE on entry and set to TRUE when a visited entry for
// strw carries a code equal to strc.Left(2) (and different from strc itself).
// wcidx is accepted for interface compatibility but is not used here.
static BOOL WCLocate(CString strw, CString strc,
				CList<WCLItem, WCLItem&> *listwc,
				CMap<int, int&, POSITION, POSITION&> *wcidx, POSITION& poswc, BOOL& bWC2Find)
{
	bWC2Find = FALSE;

	while (poswc != 0)
	{
		const CString curWord(listwc->GetAt(poswc).word);
		const CString curCode(listwc->GetAt(poswc).code);

		if (strw != curWord)
		{
			// A different word: give up once the prefix changes or the
			// list has moved past where strw would sort.
			if (strw.Left(2) != curWord.Left(2) || strw < curWord)
				return FALSE;
		}
		else if (strc == curCode)
		{
			return TRUE;		// exact word/code pair found
		}
		else if (strc.GetLength() < curCode.GetLength())
		{
			// Codes are compared by length (not lexically) so that the
			// shorter codes of a word stay ahead of the longer ones.
			return FALSE;
		}
		else if (strc.Left(2) == curCode)
		{
			bWC2Find = TRUE;	// the two-character short code of strc is already listed
		}

		listwc->GetNext(poswc);
	}

	return FALSE;
}

// Looks up the (word, wordc) pair in the word-word list.
//
// The scan starts at the position that wwidx stores under GetWordId(word) and
// walks forward through listww.
//
// Returns TRUE, with posloc on the matching entry, when an entry holding both
// this word and this wordc is found. Returns FALSE otherwise, with posloc:
//   - 0 when wwidx has no entry for GetWordId(word);
//   - on the last entry of the run of `word` entries when that run ended
//     (an entry with another word followed it) without a wordc match;
//   - on the list tail when the scan reached the end of the list.
static BOOL WWLocate(CString word, CString wordc,
				CList<WWLItem, WWLItem&> *listww,
				CMap<int, int&, POSITION, POSITION&> *wwidx, POSITION& posloc)
{
	posloc = 0;

	int key = GetWordId(word);
	POSITION cur;
	if (!wwidx->Lookup(key, cur))
		return FALSE;

	BOOL seenWord = FALSE;		// TRUE once the first entry for `word` was visited
	while (cur)
	{
		WWLItem& entry = listww->GetAt(cur);

		if (entry.word == word)
		{
			seenWord = TRUE;
			if (entry.wordc == wordc)
			{
				posloc = cur;
				return TRUE;
			}
		}
		else if (seenWord)
		{
			// First entry past the run of `word`: step back onto the run's last entry.
			posloc = cur;
			listww->GetPrev(posloc);
			return FALSE;
		}

		listww->GetNext(cur);
	}

	posloc = listww->GetTailPosition();
	return FALSE;
}


static BOOL Convert(CString csMBFileName, CMessWnd* cMessWnd)
{
	CFile filemb;
	CFile filec;
	CFile filew;
	CFile filecw;
	CFile filewc;
	CFile fileww;
	CFileException e;

	if (!filemb.Open(csMBFileName, CFile::modeRead, &e))
	{
		MessageBox(NULL, "打开码表文件失败!", "错误", MB_ICONSTOP|MB_OK);
		return FALSE;
	}

	filec.Open("..\\mb\\code.dat", CFile::modeCreate|CFile::modeWrite, &e);
	filew.Open("..\\mb\\word.dat", CFile::modeCreate|CFile::modeWrite, &e);
	filecw.Open("..\\mb\\cwmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
	filewc.Open("..\\mb\\wcmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
	fileww.Open("..\\mb\\wwmap.dat", CFile::modeCreate|CFile::modeWrite, &e);
	

	//unsigned char buffc[WORD_NUM][CODE_LEN];
	//unsigned char buffw[WORD_NUM][WORD_LEN];

//	CStringList listc1;
//	CList<int, int&> listc2;		//duplicate word number for a code
	CMap<int, int&, POSITION, POSITION&> cidx;		//index for wcmap word
	CList<CLItem, CLItem&> listc;

//	CStringList listcw1;
//	CStringList listcw2;
//	CList<float, float&> listcw3;		//for computer transition probabilities
	CMap<int, int&, POSITION, POSITION&> cwidx;		//index for wcmap word
	CList<CWLItem, CWLItem&> listcw;

//	CStringList listwc1;
//	CStringList listwc2;
//	CList<float, float&> listwc3;			//transition probabilities
//	CMap<POSITION, POSITION&, POSITION, POSITION&> wcmap;
	CMap<int, int&, POSITION, POSITION&> wcidx;		//index for wcmap word
	CList<WCLItem, WCLItem&> listwc;

	CMap<int, int&, POSITION, POSITION&> widx;		//index for wcmap word
	CList<WLItem, WLItem&> listw;

	CMap<int, int&, POSITION, POSITION&> wwidx;		//index for wwmap word
	CList<WWLItem, WWLItem&> listww;

	unsigned char buffl[LINE_LEN];
	unsigned char buffc[CODE_LEN];
	unsigned char buffw[WORD_LEN];
	POSITION posc, posw, poscw, poswc, poswc0, posww, posww0;
	CString strw, strc, last_strw, last_strc, str1, str2;
	unsigned char newl[2] = {13, 10};
	int stop = 0;
	int numl;		//line number of origin mb-file
	int lenl, lenc, lenw;
	int nw, nw2;		//numw is not used now
	float fcp, fcp2, ftp, fpp, fpp2;				//conditional probability and prior probability
	int idc, idw;
	BOOL bUpdateIdx;
	int n;
	CLItem itemc;
	WLItem itemw;
	CWLItem itemcw;
	WCLItem itemwc;
	WWLItem itemww;
	float area, area2, sump, sump2;
	int state;
	int wordnum, chcharnum;
	BOOL bWCFind, bWC2Find;

	numl = CalcLineNum(&filemb);
	cMessWnd->SetMessage("生成码表");
	cMessWnd->m_prog.SetRange32(0, numl);
	cMessWnd->m_prog.SetStep(1);

	//ic = 1;
	for ( ;; )
	{
		memset(buffl, 0, 256);
		lenl = ReadLine(&filemb, buffl);
		if (lenl == 0)
			break;

		unsigned char* blp = buffl;
		memset(buffc, 0, CODE_LEN);
		lenc = ReadCode(blp, buffc);
		if (lenc == 0)
			break;

		idc = GetCodeId(CString(buffc));
		BOOL bAutoAdd = (AUTOADD2CCODE && (strlen((char*)buffc) > 2));

		nw = nw2 = 0;					//number of duplicate word for the same code
		for( ; ; )
		{
			memset(buffw, 0, WORD_LEN);
			lenw = ReadWord(blp, buffw);

			if (CString(buffw) == "你")
				Sleep(0);

			if (lenw == 0)
				break;
			
			if (lenw%2 == 1)		// a word' length must be integral times of 2
			{
				MessageBox(NULL, CString("转换码表时发生错误,错误码")+CString(buffc), "错误", MB_ICONSTOP|MB_OK);
				stop = 1;
				break;
			}

			//add code-word item into CWT
			BOOL bAutoAddCW = bAutoAdd && (strlen((char*)buffw) == 2);
			if (bAutoAddCW)
			{
				strc = CString(buffc).Left(2);
				strw = CString(buffw);

				int id = GetCodeId(strc);
				POSITION pos;
				if (!CWLocate(strc, strw, &listcw, &cwidx, pos))
				{//if the strc-strw item not exists, add it
					if (pos)
						listcw.InsertBefore(pos, CWLItem(strc, strw, 0.0, 0.0, TRUE));
					else
					{
						pos = listcw.AddTail(CWLItem(strc, strw, 0.0, 0.0, TRUE));
						if (cwidx[id] == 0)		//no 1 item of the code in code-word table
							cwidx[id] = pos;
					}

					if (cidx[id] == 0)			//code does not exist
					{
						cidx[id] = listc.AddTail(CLItem(strc, 0, 1));
					}
					else
					{
						itemc = listc.GetAt(cidx[id]);
						itemc.numw2++;
						listc.SetAt(cidx[id], itemc);
					}
				}
			}

			if (cwidx[idc] == 0)
				cwidx[idc] = listcw.AddTail(CWLItem(buffc, buffw, 0.0, 0.0, FALSE));
			else
				listcw.InsertAfter(cwidx[idc], CWLItem(buffc, buffw, 0.0, 0.0, FALSE));
			nw++;
			nw2++;


			//add word and it's corresponding code index into listwc;
			BOOL bAutoAddWC = bAutoAddCW;
			idw = GetWordId(CString(buffw));
			poswc = wcidx[idw];
			if (!poswc)		//the word's fist chinese character doesn't exist
			{
				if (bAutoAddWC)
				{
					poswc = listwc.AddTail(WCLItem(buffw, CString(buffc).Left(2), 0.0f, 0.0f, TRUE));
					listwc.AddTail(WCLItem(buffw, buffc, 0.0f, 0.0f, FALSE));
					wcidx.SetAt(idw, poswc);
				}
				else
				{
					poswc = listwc.AddTail(WCLItem(buffw, buffc, 0.0f, 0.0f, FALSE));
					wcidx.SetAt(idw, poswc);
				}
			}
			else
			{
				strw = CString(buffw);
				strc = CString(buffc);
				bWCFind = WCLocate(strw, strc, &listwc, &wcidx, poswc, bWC2Find);

				if (!bWCFind)		//item not exist
				{
					//find the apporiate position to insert buffw-buffc item, note all items are sort by word and code's length
					if (!poswc)		//insert position is at the tail
					{
						if (bAutoAddWC)		//the buffc.left(2)-buffw item does not exist
						{
							int id = GetWordId(strw.Left(2));
							if (wcidx[id] == 0)
								wcidx[id] = listwc.AddTail(WCLItem(strw, strc.Left(2), 0.0f, 0.0f, TRUE));
						}
						listwc.AddTail(WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
					}
					else		
					{
						if (poswc == wcidx[idw])		//the insert position is the word-two-first-bytes's first appearing position 
							bUpdateIdx = TRUE;			//update index table
						else
							bUpdateIdx = FALSE;

						if (bAutoAddWC)		//the buffc.left(2)-buffw item does not exist
						{
							poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
							if (!bWC2Find)
								poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc.Left(2), 0.0f, 0.0f, TRUE));
						}
						else
						{
							poswc = listwc.InsertBefore(poswc, WCLItem(strw, strc, 0.0f, 0.0f, FALSE));
						}

						if (bUpdateIdx)
							wcidx[idw] = poswc;
					}
				}
			}
		}

		//add code into CT table
		posc = listc.AddTail(CLItem(buffc, nw, nw2));
		cidx.SetAt(idc, posc);

		cMessWnd->m_prog.StepIt();
		if (stop)
			break;
	}

	if (stop)
		return FALSE;

	//calculate conditional probability
	cMessWnd->SetMessage("计算条件概率");
	cMessWnd->m_prog.SetRange32(0, listcw.GetCount()+listwc.GetCount()*2);
	cMessWnd->m_prog.SetStep(1);
	cMessWnd->m_prog.SetPos(0);

	last_strc = "";
	for (poscw = listcw.GetHeadPosition(); poscw; )
	{
		itemcw = listcw.GetAt(poscw);
		strc = itemcw.code;
		
		if (strc != last_strc)
		{
			idc = GetCodeId(strc);
			nw = listc.GetAt(cidx[idc]).numw;
			area = float(nw*nw);
			if (AUTOADD2CCODE)
			{
				area2 = float(nw2*nw2);
				nw2 = listc.GetAt(cidx[idc]).numw2;
			}
			n = 0;
		}

		if (0 == n)
			itemcw.post = (2*(nw-n)-1)/area*POST_SUM + (1-POST_SUM);
		else
		{
			if (n < nw)		//note! n may be more than nw, if AUTOADD2CCODE set
				itemcw.post = (2*(nw-n)-1)/area*POST_SUM;		//按直角三角形面积分配后验概率, 但有
		}
		if (AUTOADD2CCODE && strc.GetLength() <= 2)
		{
			if (0 == n)
				itemcw.post2 = (2*(nw2-n)-1)/area2*POST_SUM + (1-POST_SUM);
			else
				itemcw.post2 = (2*(nw2-n)-1)/area2*POST_SUM;
		}
		listcw.SetAt(poscw, itemcw);

		last_strc = strc;
		n++;
		listcw.GetNext(poscw);

		cMessWnd->m_prog.StepIt();
	}

	
	poswc = listwc.GetHeadPosition();
	last_strw = listwc.GetAt(poswc).word;
	last_strc = listwc.GetAt(poswc).code;
	poswc0 = poswc;
	sump = 0.0f;
	sump2 = 0.0f;
	state = 0;
	for (; poswc; listwc.GetNext(poswc))
	{
		itemwc = listwc.GetAt(poswc);
		strw = itemwc.word;
		strc = itemwc.code;

		if (strw == "你")
			Sleep(0);

		if (strw != last_strw)
		{
			if (state == 0)
			{
				poswc = poswc0;		//if state == 0, return to the first item of this word and calc post prob.
				state = 1;

				itemwc = listwc.GetAt(poswc);
				strw = itemwc.word;
				strc = itemwc.code;
			}
			else
			{
				poswc0 = poswc;
				sump = 0.0f;
				sump2 = 0.0f;
				state = 0;
			}
		}

		if (state == 0)
		{
			poscw = CWFind(strc, strw, &listcw, &cwidx);
			itemwc.cp = listcw.GetAt(poscw).post;
			itemwc.cp2 = listcw.GetAt(poscw).post2;
			listwc.SetAt(poswc, itemwc);
			sump += itemwc.cp;
			sump2 += itemwc.cp2;
		}
		else
		{
			itemwc.cp /= sump;
			itemwc.cp2 /= sump2;
			listwc.SetAt(poswc, itemwc);
		}

		last_strw = strw;

		cMessWnd->m_prog.StepIt();
	}
	
//			if (nc == 1)
//			{
//				fcp = 1.0f;
//				listwc3.AddTail(fcp);
//			}
//			else
//			{
////				if (Is1CWord(last_strw))		//一级字优先
////				{
////					fcp = 0.9f;
////					listwc3.AddTail(fcp);
////					fcp = 0.1f/(nc-1);
////				}
////				else
//				{
//					fcp = 0.9f;
//					listwc3.AddTail(fcp);
//					fcp = 0.1f/(nc-1);
//				}
//
//				for (int n = 1; n < nc; n++)
//					listwc3.AddTail(fcp);
//			}
//
//			numw++;
//			nc = 1;
//			last_strw = strw;
//		}
//		else
//		{
//			nc++;
//		}
//	}

	//generate word table
	cMessWnd->SetMessage("生成词表");
	cMessWnd->m_prog.SetRange32(0, listwc.GetCount() + listw.GetCount());
	cMessWnd->m_prog.SetStep(1);
	cMessWnd->m_prog.SetPos(0);

	wordnum = 0;
	chcharnum = 0;
	last_strw = "";
	fpp = DEFAULT_PRIOR;			//initialize the word prior probabilities with uniform distribution
	fpp2 = DEFAULT_PRIOR;
	for (poswc = listwc.GetHeadPosition(); poswc != NULL; )
	{
		itemwc = listwc.GetNext(poswc);
		strw = itemwc.word;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -