📄 hikerwbfunc.cpp

📁 自己用Markov模型做的一个整句五笔输入法的原型
💻 CPP
📖 第 1 页 / 共 4 页
字号:

		pos = CPT.AddTail(CPTItem(strw, strc, cp, cp2));

		//set wcmap index
		if (strncmp((char*)strw, (char*)last_strw, 2) != 0)
		{
			id = GetWordId((char*)strw);
			//if (CPIdx[id] == 0)
				CPIdx[id] = pos;
		}

		strcpy((char*)last_strw, (char*)strw);
	}
	
	//read wwmap file
	filelen = fileww.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strw, 0, CODELEN) ;
		memset(strw1, 0, WORDLEN) ;

		len += ReadInt(lpfww, lenw);
		if (lenw == 0) 
			break;
		len += ReadStr(lpfww, lenw, strw);
		len += ReadInt(lpfww, lenw1);
		len += ReadStr(lpfww, lenw1, strw1);
		len += ReadFloat(lpfww, tp);			//read transition probabilities

		pos = TPT.AddTail(TPTItem(strw, strw1, tp));

		if (strncmp((char*)strw, (char*)last_strw, 2) != 0)
		{
			id = GetWordId((char*)strw);
			//if (TPIdx[id] == 0)
				TPIdx[id] = pos;
		}
		strcpy((char*)last_strw, (char*)strw);
	}

	//read pinyin file
	filelen = filepy.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strc, 0, CODELEN) ;
		memset(strw, 0, WORDLEN) ; 

		len += ReadInt(lpfpy, lenc);
		if (lenc == 0) 
			break;

		len += ReadStr(lpfpy, lenc, strc);
		len += ReadStr(lpfpy, 2, strw);

		pos = PYT.AddTail(PYTItem(strc, strw));

		//set cwmap index
		if (strcmp((char*)strc, (char*)last_strc) != 0)
		{
			id = GetCodeId((char*)strc);
			//if (PYIdx[id] == 0)
				PYIdx[id] = pos;
		}
		strcpy((char*)last_strc, (char*)strc);
	}

	UnmapViewOfFile(lpfc);
	UnmapViewOfFile(lpfw);
	UnmapViewOfFile(lpfcw);
	UnmapViewOfFile(lpfwc);
	UnmapViewOfFile(lpfww);
	UnmapViewOfFile(lpfpy);

	CloseHandle(hfilec);
	CloseHandle(hfilew);
	CloseHandle(hfilecw);
	CloseHandle(hfilewc);
	CloseHandle(hfileww);
	CloseHandle(hfilepy);

	filec.Close();
	filew.Close();
	filecw.Close();
	filewc.Close();
	fileww.Close();
	filepy.Close();

	_ftime(&time1);
	elptime = time1.time*1000+time1.millitm - time0.time*1000+time0.millitm;
	
	return TRUE;
}

//read mb file to arrays or lists, as well as generating indexs
static BOOL SaveMB()
{
	CFile filec;
	CFile filew;
	CFile filecw;
	CFile filewc;
	CFile fileww;
	//CFileException e;
	CString csCap;
	AfxGetMainWnd()->GetWindowText(csCap);

	if (!filec.Open("..\\mb\\code.dat", CFile::modeReadWrite) ||
		!filew.Open("..\\mb\\word.dat", CFile::modeReadWrite) ||
		!filecw.Open("..\\mb\\cwmap.dat", CFile::modeReadWrite) ||
		!filewc.Open("..\\mb\\wcmap.dat", CFile::modeReadWrite) || 
		!fileww.Open("..\\mb\\wwmap.dat", CFile::modeReadWrite))
		MessageBox(NULL, "打开码表文件失败!", csCap, MB_OK|MB_ICONSTOP);

	CString strc, strw, strw1;
	int lenc, lenw, nw, nw2, lenw1;
	float cp, cp2, tp, pp;				//!must be float type.
	POSITION pos;
	CTItem itemc;
	WTItem itemw;
	CWTItem itemcw;
	CPTItem itemcp;
	TPTItem itemtp;

	DWORD lenfl, lenfh, lenfc, lenfw, lenfcw, lenfwc, lenfww;
	
//	lenfl = GetFileSize((HANDLE)(filec.m_hFile), &lenfh);
//	HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+1000, 0);
//	lenfl = GetFileSize((HANDLE)(filew.m_hFile), &lenfh);
//	HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
//	lenfl = GetFileSize((HANDLE)(filecw.m_hFile), &lenfh);
//	HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
//	lenfl = GetFileSize((HANDLE)(filewc.m_hFile), &lenfh);
//	HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+10000, 0);
//	lenfl = GetFileSize((HANDLE)(fileww.m_hFile), &lenfh);
//	HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READWRITE, lenfh, lenfl+1000000, 0);

	//calculate new file length
	lenfc = 0;
	for (pos = CT.GetHeadPosition(); pos != NULL; )
	{
		lenfc += CT.GetNext(pos).code.GetLength(); 
		lenfc += 12;
	}

	lenfw = 0;
	for (pos = WT.GetHeadPosition(); pos != NULL; )
	{
		lenfw += WT.GetNext(pos).word.GetLength();
		lenfw += 8;
	}

	lenfcw = 0;
	for (pos = CWT.GetHeadPosition(); pos != NULL; )
	{
		itemcw = CWT.GetNext(pos);
		lenfcw += itemcw.code.GetLength();
		lenfcw += itemcw.word.GetLength();
		lenfcw += 8;
	}

	lenfwc = 0;
	for (pos = CPT.GetHeadPosition(); pos != NULL; )
	{
		itemcp = CPT.GetNext(pos);
		lenfwc += itemcp.word.GetLength();
		lenfwc += itemcp.code.GetLength();
		lenfwc += 16;
	}

	lenfww = 0;
	for (pos = TPT.GetHeadPosition(); pos != NULL; )
	{
		itemtp = TPT.GetNext(pos);
		lenfww += itemtp.word.GetLength();
		lenfww += itemtp.wordc.GetLength();
		lenfww += 12;
	}

	HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READWRITE, 0, lenfc, 0);
	HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READWRITE, 0, lenfw, 0);
	HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READWRITE, 0, lenfcw, 0);
	HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READWRITE, 0, lenfwc, 0);
	HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READWRITE, 0, lenfww, 0);


	LPBYTE lpfc = (LPBYTE)MapViewOfFile(hfilec, FILE_MAP_WRITE, 0, 0, 0);
	LPBYTE lpfw = (LPBYTE)MapViewOfFile(hfilew, FILE_MAP_WRITE, 0, 0, 0);
	LPBYTE lpfcw = (LPBYTE)MapViewOfFile(hfilecw, FILE_MAP_WRITE, 0, 0, 0);
	LPBYTE lpfwc = (LPBYTE)MapViewOfFile(hfilewc, FILE_MAP_WRITE, 0, 0, 0);
	LPBYTE lpfww = (LPBYTE)MapViewOfFile(hfileww, FILE_MAP_WRITE, 0, 0, 0);

	struct _timeb time0, time1;
	time_t elptime;
	_ftime(&time0);

	try
	{
		//write Code table
		for (pos = CT.GetHeadPosition(); pos != NULL; )
		{
			itemc = CT.GetNext(pos);
			strc = itemc.code;
			nw = itemc.wnum;
			nw2 = itemc.wnum2;

			lenc = strc.GetLength();
			WriteInt(lpfc, lenc);
			WriteStr(lpfc, (LPSTR)(LPCSTR)strc, lenc);
			WriteInt(lpfc, nw);
			WriteInt(lpfc, nw2);

	//
	//		filec.Write(&lenc, 4);
	//		filec.Write(strc, lenc);
	//		filec.Write(&nw, 4);
	//		filec.Write(&nw2, 4);
		}
	}
	catch(...)
	{
		MessageBox(0, "保存码表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
	}


	try
	{
		//write word table
		for (pos = WT.GetHeadPosition(); pos != NULL; )
		{
			itemw = WT.GetNext(pos);
			strw = itemw.word;
			pp = itemw.prior;

			lenw = strw.GetLength();
			WriteInt(lpfw, lenw);
			WriteStr(lpfw, (LPSTR)(LPCSTR)strw, lenw);
			WriteFloat(lpfw, pp);

	//		filew.Write(&lenw, 4);
	//		filew.Write(strw, lenw);
	//		filew.Write(&pp, 4);
		}
	}
	catch(...)
	{
		MessageBox(0, "保存词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
	}
	
	try
	{
		//write cwmap file
		for (pos = CWT.GetHeadPosition(); pos != NULL; )
		{
			itemcw = CWT.GetNext(pos);
			strc = itemcw.code;
			strw = itemcw.word;
			lenc = strc.GetLength();
			lenw = strw.GetLength();
			WriteInt(lpfcw, lenc);
			WriteStr(lpfcw, (LPSTR)(LPCSTR)strc, lenc);
			WriteInt(lpfcw, lenw);
			WriteStr(lpfcw, (LPSTR)(LPCSTR)strw, lenw);

	//		filecw.Write(&lenc, 4);
	//		filecw.Write(strc, lenc);
	//		filecw.Write(&lenw, 4);
	//		filecw.Write(strw, lenw);
		}
	}
	catch(...)
	{
		MessageBox(0, "保存码词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
	}

	try
	{
		//write wcmap file and word file
		for (pos = CPT.GetHeadPosition(); pos != NULL; )
		{
			itemcp = CPT.GetNext(pos);
			strw = itemcp.word;
			strc = itemcp.code;
			cp = itemcp.cp;
			cp2 = itemcp.cp2;

			lenw = strw.GetLength();
			lenc = strc.GetLength();
			WriteInt(lpfwc, lenw);
			WriteStr(lpfwc, (LPSTR)(LPCSTR)strw, lenw);
			WriteInt(lpfwc, lenc);
			WriteStr(lpfwc, (LPSTR)(LPCSTR)strc, lenc);
			WriteFloat(lpfwc, cp);
			WriteFloat(lpfwc, cp2);
		}
	}
	catch(...)
	{
		MessageBox(0, "保存词码表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
	}

	try
	{
		//write transition probability table
		for (pos = TPT.GetHeadPosition(); pos != NULL; )
		{
			itemtp = TPT.GetNext(pos);
			strw = itemtp.word;
			strw1 = itemtp.wordc;
			tp = itemtp.tp;
			lenw = strw.GetLength();
			lenw1 = strw1.GetLength();

			WriteInt(lpfww, lenw);
			WriteStr(lpfww, (LPSTR)(LPCSTR)strw, lenw);
			WriteInt(lpfww, lenw1);
			WriteStr(lpfww, (LPSTR)(LPCSTR)strw1, lenw1);
			WriteFloat(lpfww, tp);
		}
	}
	catch(...)
	{
		MessageBox(0, "保存词词表时出现错误!", "终结者五笔", MB_OK|MB_ICONERROR);
	}

	UnmapViewOfFile(lpfc);
	UnmapViewOfFile(lpfw);
	UnmapViewOfFile(lpfcw);
	UnmapViewOfFile(lpfwc);
	UnmapViewOfFile(lpfww);

	CloseHandle(hfilec);
	CloseHandle(hfilew);
	CloseHandle(hfilecw);
	CloseHandle(hfilewc);
	CloseHandle(hfileww);

	filec.SetLength(lenfc);
	filew.SetLength(lenfw);
	filecw.SetLength(lenfcw);
	filewc.SetLength(lenfwc);
	fileww.SetLength(lenfww);

	filec.Close();
	filew.Close();
	filecw.Close();
	filewc.Close();
	fileww.Close();
	
	_ftime(&time1);
	elptime = time1.time*1000+time1.millitm - time0.time*1000+time0.millitm;

	return TRUE;
}

HKWB_API BOOL Translate(unsigned char* pcstream, unsigned char* pwstream, 
						int& cdnum, int& wdnum,  int* pcdlen, int* pwdlen)
{
	//Converts a code stream into a word stream.
	//  pcstream : NUL-terminated input code stream
	//  pwstream : output buffer; the translated words are written back to back
	//  cdnum    : receives the number of codes found by SegmentCode
	//  wdnum    : receives the number of words (same as cdnum); only set on success
	//  pcdlen   : receives the length of every code
	//  pwdlen   : receives the length of every word; only set on success
	//Returns FALSE when no code was found or Code2Word fails.
	//NOTE(review): pcode and pword are not declared here - presumably
	//file-level work buffers shared with the other entry points; confirm.

	int codeCount;
	int streamLen = strlen((char*)pcstream);

	//split the stream into separate codes
	SegmentCode(pcstream, streamLen, (unsigned char**)pcode, codeCount);
	cdnum = codeCount;
	if (codeCount < 1)
		return FALSE;

	//report the length of each code
	int idx;
	for (idx = 0; idx < codeCount; ++idx)
		pcdlen[idx] = strlen((char*)pcode[idx]);

	//look up one word per code; on failure wdnum and pwdlen keep their old values
	if (!Code2Word(pcode, codeCount, pword))
		return FALSE;

	//concatenate the words into the output stream and report their lengths
	unsigned char* out = pwstream;
	for (idx = 0; idx < codeCount; ++idx)
	{
		const char* current = (const char*)pword[idx];
		strcpy((char*)out, current);

		int currentLen = strlen(current);
		pwdlen[idx] = currentLen;
		out += currentLen;
	}

	wdnum = cdnum;
	return TRUE;
}

//Updates the model from one confirmed sentence.
//  pcstream : NUL-terminated code stream that was typed
//  pwstream : the confirmed words, stored back to back
//  pwdlen   : byte length of each word in pwstream; a 0 entry ends the list
//For every word three things are adjusted:
//  - prior (first word only): probability mass is moved from the other words
//    of the first code to the chosen word, keeping their sum unchanged;
//  - transition previous-word -> word: raised by up to TPINC; when the total
//    for the previous word would exceed TPSUMLMT its entries are scaled down first;
//  - conditional word -> code: every code entry of the word decays by the
//    factor CPINC and the entry of the code actually typed gains CPINC
//    (cp2 is used when maxcodelen == 2, cp otherwise).
//Returns FALSE for NULL arguments, when the stream holds no code, or when a
//word length does not fit the word buffer; TRUE otherwise.
HKWB_API BOOL Train(unsigned char* pcstream, unsigned char* pwstream, int* pwdlen)
{
	unsigned char buffw[WORDLEN];
	CString code, word, wordc;
	unsigned char* pws = pwstream;
	WTItem itemw;
	CWTItem itemwc;
	TPTItem itemtp, itemtp0;
	CPTItem itemcp;
	int wdlen, numc, transwdnum;
	POSITION pos, pos0, posw;
	double delta, tpsum, dlt, ppsum;

	if (pcstream == NULL || pwstream == NULL || pwdlen == NULL)
		return FALSE;

	SegmentCode(pcstream, strlen((char*)pcstream), pcode, numc);

	//same rule as Translate: nothing to learn from a stream without codes
	if (numc < 1)
		return FALSE;

	word = "";
	//only the first numc entries of pcode belong to this stream, so never
	//pair a word with a code beyond them
	for (int n = 0; n < numc; n++)
	{
		wdlen = pwdlen[n];
		if (0 == wdlen)
			break;

		//the word must fit into buffw together with its terminating NUL
		if (wdlen < 0 || wdlen >= (int)sizeof(buffw))
			return FALSE;

		memset(buffw, 0, sizeof(buffw));
		strncpy((char*)buffw, (char*)pws, wdlen);
		pws += wdlen;

		wordc = CString(buffw);

		//update prior probabilities
		if (n == 0)
		{//only the first word of the sentence updates its prior
			pos0 = WFind(wordc);

			//calculate prior sum for this code's all words
			ppsum = 0;
			code = pcode[n];
			for (pos = CWFind(code); pos; CWT.GetNext(pos))
			{
				itemwc = CWT.GetAt(pos);
				if (itemwc.code != code)
					break;

				//a word missing from the word table contributes nothing
				posw = WFind(itemwc.word);
				if (posw)
					ppsum += WT.GetAt(posw).prior;
			}

			//a zero sum would turn the scaling below into a division by zero
			if (pos0 && ppsum > 0)	//06.4.4
			{
				//dividing by 2 proved too sensitive
				delta = (ppsum - WT.GetAt(pos0).prior)/3;

				//adjust priors of this code's all words
				for (pos = CWFind(pcode[n]); pos; CWT.GetNext(pos))
				{
					itemwc = CWT.GetAt(pos);
					if (itemwc.code != code)
						break;

					posw = WFind(itemwc.word);
					if (!posw)
						continue;

					itemw = WT.GetAt(posw);
					itemw.prior -= itemw.prior/ppsum*delta;
					ASSERT(itemw.prior > 0);
					WT.SetAt(posw, itemw);
				}

				//hand the collected mass to the chosen word
				itemw = WT.GetAt(pos0);
				itemw.prior += delta;
				ASSERT(itemw.prior > 0);
				WT.SetAt(pos0, itemw);
			}
		}

		//update transition probabilities
		if (word != "")
		{
			if (TPFind(word) != 0)				//the lastword-word item exist in TPmap table
			{
				pos0 = TPFind(word, wordc);
				if (pos0)
					itemtp0 = TPT.GetAt(pos0);
				else
				{
					itemtp0.word = word;
					itemtp0.wordc = wordc;
					itemtp0.tp = 0.0;
				}

				tpsum = TPSum(word, transwdnum);
				//never let a single entry grow past TPSUMLMT
				delta = (((itemtp0.tp+TPINC) < TPSUMLMT) ? TPINC : TPSUMLMT-itemtp0.tp);
				//amount by which the new total would exceed TPSUMLMT
				dlt = tpsum+delta-TPSUMLMT;

				//adjust all existing item's tp
				if (dlt > 0.0)
				{
					pos = TPFind(word);
					for (; pos; TPT.GetNext(pos))
					{
						itemtp = TPT.GetAt(pos);
						if (itemtp.word != word)
							break;

						itemtp.tp -= itemtp.tp/tpsum*dlt;
						ASSERT(itemtp.tp > 0);
						TPT.SetAt(pos, itemtp);
					}
				}
				
				if (pos0)		//refresh itemtp0, it may have been scaled above
					itemtp0 = TPT.GetAt(pos0);
				itemtp0.tp += delta;
				ASSERT(itemtp0.tp > 0);
				if (pos0)
					TPT.SetAt(pos0, itemtp0);
				else
				{
					pos = TPFind(word);			//locate the end of this word's run of items
					for (; pos; TPT.GetNext(pos))
					{
						if (TPT.GetAt(pos).word != word)
							break;
					}
					if (pos)		//the loop stopped early, so pos is not the end of the table
						TPT.InsertBefore(pos, itemtp0);
					else
						TPT.AddTail(itemtp0);
				}
				ASSERT(TPSum(word, transwdnum) <= TPSUMLMT+1e-5);
			}
			else
			{
				//first transition recorded for this word: append it and index it
				itemtp0.word = word;
				itemtp0.wordc = wordc;
				itemtp0.tp = TPINC;
				pos = TPT.AddTail(itemtp0);
				TPIdx[GetWordId((LPSTR)(LPCSTR)word)] = pos;
			}
		}

		word = wordc;
				
		//update conditional probabilities
		pos = CPFind(word);
		for (; pos; CPT.GetNext(pos))
		{
			itemcp = CPT.GetAt(pos);
			if (itemcp.word != word)
				break;

			if (maxcodelen == 2)
			{
				itemcp.cp2 -= CPINC*itemcp.cp2;
				ASSERT(itemcp.cp2 > 0 || itemcp.code.GetLength() > 2);
			}
			else
			{
				itemcp.cp -= CPINC*itemcp.cp;
			}

			//reward the code that was actually typed for this word
			if (itemcp.code == pcode[n])
			{
				if (maxcodelen == 2)
					itemcp.cp2 += CPINC;
				else
					itemcp.cp += CPINC;
			}

			CPT.SetAt(pos, itemcp);
		}
	}

	return TRUE;
}

//generate select words table 
HKWB_API BOOL GetWords(unsigned char* nofullcode, unsigned char* pcstream, int codeno, unsigned char* pwords, int& wdnum)
{
	unsigned char code[CODELEN];
	double ai[DUPWORD];
	int wi[DUPWORD];
	double dTmp;
	int c, d, numc, nw, len;
	CString word;
	POSITION ic;

	wdnum = 0;
	len = strlen((char*)nofullcode);

	if ( nofullcode[len-1] != 32 && len < maxcodelen)
	{//generate select word by prior
		strcpy((char*)code, (char*)nofullcode);
		len = strlen((char*)code);

		ic = CFind(code);
		if (0 == ic)
			return FALSE;

		//transfer a[codeno][.] to ai
		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);	//get duplicate word number for this code
		ic = CWFind(code);
		for (int m=0; m < nw; m++)
		{
			ai[m] = WT.GetAt(WFind(CWT.GetNext(ic).word)).prior/sumwordfre;
			wi[m] = m;
		}
	}
	else
	{//generate select word by post
		if (codeno < 0)
			return FALSE;

		SegmentCode(pcstream, strlen((char*)pcstream), pcode, numc);

		strcpy((char*)code, (char*)pcode[codeno]);
		len = strlen((char*)code);
		if (32 == code[len-1])
			code[len-1] = 0;

		ic = CFind(code);
		if (0 == ic)
			return FALSE;

		//transfer a[codeno][.] to ai
		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);	//get duplicate word number for this code
		for (int m=0; m < nw; m++)
		{
			ai[m] = a[codeno][m];
			wi[m] = m;
		}
	}


	//sort ai and send the index to wi
	for (int n = 0; n < nw-1; n++)
	{
		c = n;
		for (int m=n+1; m < nw; m++)
		{
			if (ai[m] > ai[c])		//swap wi[n] and wi[m]
			{
				c = m;
			}
		}

		//swap ai[n] and ai[wi[n]]
		if (c != n)
		{
			dTmp = ai[n];
			ai[n] = ai[c];
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -