⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hikerwbfunc.cpp

📁 自己用Markov模型做的一个整句五笔输入法的原型
💻 CPP
📖 第 1 页 / 共 4 页
字号:

		tp += itemtp.tp;
		wordcnum++;
	}

	return tp;
}

// Decides whether `word` may be emitted when output is limited to GB2312.
//
// Returns TRUE when
//   - `word` is empty, or
//   - the code stored for `word` in CPT begins with 'z', or
//   - the first two bytes of `word` satisfy
//     0xB0 <= lead <= 0xF7 and 0xA1 <= trail <= 0xFE.
// Returns FALSE in every other case. Only the first two bytes are inspected.
static BOOL IsGB2312(CString word)
{
	if (word.GetLength() < 1)
		return TRUE;

	// CPFind is defined elsewhere; a null position must not reach CPT.GetAt.
	POSITION icp = CPFind(word);
	if (icp != NULL)
	{
		CString code = CPT.GetAt(icp).code;
		if (!code.IsEmpty() && code[0] == 'z')
			return TRUE;
	}

	// A one-byte word has no trail byte, so it cannot match the range test
	// below; return before indexing past the last character.
	if (word.GetLength() < 2)
		return FALSE;

	unsigned char c1 = word[0];
	unsigned char c2 = word[1];

	if (c1 >= 0xB0 && c1 <= 0xF7 && c2 >= 0xA1 && c2 <= 0xFE)
		return TRUE;

	return FALSE;
}

// Cuts the byte stream pcs[0..pcslen-1] into separate codes.
//
// Each code is written into successive rows of `pcode`; `numc` receives the
// number of codes started. A code ends when a space (byte 32) is met or when
// it reaches its maximum length: 4 bytes if it began with 'z', otherwise
// `maxcodelen`. Whenever a code ends, the next row of `pcode` is cleared
// (CODELEN bytes) so it is ready to be filled.
//
// The caller must supply enough rows in `pcode`; no capacity is checked here.
// Always returns TRUE.
BOOL SegmentCode(unsigned char* pcs, int pcslen, unsigned char** pcode, int& numc)
{
	int filled = 0;               // bytes already stored in the code being built
	BOOL startedWithZ = FALSE;    // set when the current code's first byte is 'z'

	numc = 0;
	memset(pcode[0], 0, CODELEN);

	for (int pos = 0; pos < pcslen; ++pos)
	{
		const unsigned char ch = pcs[pos];

		if (ch == 32)
		{
			// a space terminates the code under construction, if there is one
			if (filled > 0)
			{
				filled = 0;
				memset(pcode[numc], 0, CODELEN);
			}
			continue;
		}

		// first byte of a new code
		if (filled == 0)
		{
			++numc;
			startedWithZ = (ch == 'z') ? TRUE : FALSE;
		}

		pcode[numc - 1][filled] = ch;
		++filled;

		// code is full: get the following row ready
		if (filled == (startedWithZ ? 4 : maxcodelen))
		{
			filled = 0;
			memset(pcode[numc], 0, CODELEN);
		}
	}

	return TRUE;
}

// Scans pcode[0..numc-1] and overwrites, in place, every code that CFind
// does not locate with a run of 'z' characters of the same length.
// Always returns TRUE.
static BOOL CheckCodes(unsigned char** pcode, int numc)
{
	for (int idx = 0; idx < numc; ++idx)
	{
		CString current;
		current = pcode[idx];

		// known code: leave it untouched
		if (CFind(current) != 0)
			continue;

		// unknown code: substitute 'z' for every character
		strcpy((char*)(pcode[idx]), (const char*)CString('z', current.GetLength()));
	}

	return TRUE;
}

// Tells whether the CWT entry at pos2 is reached by stepping forward from
// pos1, taking at most nw steps. pos1 itself is not compared against pos2.
// Returns TRUE on reaching pos2, FALSE if the list ends or nw steps elapse.
static BOOL CWIsBefore(POSITION pos1, POSITION pos2, int nw)
{
	POSITION cursor = pos1;
	int steps = 0;

	while (steps < nw)
	{
		CWT.GetNext(cursor);		// advances cursor to the following entry

		if (cursor == NULL)
			return FALSE;			// ran off the end of the list
		if (cursor == pos2)
			return TRUE;

		++steps;
	}

	return FALSE;
}

// Returns the transition probability from `word` to `wordc`.
//
// If TPFind locates an entry for the pair, its stored `tp` is returned.
// Otherwise the value is derived from the prior of `wordc` divided by
// `sumwordfre`:
//   - transwdnum == 0 : the normalised prior itself;
//   - otherwise       : the normalised prior scaled by (1.0 - tpsum).
static double GetTransPorb(CString word, CString wordc, double tpsum, int transwdnum)
{
	const POSITION hit = TPFind(word, wordc);

	// an explicit transition entry exists
	if (hit != 0)
		return TPT.GetAt(hit).tp;

	// no entry and no recorded transitions: fall back to the prior alone
	if (transwdnum == 0)
		return WT.GetAt(WFind(wordc)).prior/sumwordfre;

	// no entry for this pair: weight the prior by the leftover probability mass
	return (1.0-tpsum)*WT.GetAt(WFind(wordc)).prior/sumwordfre;
}



// Fills the table b[i][iw] for every code position i (0..numc-1) and every
// candidate word iw of that code.
//
// Pass 1: b[i][iw] is set to the conditional probability stored in CPT for
//         the (word, code) pair (`cp` when maxcodelen == 4, else `cp2`).
// Pass 2: walking i from numc-2 down to 0, b[i][iw] is multiplied by
//         sum over candidates wordc of code i+1 of
//             b[i+1][iwc] * GetTransPorb(word, wordc, ...).
//
// Returns FALSE as soon as CFind fails for a code, TRUE otherwise.
// NOTE(review): nw / nwc are not checked against the row length of b
// (presumably DUPWORD) -- confirm against the allocation site.
static BOOL Backward(unsigned char** pcode, double** b, int numc)
{
	CString code;
	CString codec;
	CString word;
	CString wordc;

	POSITION ic, icc, icw, icwc, icp;
	int nw, nwc;
	double tp, tpsum;

	int i, iw, iwc, transwdnum; 
	POSITION k, kc;
	double sumiwb;

	// Pass 1: seed b with the per-candidate conditional probabilities.
	for (i = 0; i < numc; i++)
	{
		code = pcode[i];
		ic = CFind(code);		// position of this code in CT
		if (ic == 0)
			return FALSE;

		// number of candidate words for this code
		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
		icw = CWFind(code);		// first CWT entry for this code

		k = icw;
		for (iw = 0; iw < nw; iw++)
		{
			word = CWT.GetNext(k).word;		// read candidate, advance cursor
			icp = CPFind(word, code);		// CPT entry for (word, code); not checked for null
			b[i][iw] = ((maxcodelen == 4) ? CPT.GetAt(icp).cp : CPT.GetAt(icp).cp2);
		}
	}

	// Pass 2: fold in the successor position, last-but-one code first.
	for (i = numc-2; i >= 0; i--)
	{
		code = pcode[i];
		ic = CFind(code);		// position of this code in CT
		if (ic == 0)
			return FALSE;

		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
		icw = CWFind(code);

		k = icw;
		for (iw = 0; iw < nw; iw++)
		{
			word = CWT.GetNext(k).word;
			// TPSum also reports a count through transwdnum (passed on to GetTransPorb)
			tpsum = TPSum(word, transwdnum);
			ASSERT(tpsum < 1.0);
			
			// Look up the following code; it was already validated in pass 1.
			codec = pcode[i+1];
			icc = CFind(codec);		// position of the next code in CT
			nwc = ((maxcodelen == 4) ? CT.GetAt(icc).wnum : CT.GetAt(icc).wnum2);
			icwc = CWFind(codec);
			
			kc = icwc;
			sumiwb = 0;
			for (iwc = 0; iwc < nwc; iwc++)
			{
				wordc = CWT.GetNext(kc).word;
				tp = GetTransPorb(word, wordc, tpsum, transwdnum);
				ASSERT(tp >= -1e-5);
				sumiwb += b[i+1][iwc] * tp;
			}
			b[i][iw] *= sumiwb;
		}
	}


    return TRUE;
}

// Fills the table a[i][iw] for every code position i and candidate word iw,
// using the table b produced beforehand.
//
// Position 0: a[0][m] = b[0][m] * prior(word m) / sumwordfre.
// Position i >= 1: for each candidate iwp of the previous code,
//   1. a[i][iw] is used as scratch: b[i][iw] * GetTransPorb(wordp, word, ...),
//      and these values are summed into sumaiw[iwp];
//   2. sumaiwp[iw] accumulates a[i][iw] / sumaiw[iwp] * a[i-1][iwp].
// After all previous candidates are processed, a[i][iw] = sumaiwp[iw].
//
// Returns FALSE as soon as CFind fails for a code, TRUE otherwise.
// NOTE(review): nw and nwp index the DUPWORD-sized locals without a bound
// check, and the division by sumaiw[iwp] is unguarded -- confirm both are
// safe for the data in use.
static BOOL Foreward(unsigned char** pcode, double** a, double** b, int numc)
{
	POSITION ic, icp, ipp, icw, icwp;
	int iw, nw, iwp, nwp, transwdnum;
	double pp, tp, tpsum;
	double sumaiw[DUPWORD];		// per previous candidate: sum of scratch a[i][*]
	double sumaiwp[DUPWORD];	// per current candidate: accumulated result
	CString code, codep, word, wordp;
	POSITION k, kp;

	code = pcode[0];
	ic = CFind(code);		// position of the first code in CT
	if (ic == 0)
		return FALSE;

	// Initial column: b weighted by each candidate's normalised prior.
	nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
	icw = CWFind(code);
	for (int m = 0; m < nw; m++)
	{
		ipp = WFind(CWT.GetNext(icw).word);		// WT position of the candidate word
		pp = WT.GetAt(ipp).prior/sumwordfre;
		a[0][m] = b[0][m] * pp;
	}

	for (int i = 1; i < numc; i++)
	{
		code = pcode[i];
		ic = CFind(code);		// position of the current code in CT
		if (ic == 0)
			return FALSE;

		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
		icw = CWFind(code);

		// Previous code; it passed the CFind check as `ic` one step earlier.
		codep = pcode[i-1];
		icp = CFind(codep);		// position of the previous code in CT
		nwp = ((maxcodelen == 4) ? CT.GetAt(icp).wnum : CT.GetAt(icp).wnum2);
		icwp = CWFind(codep);

		for (iw = 0; iw < nw; iw++)
			sumaiwp[iw] = 0;

		kp = icwp;
		for (iwp = 0; iwp < nwp; iwp++)
		{
			wordp = CWT.GetNext(kp).word;
			tpsum = TPSum(wordp, transwdnum);
			ASSERT(tpsum < 1.0);
			sumaiw[iwp] = 0;

			k = icw;
			for (iw = 0; iw < nw; iw++)
			{
				word = CWT.GetNext(k).word;
				tp = GetTransPorb(wordp, word, tpsum, transwdnum);
				// NOTE(review): Sleep(0) looks like a breakpoint anchor for tp <= 0 -- confirm
				if (tp <= 0)
					Sleep(0);

				a[i][iw] = b[i][iw] * tp;		// scratch value, overwritten below
				sumaiw[iwp] += a[i][iw];
			}

			for (iw = 0; iw < nw; iw++)
			{
				// original note: the sum is always > 0 as long as tp is always > 0
				sumaiwp[iw] += a[i][iw]/sumaiw[iwp] * a[i-1][iwp];
			}
		}

		// Publish the accumulated values as the final a[i][*].
		for (iw = 0; iw < nw; iw++)
		{
			a[i][iw] = sumaiwp[iw];
		}
	}

    return TRUE;
}

// MPM estimation: for each code position i, picks the candidate word with the
// largest a[i][m] and stores its index (within that code's candidate list)
// in c[i].
//
// c[i] is -1 when CFind does not locate the code or when no candidate is
// accepted. A candidate that beats the current maximum is accepted when
// charset == 1, or when charset == 0 and IsGB2312(word) is TRUE. A candidate
// that exactly ties the current maximum replaces the current choice if
// CWIsBefore says its CWT entry is followed by the current choice's entry.
//
// Always returns TRUE.
static BOOL MpmEstm(unsigned char** pcode, double** a, int* c, int numc)
{
	POSITION ic;
	int nw;
	int i;
	double maxp;
	CString word;

	for (i = 0; i < numc; i++)
	{
		ic = CFind(pcode[i]);		// position of this code in CT
		maxp = -0.001;
		c[i] = -1;					// "nothing chosen" until a candidate is accepted

		// POSITION is pointer-like, so the failure value is NULL. The former
		// test `ic >= 0` was always true and let a null position reach GetAt.
		if (ic == NULL)
			continue;

		// number of candidate words sharing this code
		nw = ((maxcodelen == 4) ? CT.GetAt(ic).wnum : CT.GetAt(ic).wnum2);
		POSITION iw = CWFind(pcode[i]);

		for (int m = 0; m < nw; m++)
		{
			word = CWT.GetNext(iw).word;

			if (a[i][m] > maxp)
			{
				// parentheses only make the existing precedence explicit
				if (1 == charset || (0 == charset && IsGB2312(word)))
				{
					maxp = a[i][m];
					c[i] = m;
				}
			}
			else if (a[i][m] == maxp)	// exact tie with the current best
			{
				POSITION iw1 = CWFind(pcode[i], m);
				POSITION iw2 = CWFind(pcode[i], c[i]);
				if (CWIsBefore(iw1, iw2, nw))
					c[i] = m;
			}
		}
	}

	return TRUE;
}

// Converts the codes pcode[0..numc-1] into words pword[0..numc-1].
//
// Steps: CheckCodes rewrites unknown codes, Backward and Foreward fill the
// file-level tables `b` and `a`, and MpmEstm selects one candidate index per
// code. Each selected index is then resolved through CWFind and the word is
// copied into pword[i]. When no candidate was selected (index < 0), pword[i]
// receives a string of byte value 15 as long as the code.
//
// Returns FALSE if numc exceeds SENTLEN or if any stage fails, else TRUE.
static BOOL Code2Word(unsigned char** pcode, int numc, unsigned char** pword)
{
	int i;
	int c[SENTLEN];		// chosen candidate index for each code

	// MpmEstm writes c[0..numc-1]; refuse inputs that would overrun c[].
	if (numc > SENTLEN)
		return FALSE;

	CheckCodes(pcode, numc);

	if (!Backward(pcode, (double**)b, numc))
		return FALSE;

	if (!Foreward(pcode, (double**)a, (double**)b, numc))
		return FALSE;

	if (!MpmEstm(pcode, (double**)a, c, numc))
		return FALSE;

	// turn each chosen candidate index into its word
	for (i = 0; i < numc; i++)
	{
		if (c[i] >= 0)
		{
			POSITION icw = CWFind(pcode[i], c[i]);
			strcpy((char*)pword[i], (const char*)CWT.GetAt(icw).word);
		}
		else
		{
			// no candidate: emit placeholder bytes (value 15), one per code character
			strcpy((char*)pword[i], CString(15, strlen((char*)pcode[i])));
		}
	}

	return TRUE;
}

//BOOL WordID2Word(int* pwordid, int numw, unsigned char* pword, int& pwlen)
//{
//	pwlen = 0;
//	unsigned char word[9];
//	memset(word, 0, 9);
//
//	for (int i = 0; i < numw; i++)
//	{
//		FindWord(pwordid[i], word);
//
//		for (int j = 0; word[j] != 0; j++)
//			pword[pwlen++] = word[j];
//	}
//
//	return TRUE;
//}

// Allocates the 256 rows of each file-level table:
//   pcode[i] : CODELEN bytes,   pword[i] : WORDLEN bytes,
//   b[i], a[i] : DUPWORD doubles each.
// The new buffers are left uninitialised. Always returns TRUE.
static BOOL CreateData()
{
	const int kRows = 256;
	int row;

	// code and word buffers first
	row = 0;
	while (row < kRows)
	{
		pcode[row] = new unsigned char[CODELEN];
		pword[row] = new unsigned char[WORDLEN];
		++row;
	}

	// then the two probability tables
	row = 0;
	while (row < kRows)
	{
		b[row] = new double[DUPWORD];
		a[row] = new double[DUPWORD];
		++row;
	}

	return TRUE;
}

// Frees the 256 rows of the file-level tables pcode, pword, b and a.
//
// Each pointer is reset to NULL after being freed, so calling this function
// again (or freeing a row elsewhere) is harmless: delete[] on a null pointer
// does nothing. Always returns TRUE.
static BOOL DeleteData()
{
	int i;

	for (i = 0; i < 256; i++)
	{
		delete [] pcode[i];
		pcode[i] = NULL;
		delete [] pword[i];
		pword[i] = NULL;
	}

	for (i = 0; i < 256; i++)
	{
		delete [] b[i];
		b[i] = NULL;
		delete [] a[i];
		a[i] = NULL;
	}

	return TRUE;
}

// Reads one int from the byte stream at `lpf` into `val`, advances `lpf`
// past it and returns the number of bytes consumed (sizeof(int), i.e. 4 on
// the Win32 targets this file is written for).
//
// The bytes are copied with memcpy rather than read through a cast pointer:
// the stream interleaves variable-length strings with numbers, so `lpf` is
// not guaranteed to be suitably aligned for an int.
static int ReadInt(LPBYTE& lpf, int& val)
{
	memcpy(&val, lpf, sizeof(val));
	lpf += sizeof(val);

	return (int)sizeof(val);
}

// Reads one float from the byte stream at `lpf` into `val`, advances `lpf`
// past it and returns the number of bytes consumed (sizeof(float), i.e. 4).
//
// The bytes are copied with memcpy rather than read through a cast pointer:
// the stream interleaves variable-length strings with numbers, so `lpf` is
// not guaranteed to be suitably aligned for a float.
static int ReadFloat(LPBYTE& lpf, float& val)
{
	memcpy(&val, lpf, sizeof(val));
	lpf += sizeof(val);

	return (int)sizeof(val);
}

// Copies up to `len` bytes from the stream at `lpf` into `str` using strncpy
// (copying stops at a NUL in the source and the remainder is zero-filled; no
// terminator is appended when `len` bytes are copied). Advances `lpf` by
// `len` and returns `len`. The size of `str` is not checked here.
static int ReadStr(LPBYTE& lpf, int len, unsigned char* str)
{
	char* dst = (char*)str;
	char* src = (char*)lpf;

	strncpy(dst, src, len);
	lpf = lpf + len;

	return len;
}

// Stores `val` into the byte stream at `lpf`, advances `lpf` past it and
// returns the number of bytes written (sizeof(int), i.e. 4 on the Win32
// targets this file is written for).
//
// memcpy is used instead of a store through a cast pointer because `lpf` is
// a byte cursor with no alignment guarantee for int.
static int WriteInt(LPBYTE& lpf, int val)
{
	memcpy(lpf, &val, sizeof(val));
	lpf += sizeof(val);

	return (int)sizeof(val);
}

// Stores `val` into the byte stream at `lpf`, advances `lpf` past it and
// returns the number of bytes written (sizeof(float), i.e. 4).
//
// memcpy is used instead of a store through a cast pointer because `lpf` is
// a byte cursor with no alignment guarantee for float.
static int WriteFloat(LPBYTE& lpf, float val)
{
	memcpy(lpf, &val, sizeof(val));
	lpf += sizeof(val);

	return (int)sizeof(val);
}

// Copies up to `len` bytes of `str` into the stream at `lpf` using strncpy
// (if `str` is shorter than `len`, the remaining bytes are zero-filled).
// Advances `lpf` by `len` and returns `len`.
static int WriteStr(LPBYTE& lpf, char* str, int len)
{
	char* dst = (char*)lpf;

	strncpy(dst, str, len);
	lpf = lpf + len;

	return len;
}

//read mb file to arrays or lists, as well as generating indexs
static BOOL InitMB()
{
	CFile filec;
	CFile filew;
	CFile filecw;
	CFile filewc;
	CFile fileww;
	CFile filepy;
	//CFileException e;
	CString csCap, csDir;
	AfxGetMainWnd()->GetWindowText(csCap);

	GetModuleFileName(AfxGetInstanceHandle(), csDir.GetBuffer(MAX_PATH), MAX_PATH);
	csDir.ReleaseBuffer();
	csDir = csDir.Left(csDir.ReverseFind('\\'));
	SetCurrentDirectory(csDir);

	if (!filec.Open("..\\mb\\code.dat", CFile::modeRead) ||
		!filew.Open("..\\mb\\word.dat", CFile::modeRead) ||
		!filecw.Open("..\\mb\\cwmap.dat", CFile::modeRead) ||
		!filewc.Open("..\\mb\\wcmap.dat", CFile::modeRead) || 
		!fileww.Open("..\\mb\\wwmap.dat", CFile::modeRead) ||
		!filepy.Open("..\\mb\\pinyin.dat", CFile::modeRead) )
		MessageBox(NULL, "打开码表文件失败!", csCap, MB_OK|MB_ICONSTOP);

	unsigned char strc[CODELEN], strw[WORDLEN], strw1[WORDLEN];
	int lenc, lenw, nw, nw2, lenw1;
	float cp, cp2, tp, pp;				//!must be float type.
	int i;
	POSITION pos;
	unsigned int id;
	int len, filelen;
	unsigned char last_strc[CODELEN], last_strw[WORDLEN];

	struct _timeb time0, time1;
	time_t elptime;
	_ftime(&time0);

	HANDLE hfilec = CreateFileMapping((HANDLE)filec.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
	HANDLE hfilew = CreateFileMapping((HANDLE)filew.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
	HANDLE hfilecw = CreateFileMapping((HANDLE)filecw.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
	HANDLE hfilewc = CreateFileMapping((HANDLE)filewc.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
	HANDLE hfileww = CreateFileMapping((HANDLE)fileww.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);
	HANDLE hfilepy = CreateFileMapping((HANDLE)filepy.m_hFile, NULL, PAGE_READONLY, 0, 0, 0);

	LPBYTE lpfc = (LPBYTE)MapViewOfFile(hfilec, FILE_MAP_READ, 0, 0, 0);
	LPBYTE lpfw = (LPBYTE)MapViewOfFile(hfilew, FILE_MAP_READ, 0, 0, 0);
	LPBYTE lpfcw = (LPBYTE)MapViewOfFile(hfilecw, FILE_MAP_READ, 0, 0, 0);
	LPBYTE lpfwc = (LPBYTE)MapViewOfFile(hfilewc, FILE_MAP_READ, 0, 0, 0);
	LPBYTE lpfww = (LPBYTE)MapViewOfFile(hfileww, FILE_MAP_READ, 0, 0, 0);
	LPBYTE lpfpy = (LPBYTE)MapViewOfFile(hfilepy, FILE_MAP_READ, 0, 0, 0);

	//allocate memory for arrays and index table
	CIdx.SetSize( 0x100000, 1000);
	WIdx.SetSize( 0x10000, 1000);
	CWIdx.SetSize(0x100000, 1000);
	CPIdx.SetSize(0x10000, 1000);
	TPIdx.SetSize(0x10000, 1000);
	PYIdx.SetSize(0x100000, 1000);

	//initialize all items in index tables with -1
	for (i = 0; i < 0x100000; i++)
	{
		CIdx[i] = 0;
		CWIdx[i] = 0;
	}

	for (i = 0; i < 0x10000; i++)
	{
		WIdx[i] = 0;
		CPIdx[i] = 0;
		TPIdx[i] = 0;
	}

	//read code file
	filelen = filec.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strc, 0, CODELEN) ;

		len += ReadInt(lpfc, lenc);
		if (lenc == 0) 
			break;
		len += ReadStr(lpfc, lenc, strc);
		len += ReadInt(lpfc, nw);				//read number of word for this code
		len += ReadInt(lpfc, nw2);				//read number of word for this code

		pos = CT.AddTail(CTItem(strc, nw, nw2));

		//set code index
		id = GetCodeId((char*)strc);
		//if (CIdx[id] == 0)
			CIdx[id] = pos;
	}

	//read word file
	sumwordfre = 0.0;
	//wordnum = 0;
	filelen = filew.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strw, 0, WORDLEN) ;

		len += ReadInt(lpfw, lenw);
		if (lenw == 0) 
			break;
		len += ReadStr(lpfw, lenw, strw);
		len += ReadFloat(lpfw, pp);			//read word prior-probabilities

		sumwordfre += pp;

		pos = WT.AddTail(WTItem(strw, pp));

		//set word index
		if (lenw <= 2)
		{
			id = GetWordId((char*)strw);
			//if (WIdx[id] == 0)
				WIdx[id] = pos;
		}
	}

	//read cwmap file
	filelen = filecw.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strc, 0, CODELEN) ;
		memset(strw, 0, WORDLEN) ;

		len += ReadInt(lpfcw, lenc);
		if (lenc == 0) 
			break;

		len += ReadStr(lpfcw, lenc, strc);
		len += ReadInt(lpfcw, lenw);
		len += ReadStr(lpfcw, lenw, strw);

		pos = CWT.AddTail(CWTItem(strc, strw));

		//set cwmap index
		if (strcmp((char*)strc, (char*)last_strc) != 0)
		{
			id = GetCodeId((char*)strc);
			//if (CWIdx[id] == 0)
				CWIdx[id] = pos;
		}

		strcpy((char*)last_strc, (char*)strc);
	}

	//read wcmap file
	filelen = filewc.GetLength();
	for (len = 0; len < filelen; )
	{
		memset(strc, 0, CODELEN) ;
		memset(strw, 0, WORDLEN) ;

		len += ReadInt(lpfwc, lenw);
		if (lenw == 0) 
			break;
		len += ReadStr(lpfwc, lenw, strw);
		len += ReadInt(lpfwc, lenc);
		len += ReadStr(lpfwc, lenc, strc);

		len += ReadFloat(lpfwc, cp);	//read conditional probabilities
		len += ReadFloat(lpfwc, cp2);	//read conditional probabilities

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -