📄 pattree.c

📁 Symbian平台数字键盘手机输入法源码
💻 C
📖 第 1 页 / 共 5 页
字号:
	pDic = (STATIC_DIC_T*)pDicBuffer;

	if ( pDic ) {
		if ( pDic->nIs_Data_AllocBuffer ) {
			free(pDic->abData);
			pDic->nIs_Data_AllocBuffer = 0;
		}
		if ( pDic->nIs_Data_User_AllocBuffer ) {
			free(pDic->abData_User);
			pDic->nIs_Data_User_AllocBuffer = 0;
		}
		free(pDic);
	}
}

#ifdef _WINDOWS
#ifdef _WIN32

/*
 * PinyinCode2Digi - spell a sequence of pinyin codes as keypad digits.
 *
 * pwPinyinCode  nWordLen indices into pDic->ppPinyinList.
 * pDic          dictionary that owns the pinyin spelling table.
 * pszDigi       output buffer; receives one ALPHABET2DIGI entry per emitted
 *               letter followed by a terminating NUL. The buffer size is not
 *               passed in, so the caller must provide enough room.
 *
 * A letter followed by ':' (the "u:" spelling, presumably the u-umlaut vowel)
 * is emitted as the letter itself when it follows 'j', 'q' or 'x', and as 'v'
 * otherwise; the ':' is consumed and never emitted on its own.
 *
 * NOTE(review): table indices are computed as letter - 'a', so entries are
 * assumed to contain only 'a'..'z' and ':' - confirm against the dictionary.
 */
static void PinyinCode2Digi( unsigned short *pwPinyinCode, int nWordLen, DIC_T *pDic, char *pszDigi )
{
	int i, nLen;
	const char *pszStart;
	const char *pszCur;
	char chPrev;

	pszDigi[0] = 0;

	nLen = 0;
	for ( i = 0; i < nWordLen; i ++ ) {
		pszStart = pDic->ppPinyinList[pwPinyinCode[i]];
		pszCur = pszStart;
		while ( *pszCur ) {
			if ( pszCur[1] == ':' ) {
				/* Only look at the preceding letter when there is one;
				   reading pszCur[-1] at the start of the entry would be
				   out of bounds. */
				chPrev = ( pszCur > pszStart ) ? pszCur[-1] : '\0';
				/* The third test used to read pszCur[0] == 'x', which can
				   never be true for the letter in front of ':'. */
				if ( chPrev == 'j' || chPrev == 'q' || chPrev == 'x' )
					pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
				else
					pszDigi[nLen++] = ALPHABET2DIGI['v' - 'a'];
				pszCur += 2;
			}
			else {
				pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
				pszCur ++;
			}
		}
	}
	pszDigi[nLen] = 0;
}

/*
 * SearchPinyinCode - binary-search pDic->ppPinyinList for an exact spelling.
 *
 * pDic       dictionary; ppPinyinList must hold nPinyinNum strings in
 *            ascending strncmp order for the search to be meaningful.
 * pszPinyin  key text; only the first nLen characters are compared, so it
 *            does not need to be NUL-terminated at nLen.
 * nLen       number of key characters.
 *
 * Returns the index of the entry whose text equals the key and whose length
 * is exactly nLen, or 0x7fff when there is no such entry.
 */
static unsigned short SearchPinyinCode( DIC_T *pDic, char *pszPinyin, int nLen )
{
	/* Signed bounds: with unsigned short bounds, "nMid - 1" at nMid == 0 (or an
	   empty list) wrapped to 65535 and the loop kept probing far outside
	   the table. */
	int nMin, nMax, nMid;
	int nResult;

	nMin = 0;
	nMax = (int)pDic->nPinyinNum - 1;

	while ( nMin <= nMax ) {
		nMid = nMin + (nMax - nMin)/2;
		nResult = strncmp(pDic->ppPinyinList[nMid], pszPinyin, (size_t)nLen);
		if ( nResult == 0 ) {
			if ( strlen(pDic->ppPinyinList[nMid]) == (size_t)nLen )
				return (unsigned short)nMid;
			/* The entry only shares the key as a prefix, so it is longer
			   and sorts after the key: continue in the lower half. */
			nMax = nMid - 1;
		}
		else if ( nResult > 0 )
			nMax = nMid - 1;
		else
			nMin = nMid + 1;
	}

	return 0x7fff;
}

/* Input-method selector handed to DIGI2CODE by CompareWord_ByDigiProb; starts as PATTREE_CHS_PINYIN. */
static short g_IMMethod = PATTREE_CHS_PINYIN;
static int CompareWord_ByDigiProb(const void *pVocab1, const void *pVocab2 ) 
{
	SORTABLE_LEXICON_ENTRY_T *pW1, *pW2;
	char *str1, *str2;
	char code1, code2;

	pW1 = (SORTABLE_LEXICON_ENTRY_T *)pVocab1;
	pW2 = (SORTABLE_LEXICON_ENTRY_T *)pVocab2;
	str1 = pW1->szDigiString;
	str2 = pW2->szDigiString;
	while ( *str1 != '\0' && *str2 != '\0' ) {
		DIGI2CODE(*str1, code1, g_IMMethod);
		DIGI2CODE(*str2, code2, g_IMMethod);
		if ( code1 > code2 )
			return 1;
		else if ( code1 < code2 )
			return -1;
		str1++;
		str2++;
	}
	if ( *str1 == '\0' && *str2 != '\0'	)
		return -1;
	else if ( *str1 != '\0' && *str2 == '\0'	)
		return 1;
	else {
		return (int)pW2->LogProb - (int)pW1->LogProb;
	}
}

static int CompareWord(const void *pLexicon1, const void *pLexicon2) 
{
	LEXICON_ENTRY_T *pL1, *pL2;
	wchar_t *str1, *str2;

	pL1 = (LEXICON_ENTRY_T *)pLexicon1;
	pL2 = (LEXICON_ENTRY_T *)pLexicon2;
	str1 = pL1->pwszWord;
	str2 = pL2->pwszWord;
	while ( *str1 != L'\0' && *str2 != L'\0' ) {
		if ( *str1 > *str2 )
			return 1;
		else if ( *str1 < *str2 )
			return -1;
		str1++;
		str2++;
	}
	if ( *str1 == L'\0' && *str2 != L'\0'	)
		return -1;
	else if ( *str1 != L'\0' && *str2 == L'\0'	)
		return 1;
	else {
		return 0;
	}
}

/*
 * FillPinyin - parse the pronunciation field of a lexicon line into pinyin codes.
 *
 * pLexicon  entry to fill: pwPinyinCode is (re)allocated here and cPinyinNum
 *           is set to the number of distinct readings stored.
 * pDic      dictionary used by SearchPinyinCode to turn syllables into codes.
 * szPinyin  one or more readings separated by '|' and ended by '\r', '\n' or
 *           NUL. The text is modified in place: the character that ends each
 *           reading is overwritten with NUL.
 * nWordLen  number of codes every reading must produce.
 *
 * Inside a reading a syllable is a run of 'a'..'z' / ':' characters; any
 * other single character separates syllables. A syllable of two or more
 * letters that ends in 'r' and is not exactly "er" is stored as its stem
 * followed by a separate "r" code. Readings shorter than two characters are
 * ignored, and a reading identical to one already stored is not stored twice.
 *
 * Returns 1 on success. Returns 0 when nWordLen is not positive, the buffer
 * cannot be allocated, a syllable is empty or unknown (code >= 512), a
 * reading does not yield exactly nWordLen codes, a reading has more than 16
 * codes, or there are more than 8 distinct readings.
 *
 * NOTE(review): on a 0 return after allocation the buffer stays attached to
 * pLexicon->pwPinyinCode (as before) - presumably the caller frees it; confirm.
 */
static int FillPinyin( LEXICON_ENTRY_T *pLexicon, DIC_T *pDic, char *szPinyin, int nWordLen )
{
	enum {
		MAX_SYLLABLES = 16,       /* capacity of the per-reading scratch array */
		MAX_READINGS = 8,         /* readings the allocated buffer can hold */
		PINYIN_CODE_LIMIT = 512   /* codes at or above this are rejected */
	};
	char *pszLast;
	char *pszCur;
	unsigned short wCode[MAX_SYLLABLES];
	int i, nNum;
	int nIsBreak, nIsSame;

	pszCur = szPinyin;
	pLexicon->cPinyinNum = 0;
	pLexicon->pwPinyinCode = NULL;
	if ( nWordLen <= 0 )
		return 0;
	pLexicon->pwPinyinCode = (unsigned short*)malloc( (size_t)MAX_READINGS*(size_t)nWordLen*sizeof(unsigned short) );
	if ( pLexicon->pwPinyinCode == NULL )
		return 0;

	while ( *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' ) {
		pszLast = pszCur;
		while ( *pszCur != '|' && *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' )
			pszCur ++;
		if ( *pszCur == 0 )
			nIsBreak = 1;
		else {
			*pszCur = 0;
			nIsBreak = 0;
		}
		if ( pszCur-pszLast >= 2 ) { /* each distinct word pinyin */
			char *pCurSec, *pLastSec;
			int nSecLen;

			pCurSec = pszLast;
			nNum = 0;
			while ( *pCurSec ) { /* each syllable in the word pinyin */
				pLastSec = pCurSec;
				while ( (*pCurSec >= 'a' && *pCurSec <= 'z') || *pCurSec == ':' ) {
					pCurSec ++;
				}
				nSecLen = (int)(pCurSec - pLastSec);
				/* Skip the separator, but never step over the NUL that
				   ends this reading: what follows belongs to the next
				   reading or lies outside the buffer. */
				if ( *pCurSec )
					pCurSec ++;

				if ( nSecLen <= 0 )
					return 0;

				if ( nSecLen > 1 && pLastSec[nSecLen-1] == 'r' && strncmp(pLastSec, "er", nSecLen) != 0 ) {
					/* Stem + "r" needs two slots in wCode. */
					if ( nNum + 2 > MAX_SYLLABLES )
						return 0;
					wCode[nNum] = SearchPinyinCode(pDic, pLastSec, nSecLen-1);
					if ( wCode[nNum] >= PINYIN_CODE_LIMIT )
						return 0;
					wCode[nNum+1] = SearchPinyinCode(pDic, "r", 1);
					if ( wCode[nNum+1] >= PINYIN_CODE_LIMIT )
						return 0;
					nNum += 2;
				}
				else {
					if ( nNum + 1 > MAX_SYLLABLES )
						return 0;
					wCode[nNum] = SearchPinyinCode(pDic, pLastSec, nSecLen);
					if ( wCode[nNum] >= PINYIN_CODE_LIMIT )
						return 0;
					nNum ++;
				}
			}
			if ( nNum != nWordLen )
				return 0;

			/* Store the reading only if it is not already present. */
			nIsSame = 0;
			for ( i = 0; i < pLexicon->cPinyinNum; i ++ ) {
				if ( memcmp(pLexicon->pwPinyinCode + i*nWordLen, wCode, nWordLen*sizeof(unsigned short)) == 0 ) {
					nIsSame = 1;
					break;
				}
			}
			if ( !nIsSame ) {
				/* The buffer was sized for MAX_READINGS readings. */
				if ( pLexicon->cPinyinNum >= MAX_READINGS )
					return 0;
				memcpy(pLexicon->pwPinyinCode + pLexicon->cPinyinNum*nWordLen, wCode, nWordLen*sizeof(unsigned short));
				pLexicon->cPinyinNum ++;
			}
		}

		if ( nIsBreak )
			break;
		pszCur ++;
	}

	return 1;
}

/*
 * FillPOS_TagCode - map one part-of-speech tag to its numeric code.
 *
 * A one-character tag maps to 0..25 by letter, ignoring case, or to -2 when
 * it is not a letter. Any other tag is classified by its first character:
 * A,B,D,E,M,N,Q,R,T,U,V,Y,Z give 26..38 (the "Ag", "Bg", ... "Zg" family),
 * everything else gives -1. Negative results mean "not recognised".
 */
static int FillPOS_TagCode( const char *pszTag )
{
	if ( strlen(pszTag) == 1 ) {
		if ( *pszTag >= 'A' && *pszTag <= 'Z' )
			return *pszTag - 'A';
		if ( *pszTag >= 'a' && *pszTag <= 'z' )
			return *pszTag - 'a';
		return -2;
	}

	switch ( *pszTag ) {
	case 'A': return 26;
	case 'B': return 27;
	case 'D': return 28;
	case 'E': return 29;
	case 'M': return 30;
	case 'N': return 31;
	case 'Q': return 32;
	case 'R': return 33;
	case 'T': return 34;
	case 'U': return 35;
	case 'V': return 36;
	case 'Y': return 37;
	case 'Z': return 38;
	default:  return -1;
	}
}

/*
 * FillPOS - parse the part-of-speech field of a lexicon line.
 *
 * pLexicon  entry to fill: acPOS[] / anPOSLogProb[] receive one pair per
 *           recognised tag and cPOSNum is set to the number of pairs.
 * szPOS     "TAG-number" items separated by '|' and ended by '\r', '\n' or
 *           NUL. The text is modified in place: item terminators and the '-'
 *           inside each item are overwritten with NUL.
 *
 * Items shorter than two characters and items whose tag is not recognised
 * are skipped. An item without '-' is stored with a value of 0.
 *
 * Always returns 1.
 *
 * NOTE(review): the capacity of acPOS / anPOSLogProb is not visible here, so
 * the number of stored items is not bounded - confirm against the struct.
 */
static int FillPOS( LEXICON_ENTRY_T *pLexicon, char *szPOS )
{
	char *pszLast;
	char *pszCur;
	int nIsBreak;

	pszCur = szPOS;
	pLexicon->cPOSNum = 0;
	while ( *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' ) {
		pszLast = pszCur;
		while ( *pszCur != '|' && *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' )
			pszCur ++;
		if ( *pszCur == 0 )
			nIsBreak = 1;
		else {
			*pszCur = 0;
			nIsBreak = 0;
		}
		if ( pszCur-pszLast >= 2 ) { /* each distinct word pos */
			char *pPos;
			int nHasValue;
			int nCode;

			pPos = pszLast;
			while ( *pPos && *pPos != '-' ) 
				pPos ++;
			/* Without a '-' pPos sits on the item's terminator and
			   pPos+1 is no longer part of this item. */
			nHasValue = ( *pPos == '-' );
			*pPos = 0;

			/* Decide on a plain int so the validity test does not depend
			   on the signedness of the acPOS element type. */
			nCode = FillPOS_TagCode(pszLast);
			if ( nCode >= 0 ) {
				pLexicon->acPOS[pLexicon->cPOSNum] = nCode;
				pLexicon->anPOSLogProb[pLexicon->cPOSNum] = nHasValue ? atoi(pPos+1) : 0;
				pLexicon->cPOSNum ++;
			}
		}

		if ( nIsBreak )
			break;
		pszCur ++;
	}

	return 1;
}

static int ReadPOSBigram( DIC_T *pDic, long *pnPOSBigram, long *pnPOSUnigram, const TCHAR* strTextFilename )
{
	FILE *fpText;
	char strTemp[2048];
	char *pszCur;
	char *pszLast;
	int nNumLine;
	int nNum;
	int nZerotonNum;
	long *pnCurPOSBigram;
	double d, dT, dMin, dMax, dTotal;

	/* open file operation */
	fpText = _tfopen(strTextFilename, _T("rt"));
	if ( !fpText ) {
		fprintf( stderr, "can not open file %s\n", strTextFilename );
		return 0;
	}
		
	/* read bigram*/
	nNumLine = 0;
	pnCurPOSBigram = pnPOSBigram;
	dMin = 1e99;
	dMax = -1e99;
	dTotal = 0;
	while (fgets (strTemp, 2048, fpText)) {
		pszCur = strTemp;
		nNum = 0;
		while ( *pszCur != '\0' && *pszCur != '\r' && *pszCur != '\n' ) {
			while ( *pszCur != '\0' && (*pszCur == ' ' || *pszCur == '\t') )
				pszCur ++;
			pszLast = pszCur;
			while ( *pszCur != '\0' && *pszCur != ' '&& *pszCur != '\t' && *pszCur != '\r' && *pszCur != '\n' )
				pszCur ++;
			if ( pszCur - pszLast >= 1 ) {
				if ( *pszLast >= '0' && *pszLast <= '9' ) {
					pnCurPOSBigram[nNum] = atoi(pszLast);
					nNum ++;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -