📄 pattree.c
字号:
pDic = (STATIC_DIC_T*)pDicBuffer;
if ( pDic ) {
if ( pDic->nIs_Data_AllocBuffer ) {
free(pDic->abData);
pDic->nIs_Data_AllocBuffer = 0;
}
if ( pDic->nIs_Data_User_AllocBuffer ) {
free(pDic->abData_User);
pDic->nIs_Data_User_AllocBuffer = 0;
}
free(pDic);
}
}
#ifdef _WINDOWS
#ifdef _WIN32
/*
 * Build the digit string for a word from its pinyin codes.
 *
 * pwPinyinCode  nWordLen indices into pDic->ppPinyinList.
 * nWordLen      number of codes in pwPinyinCode.
 * pDic          dictionary supplying the pinyin spellings (ppPinyinList).
 * pszDigi       output buffer; receives one character per pinyin letter plus
 *               a terminating NUL. No size is passed in, so the caller must
 *               supply a buffer that is large enough.
 *
 * Each letter is translated through the ALPHABET2DIGI table. A letter that is
 * followed by ':' (presumably the "u:" spelling of u-umlaut) consumes both
 * characters: it is emitted as the letter itself when the preceding letter is
 * 'j', 'q' or 'x', and as 'v' otherwise.
 */
static void PinyinCode2Digi( unsigned short *pwPinyinCode, int nWordLen, DIC_T *pDic, char *pszDigi )
{
    int i;
    int nLen = 0;

    pszDigi[0] = 0;
    for ( i = 0; i < nWordLen; i++ ) {
        char *pszStart = pDic->ppPinyinList[pwPinyinCode[i]];
        char *pszCur = pszStart;

        while ( *pszCur ) {
            if ( pszCur[1] == ':' ) {
                /* Only look at the previous letter when there is one; the
                 * original code read pszCur[-1] unconditionally. */
                char cPrev = ( pszCur > pszStart ) ? pszCur[-1] : '\0';

                /* All three tests apply to the PRECEDING letter. The original
                 * compared pszCur[0] with 'x', which can never match because
                 * pszCur[0] is the letter in front of the ':'. */
                if ( cPrev == 'j' || cPrev == 'q' || cPrev == 'x' )
                    pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
                else
                    pszDigi[nLen++] = ALPHABET2DIGI['v' - 'a'];
                pszCur += 2;
            }
            else {
                pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
                pszCur++;
            }
        }
    }
    pszDigi[nLen] = 0;
}
/*
 * Binary-search pDic->ppPinyinList for the spelling given by the first nLen
 * characters of pszPinyin (pszPinyin need not be NUL-terminated at nLen).
 *
 * The list must be sorted in strcmp order for the search to be valid.
 *
 * Returns the index of the exactly matching entry, or 0x7fff when there is
 * no match.
 */
static unsigned short SearchPinyinCode( DIC_T *pDic, char *pszPinyin, int nLen )
{
    /* Signed bounds: with unsigned short bounds, "mid - 1" at mid == 0 wrapped
     * to 65535, the loop kept running and indexed far outside the list. The
     * same wrap happened for an empty list (nPinyinNum == 0). */
    int nLow = 0;
    int nHigh = (int)pDic->nPinyinNum - 1;

    while ( nLow <= nHigh ) {
        int nMid = nLow + ( nHigh - nLow ) / 2;
        const char *pszEntry = pDic->ppPinyinList[nMid];
        int nResult = strncmp( pszEntry, pszPinyin, (size_t)nLen );

        if ( nResult == 0 && strlen( pszEntry ) == (size_t)nLen )
            return (unsigned short)nMid;

        /* A prefix match with a longer entry means the entry sorts after the
         * key, so it is handled like nResult > 0. */
        if ( nResult >= 0 )
            nHigh = nMid - 1;
        else
            nLow = nMid + 1;
    }
    return 0x7fff;
}
/* Input-method selector passed to DIGI2CODE by CompareWord_ByDigiProb;
 * initialised to PATTREE_CHS_PINYIN. */
static short g_IMMethod = PATTREE_CHS_PINYIN;
static int CompareWord_ByDigiProb(const void *pVocab1, const void *pVocab2 )
{
SORTABLE_LEXICON_ENTRY_T *pW1, *pW2;
char *str1, *str2;
char code1, code2;
pW1 = (SORTABLE_LEXICON_ENTRY_T *)pVocab1;
pW2 = (SORTABLE_LEXICON_ENTRY_T *)pVocab2;
str1 = pW1->szDigiString;
str2 = pW2->szDigiString;
while ( *str1 != '\0' && *str2 != '\0' ) {
DIGI2CODE(*str1, code1, g_IMMethod);
DIGI2CODE(*str2, code2, g_IMMethod);
if ( code1 > code2 )
return 1;
else if ( code1 < code2 )
return -1;
str1++;
str2++;
}
if ( *str1 == '\0' && *str2 != '\0' )
return -1;
else if ( *str1 != '\0' && *str2 == '\0' )
return 1;
else {
return (int)pW2->LogProb - (int)pW1->LogProb;
}
}
/*
 * Comparator (qsort-compatible signature) over LEXICON_ENTRY_T that orders
 * entries by their pwszWord, comparing wchar_t values element by element.
 *
 * Returns -1, 0 or 1. A word that is a proper prefix of the other compares
 * as smaller.
 */
static int CompareWord(const void *pLexicon1, const void *pLexicon2)
{
    wchar_t *pwszLeft = ((LEXICON_ENTRY_T *)pLexicon1)->pwszWord;
    wchar_t *pwszRight = ((LEXICON_ENTRY_T *)pLexicon2)->pwszWord;

    for ( ;; pwszLeft++, pwszRight++ ) {
        if ( *pwszLeft == L'\0' || *pwszRight == L'\0' )
            break;
        if ( *pwszLeft != *pwszRight )
            return ( *pwszLeft > *pwszRight ) ? 1 : -1;
    }

    /* One or both words ended. */
    if ( *pwszLeft == L'\0' )
        return ( *pwszRight == L'\0' ) ? 0 : -1;
    return 1;
}
/*
 * Parse the pinyin field of a lexicon line and store the distinct readings
 * in pLexicon.
 *
 * pLexicon  entry to fill: cPinyinNum is reset to 0 and pwPinyinCode is
 *           replaced by a freshly malloc'ed buffer with room for
 *           8 readings of nWordLen codes each. This function never frees
 *           that buffer, not even when it returns 0.
 * pDic      dictionary used by SearchPinyinCode to map spellings to codes.
 * szPinyin  text to parse; MODIFIED in place (separators are overwritten
 *           with NUL). Readings are separated by '|' and the field ends at
 *           CR, LF or NUL. Inside a reading, each syllable is a run of
 *           'a'..'z' / ':' followed by one separator character (presumably
 *           a tone digit - confirm against the lexicon format).
 * nWordLen  number of codes every reading must produce.
 *
 * A syllable longer than one letter that ends in 'r' and is not "er" is
 * split into the syllable without the 'r' plus a separate "r" code.
 * Readings shorter than two characters are skipped; duplicate readings are
 * stored once; readings beyond the buffer capacity of 8 are dropped.
 *
 * Returns 1 on success, 0 when allocation fails, a syllable is empty or
 * unknown, or a reading does not yield exactly nWordLen codes.
 */
static int FillPinyin( LEXICON_ENTRY_T *pLexicon, DIC_T *pDic, char *szPinyin, int nWordLen )
{
    enum {
        FP_MAX_CODES    = 16,   /* capacity of the per-reading scratch array */
        FP_MAX_READINGS = 8,    /* readings that fit in the allocated buffer */
        FP_CODE_LIMIT   = 512   /* lookup results at or above this are rejected */
    };
    unsigned short wCode[FP_MAX_CODES];
    char *pszCur = szPinyin;
    char *pszReading;
    int i, n, nNum;
    int nIsBreak, nIsSame;

    pLexicon->cPinyinNum = 0;
    pLexicon->pwPinyinCode = (unsigned short*)malloc( FP_MAX_READINGS*nWordLen*sizeof(unsigned short) );
    /* A NULL result is only an error when something could be stored;
     * malloc(0) is allowed to return NULL. */
    if ( pLexicon->pwPinyinCode == NULL && nWordLen > 0 )
        return 0;

    while ( *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' ) {
        /* Isolate the next reading by terminating it in place. */
        pszReading = pszCur;
        while ( *pszCur != '|' && *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' )
            pszCur++;
        if ( *pszCur == '\0' )
            nIsBreak = 1;
        else {
            *pszCur = '\0';
            nIsBreak = 0;
        }

        if ( pszCur - pszReading >= 2 ) {   /* each distinct word pinyin */
            char *pCurSec = pszReading;

            nNum = 0;
            while ( *pCurSec ) {            /* each syllable in the reading */
                char *pLastSec = pCurSec;
                int nSecLen;

                while ( ( *pCurSec >= 'a' && *pCurSec <= 'z' ) || *pCurSec == ':' )
                    pCurSec++;
                nSecLen = (int)( pCurSec - pLastSec );

                /* Skip the separator after the syllable, but never step over
                 * the reading's terminator: doing so made the parser run on
                 * into the next reading or past the end of the buffer. */
                if ( *pCurSec != '\0' )
                    pCurSec++;

                if ( nSecLen <= 0 )
                    return 0;

                if ( nSecLen > 1 && pLastSec[nSecLen-1] == 'r' && strncmp(pLastSec, "er", nSecLen) != 0 ) {
                    /* Trailing 'r': base syllable plus a separate "r" code. */
                    if ( nNum + 2 > FP_MAX_CODES )
                        return 0;
                    wCode[nNum] = SearchPinyinCode(pDic, pLastSec, nSecLen-1);
                    if ( wCode[nNum] >= FP_CODE_LIMIT )
                        return 0;
                    wCode[nNum+1] = SearchPinyinCode(pDic, "r", 1);
                    if ( wCode[nNum+1] >= FP_CODE_LIMIT )
                        return 0;
                    nNum += 2;
                }
                else {
                    if ( nNum + 1 > FP_MAX_CODES )
                        return 0;
                    wCode[nNum] = SearchPinyinCode(pDic, pLastSec, nSecLen);
                    if ( wCode[nNum] >= FP_CODE_LIMIT )
                        return 0;
                    nNum++;
                }
            }

            if ( nNum != nWordLen )
                return 0;

            /* Store the reading unless an identical one is already present. */
            nIsSame = 0;
            for ( i = 0; i < pLexicon->cPinyinNum && !nIsSame; i++ ) {
                for ( n = 0; n < nWordLen; n++ ) {
                    if ( pLexicon->pwPinyinCode[i*nWordLen+n] != wCode[n] )
                        break;
                }
                nIsSame = ( n == nWordLen );
            }
            /* The buffer holds FP_MAX_READINGS readings; further distinct
             * readings are dropped instead of being written past its end. */
            if ( !nIsSame && pLexicon->cPinyinNum < FP_MAX_READINGS ) {
                memcpy(pLexicon->pwPinyinCode + pLexicon->cPinyinNum*nWordLen, wCode, nWordLen*sizeof(unsigned short));
                pLexicon->cPinyinNum++;
            }
        }

        if ( nIsBreak )
            break;
        pszCur++;
    }
    return 1;
}
/*
 * Parse the part-of-speech field of a lexicon line into pLexicon.
 *
 * pLexicon  entry to fill: cPOSNum is reset to 0, then one acPOS /
 *           anPOSLogProb pair is appended per accepted token.
 * szPOS     text to parse; MODIFIED in place (separators and the '-' inside
 *           each token are overwritten with NUL). Tokens are separated by
 *           '|' and the field ends at CR, LF or NUL. A token has the form
 *           TAG-NUMBER; tokens shorter than two characters are skipped.
 *
 * Tag codes:
 *   single letter, either case        -> 0..25 (letter index)
 *   single non-letter                 -> rejected
 *   longer tag, selected by its first character only:
 *     A=26 B=27 D=28 E=29 M=30 N=31 Q=32 R=33 T=34 U=35 V=36 Y=37 Z=38
 *     (per the original note these are "Ag", "Bg", ... "Zg")
 *   anything else                     -> rejected
 *
 * The value stored in anPOSLogProb is atoi() of the text after the '-',
 * or 0 when the token contains no '-'.
 *
 * NOTE(review): cPOSNum is not checked against the capacity of acPOS /
 * anPOSLogProb; their declarations are outside this part of the file.
 *
 * Always returns 1.
 */
static int FillPOS( LEXICON_ENTRY_T *pLexicon, char *szPOS )
{
    char *pszCur = szPOS;
    char *pszLast;
    int nIsBreak;

    pLexicon->cPOSNum = 0;
    while ( *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' ) {
        /* Isolate the next token by terminating it in place. */
        pszLast = pszCur;
        while ( *pszCur != '|' && *pszCur != '\r' && *pszCur != '\n' && *pszCur != '\0' )
            pszCur++;
        if ( *pszCur == '\0' )
            nIsBreak = 1;
        else {
            *pszCur = '\0';
            nIsBreak = 0;
        }

        if ( pszCur - pszLast >= 2 ) {  /* each distinct word pos */
            char *pPos = pszLast;
            int nHasValue;
            int nCode;

            /* Split TAG from NUMBER at the first '-'. */
            while ( *pPos && *pPos != '-' )
                pPos++;
            nHasValue = ( *pPos == '-' );
            *pPos = '\0';

            if ( strlen(pszLast) == 1 ) {
                if ( *pszLast >= 'A' && *pszLast <= 'Z' )
                    nCode = *pszLast - 'A';
                else if ( *pszLast >= 'a' && *pszLast <= 'z' )
                    nCode = *pszLast - 'a';
                else
                    nCode = -2;
            }
            else {
                switch ( *pszLast ) {
                case 'A': nCode = 26; break;
                case 'B': nCode = 27; break;
                case 'D': nCode = 28; break;
                case 'E': nCode = 29; break;
                case 'M': nCode = 30; break;
                case 'N': nCode = 31; break;
                case 'Q': nCode = 32; break;
                case 'R': nCode = 33; break;
                case 'T': nCode = 34; break;
                case 'U': nCode = 35; break;
                case 'V': nCode = 36; break;
                case 'Y': nCode = 37; break;
                case 'Z': nCode = 38; break;
                default:  nCode = -1; break;
                }
            }

            /* As before, the slot is written even for a rejected tag; it is
             * simply not counted. The accept/reject decision uses the local
             * int so it does not depend on the signedness of acPOS. */
            pLexicon->acPOS[pLexicon->cPOSNum] = nCode;
            if ( nCode >= 0 ) {
                /* Without a '-', pPos sits on the token terminator and
                 * pPos+1 would be the next token or memory past the end of
                 * the string, so no number is read in that case. */
                pLexicon->anPOSLogProb[pLexicon->cPOSNum] = nHasValue ? atoi(pPos+1) : 0;
                pLexicon->cPOSNum++;
            }
        }

        if ( nIsBreak )
            break;
        pszCur++;
    }
    return 1;
}
static int ReadPOSBigram( DIC_T *pDic, long *pnPOSBigram, long *pnPOSUnigram, const TCHAR* strTextFilename )
{
FILE *fpText;
char strTemp[2048];
char *pszCur;
char *pszLast;
int nNumLine;
int nNum;
int nZerotonNum;
long *pnCurPOSBigram;
double d, dT, dMin, dMax, dTotal;
/* open file operation */
fpText = _tfopen(strTextFilename, _T("rt"));
if ( !fpText ) {
fprintf( stderr, "can not open file %s\n", strTextFilename );
return 0;
}
/* read bigram*/
nNumLine = 0;
pnCurPOSBigram = pnPOSBigram;
dMin = 1e99;
dMax = -1e99;
dTotal = 0;
while (fgets (strTemp, 2048, fpText)) {
pszCur = strTemp;
nNum = 0;
while ( *pszCur != '\0' && *pszCur != '\r' && *pszCur != '\n' ) {
while ( *pszCur != '\0' && (*pszCur == ' ' || *pszCur == '\t') )
pszCur ++;
pszLast = pszCur;
while ( *pszCur != '\0' && *pszCur != ' '&& *pszCur != '\t' && *pszCur != '\r' && *pszCur != '\n' )
pszCur ++;
if ( pszCur - pszLast >= 1 ) {
if ( *pszLast >= '0' && *pszLast <= '9' ) {
pnCurPOSBigram[nNum] = atoi(pszLast);
nNum ++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -