📄 pattree.c
字号:
}
else
break;
}
}
if ( nNum == PATTREE_MAX_POSTYPE ) {
dT = 0;
nZerotonNum = 0;
for ( nNum = 0; nNum < PATTREE_MAX_POSTYPE; nNum ++ ) {
if ( pnCurPOSBigram[nNum] == 0 )
nZerotonNum ++;
else
dT += pnCurPOSBigram[nNum];
}
pnPOSUnigram[nNumLine] = (long)dT;
dTotal += pnPOSUnigram[nNumLine];
dT += PATTREE_ZEROTON_POS_BIGRAM;
for ( nNum = 0; nNum < PATTREE_MAX_POSTYPE; nNum ++ ) {
if ( pnCurPOSBigram[nNum] == 0 )
d = PATTREE_ZEROTON_POS_BIGRAM / nZerotonNum / dT;
else
d = pnCurPOSBigram[nNum] / dT;
d = log(d) * pDic->fLogProbScale;
d = -d;
if ( d > dMax )
dMax = d;
if ( d < dMin )
dMin = d;
if ( d < 0 )
pnCurPOSBigram[nNum] = 0;
else if ( d >= PATTREE_NULL_WORDINDEX )
pnCurPOSBigram[nNum] = PATTREE_NULL_WORDINDEX;
else
pnCurPOSBigram[nNum] = (long)d;
}
nNumLine ++;
pnCurPOSBigram += PATTREE_MAX_POSTYPE;
if ( nNumLine > PATTREE_MAX_POSTYPE )
break;
}
}
if ( nNumLine == PATTREE_MAX_POSTYPE ) {
for ( nNumLine = 0; nNumLine < PATTREE_MAX_POSTYPE; nNumLine ++ ) {
d = -log(pnPOSUnigram[nNumLine] / dTotal) * pDic->fLogProbScale;
if ( d < 0 )
pnPOSUnigram[nNumLine] = 0;
else if ( d >= PATTREE_NULL_WORDINDEX )
pnPOSUnigram[nNumLine] = PATTREE_NULL_WORDINDEX;
else
pnPOSUnigram[nNumLine] = (long)d;
}
return 1;
}
else
return 0;
}
static TYPE_PROB GetMaxLogProb( DIC_T *pDic, IND_T *pInd, short IMMethod )
{
if ( pInd == 0 || !IND_IS_NONNULL(pInd) )
return 0;
if ( IMMethod == PATTREE_CHS_PINYIN )
return pDic->pLexicons[pDic->pwWordIndex[pInd->wWordIDFirst]].LogProb;
else
return pDic->pLexicons[pDic->pwWordIndex_2[pInd->wWordIDFirst]].LogProb;
}
static TYPE_PROB GetSeriesMaxLogProb( DIC_T *pDic, DIC_IND_T *pDicInd, TYPE_PROB * pMaxSeriesLogProb, int nStep, int nIndex, short IMMethod )
{
int nMinIndex, nMaxIndex, i, k;
IND_T *pInd;
TYPE_PROB LogProb;
pInd = &pDicInd->Ind[nStep][nIndex];
if ( pInd->nIndex == PATTREE_NULL_WORDINDEX ) {
//pInd->wMaxSeriesLogProb = GetMaxLogProb(pDic, pInd, IMMethod);
for ( k = pInd->wWordIDFirst; k < pInd->wWordIDFirst + __max(1,pInd->byWordIDNum); k ++ )
pMaxSeriesLogProb[k] = GetMaxLogProb(pDic, pInd, IMMethod);
}
else {
nMinIndex = pInd->nIndex;
nMaxIndex = PATTREE_NULL_WORDINDEX;
i = nIndex + 1;
while ( nMaxIndex == PATTREE_NULL_WORDINDEX && i < pDicInd->wIndNum[nStep] ) {
nMaxIndex = pDicInd->Ind[nStep][i].nIndex;
i ++;
}
if ( i >= pDicInd->wIndNum[nStep] )
nMaxIndex = pDicInd->wIndNum[nStep+1];
//pInd->wMaxSeriesLogProb = GetMaxLogProb(pDic, pInd, IMMethod);
for ( k = pInd->wWordIDFirst; k < pInd->wWordIDFirst + pInd->byWordIDNum; k ++ )
pMaxSeriesLogProb[k] = GetMaxLogProb(pDic, pInd, IMMethod);
for ( i = nMinIndex; i < nMaxIndex; i ++ ) {
LogProb = GetSeriesMaxLogProb( pDic, pDicInd, pMaxSeriesLogProb, nStep + 1, i, IMMethod );
//if ( pInd->wMaxSeriesLogProb < wLogProb )
// pInd->wMaxSeriesLogProb = wLogProb;
for ( k = pInd->wWordIDFirst; k < pInd->wWordIDFirst + __max(1, pInd->byWordIDNum); k ++ ) {
if ( pMaxSeriesLogProb[k] < LogProb ) {
pMaxSeriesLogProb[k] = LogProb;
}
}
}
}
//return pInd->wMaxSeriesLogProb;
return pMaxSeriesLogProb[pInd->wWordIDFirst];
}
static void FreeLexiconEntry(LEXICON_ENTRY_T *pLexiconEntry)
{
if ( pLexiconEntry->pwszWord ) {
if( pLexiconEntry->pwPinyinCode )
free( pLexiconEntry->pwPinyinCode );
free( pLexiconEntry->pwszWord );
}
}
static void FreeDicInd( DIC_IND_T* pDicInd )
{
int n;
for ( n = 0; n < PATTREE_MAX_IND_LEN; n ++ ) {
if ( pDicInd->Ind[n] )
free(pDicInd->Ind[n]);
}
}
static void FreePinyinList( DIC_T *pDic )
{
int i;
if ( pDic->ppPinyinList ) {
for ( i = 0; i < pDic->nPinyinNum; i ++ ) {
if ( pDic->ppPinyinList[i] )
free(pDic->ppPinyinList[i]);
}
free( pDic->ppPinyinList );
}
}
static void PATTREE_FreeDic( void* pBuffer )
{
int n;
DIC_T *pDic = (DIC_T*)pBuffer;
for ( n = 0; n < pDic->nLexiconNum; n ++ ) {
FreeLexiconEntry( pDic->pLexicons+n );
}
free( pDic->pwWordIndex );
free( pDic->pMaxSeriesLogProb );
free( pDic->pLexicons );
FreeDicInd(pDic->pDicFullDigi);
free(pDic->pDicFullDigi);
FreePinyinList(pDic);
if ( pDic->nCharNum > 0 ) {
if ( pDic->pwCharPinyinCode )
free( pDic->pwCharPinyinCode );
if ( pDic->ppCharStrokeList ) {
for ( n = 0; n < pDic->nCharNum; n ++ ) {
if ( pDic->ppCharStrokeList[n] )
free( pDic->ppCharStrokeList[n] );
}
free( pDic->ppCharStrokeList );
}
if ( pDic->pcCharStrokeNum )
free( pDic->pcCharStrokeNum );
if ( pDic->pwWordIndex_2 )
free( pDic->pwWordIndex_2 );
if ( pDic->pMaxSeriesLogProb_2 )
free( pDic->pMaxSeriesLogProb_2 );
if ( pDic->pDicFullDigi_2 ) {
FreeDicInd(pDic->pDicFullDigi_2);
free(pDic->pDicFullDigi_2);
}
}
}
static void TransCount2LogProb( int *pnCount, int nTotalCount, long nNum, long *pnOffset, double *pfScale, int nMax )
{
long i;
double dMin = 1e99, dMax = -1e99, d;
for ( i = 0; i < nNum; i ++ ) {
if ( pnCount[i] == 0 )
d = log(PATTREE_ZEROTON_COUNT/((double)nTotalCount));
else
d = log(pnCount[i]/((double)nTotalCount));
if ( dMin > d )
dMin = d;
if ( dMax < d )
dMax = d;
}
assert( dMax < 0 && dMin < dMax );
*pfScale = (nMax - 0.1) / (dMax - dMin);
*pnOffset = (long)( 0.1 - dMin * (*pfScale) );
for ( i = 0; i < nNum; i ++ ) {
if ( pnCount[i] == 0 )
d = log(PATTREE_ZEROTON_COUNT/((double)nTotalCount));
else
d = log(pnCount[i]/((double)nTotalCount));
pnCount[i] = (int)( d * (*pfScale) + (*pnOffset));
if ( pnCount[i] < 0 )
pnCount[i] = 0;
if ( pnCount[i] > nMax+1 )
pnCount[i] = nMax+1;
}
}
static int CreateDicInd( SORTABLE_LEXICON_ENTRY_T *pSortableEntry, DIC_T *pDic, int nSortNum, short IMMethod )
{
DIC_IND_T *pDicFullDigi;
unsigned short *pnActualDigiNum;
unsigned short *pwWordIndex;
TYPE_PROB *pMaxSeriesLogProb;
unsigned short wWordIDFirst, wWordIDLast;
int n, i;
int nWordLen;
/* the code of a current and prev word */
char cCurCode[PATTREE_MAX_IND_LEN], cPrevCode[PATTREE_MAX_IND_LEN];
/* the begining index of the current code differentiate from prev code */
int nNewCodeBegin;
if ( IMMethod == PATTREE_CHS_PINYIN ) {
pDicFullDigi = pDic->pDicFullDigi = (DIC_IND_T*)calloc(sizeof(DIC_IND_T), 1);
pDic->nWordIndexNum = nSortNum;
pnActualDigiNum = &pDic->nActualDigiNum;
pwWordIndex = pDic->pwWordIndex = malloc( sizeof(unsigned short) * nSortNum );
pMaxSeriesLogProb = pDic->pMaxSeriesLogProb = calloc( nSortNum, sizeof(TYPE_PROB) );
}
else {
pDicFullDigi = pDic->pDicFullDigi_2 = (DIC_IND_T*)calloc(sizeof(DIC_IND_T), 1);
pDic->nWordIndexNum_2 = nSortNum;
pnActualDigiNum = &pDic->nActualDigiNum_2;
pwWordIndex = pDic->pwWordIndex_2 = malloc( sizeof(unsigned short) * nSortNum );
pMaxSeriesLogProb = pDic->pMaxSeriesLogProb_2 = calloc( nSortNum, sizeof(TYPE_PROB) );
}
/* sort the word with alphabet sequence */
g_IMMethod = IMMethod;
qsort( (void*)pSortableEntry, (size_t)nSortNum, sizeof(SORTABLE_LEXICON_ENTRY_T), CompareWord_ByDigiProb );
/* begin filling the dic */
pDicFullDigi->Ind[0] = (IND_T*)calloc(sizeof(IND_T), PATTREE_MAX_CODE_NUM);
for ( i = 0; i < PATTREE_MAX_CODE_NUM; i ++ ) {
pDicFullDigi->Ind[0][i].nIndex = -1;
pDicFullDigi->Ind[0][i].cCode = i;
}
for ( i = 1; i < PATTREE_MAX_IND_LEN; i ++ ) {
pDicFullDigi->Ind[i] = (IND_T*)calloc(sizeof(IND_T), PATTREE_MAX_LEXICON_ENTRY);
}
for ( n = 0; n < PATTREE_MAX_IND_LEN; n ++ ) {
cPrevCode[n] = -1;
}
*pnActualDigiNum = 0;
for ( i = 0; i < nSortNum; i ++ ) {
wWordIDFirst = i;
pwWordIndex[i] = pSortableEntry[i].wIndex;
while ( i <= nSortNum - 2 && strcmp( pSortableEntry[i].szDigiString, pSortableEntry[i+1].szDigiString ) == 0 ) {
i ++;
pwWordIndex[i] = pSortableEntry[i].wIndex;
}
wWordIDLast = i;
(*pnActualDigiNum) ++;
nWordLen = strlen(pSortableEntry[i].szDigiString);
assert(nWordLen <= PATTREE_MAX_IND_LEN);
nNewCodeBegin = -1; //...??? 10
for ( n = 0; n < PATTREE_MAX_IND_LEN; n ++ ) {
if ( n < nWordLen ) {
DIGI2CODE(pSortableEntry[i].szDigiString[n], cCurCode[n], IMMethod);
}
else
cCurCode[n] = -1;
if ( cPrevCode[n] != cCurCode[n] && nNewCodeBegin < 0 ) {
nNewCodeBegin = n;
}
}
if ( nNewCodeBegin >= 0 ) {
if ( nNewCodeBegin == 0 ) {
pDicFullDigi->Ind[0][cCurCode[0]].nIndex = pDicFullDigi->wIndNum[1];
pDicFullDigi->Ind[0][cCurCode[0]].cCode = cCurCode[0];
pDicFullDigi->Ind[0][cCurCode[0]].wWordIDFirst = wWordIDFirst;
}
for ( n = __max(1,nNewCodeBegin); n < nWordLen-1; n ++ ) {
pDicFullDigi->Ind[n][pDicFullDigi->wIndNum[n]].nIndex = pDicFullDigi->wIndNum[n+1];
pDicFullDigi->Ind[n][pDicFullDigi->wIndNum[n]].cCode = cCurCode[n];
pDicFullDigi->Ind[n][pDicFullDigi->wIndNum[n]].wWordIDFirst = wWordIDFirst;
}
/* the last ind of this word must be marked to identify as a word ending */
if ( nWordLen == 1 ) {
pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].cCode = cCurCode[0] | 0x80;
pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].nIndex = PATTREE_NULL_WORDINDEX;
pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].wWordIDFirst = wWordIDFirst;
//pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].wWordIDLast = wWordIDLast;
pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].byWordIDNum = wWordIDLast - wWordIDFirst + 1;
/*pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].wMaxLogProb = 0;
for ( n = wWordIDFirst; n <= wWordIDLast; n ++ ) {
if ( Dic.pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].wMaxLogProb < Dic.pLexicons[Dic.pwWordIndex[n]].wLogProb )
Dic.pDicFullDigi->Ind[nWordLen-1][cCurCode[nWordLen-1]].wMaxLogProb = Dic.pLexicons[Dic.pwWordIndex[n]].wLogProb;
}*/
}
else {
pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].cCode = cCurCode[nWordLen-1] | 0x80;
pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].nIndex = PATTREE_NULL_WORDINDEX;
pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].wWordIDFirst = wWordIDFirst;
//pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].wWordIDLast = wWordIDLast;
// ... to be modified
if ( wWordIDLast - wWordIDFirst + 1 > 255 )
pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].byWordIDNum = 255;
else
pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].byWordIDNum = wWordIDLast - wWordIDFirst + 1;
/*pDicFullDigi->Ind[nWordLen-1][pDicFullDigi->wIndNum[nWordLen-1]].wMaxLogProb = 0;
for ( n = wWordIDFirst;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -