📄 pattree.c
字号:
/*
 * Per-position word candidate node used by the handwriting (HW) sentence.
 * All arrays are fixed-size; the three byNum_* members at the end are
 * presumably the number of valid entries in the corresponding arrays
 * (TODO confirm against the code that fills this structure).
 * NOTE(review): this type is declared inside a #pragma pack region, so the
 * member order and types must not be changed.
 */
typedef struct tagHW_WORD_NODE_T {
/* indices of single-character word candidates (up to PATTREE_CAND_NUM) */
unsigned short nIndex_SingleCharWord[PATTREE_CAND_NUM];
/* indices of multi-character (predicted) word candidates */
unsigned short nIndex_MultiCharWord[PATTREE_MAX_PREDICTION_NUM];
/* The next five arrays are parallel, one slot per lexicon index entry. */
unsigned char byStart[PATTREE_MAX_INDEXINLEXICON_NUM];
unsigned short nIndex[PATTREE_MAX_INDEXINLEXICON_NUM];
unsigned short nMinIndexInLexicon[PATTREE_MAX_INDEXINLEXICON_NUM];
unsigned short nMaxIndexInLexicon[PATTREE_MAX_INDEXINLEXICON_NUM];
long nHWLogProb[PATTREE_MAX_INDEXINLEXICON_NUM];
/* entry counters -- presumably for the arrays of the matching name */
unsigned char byNum_SingleCharWord;
unsigned char byNum_MultiCharWord;
unsigned char byNum_IndexInLexicon;
} HW_WORD_NODE_T;
/*
 * One entry of the temporary word list kept in HW_SENTENCE_T::aWordList.
 * The members mirror the per-slot fields of HW_WORD_NODE_T (log probability,
 * index, start); reserved[1] is unused padding after the two small members.
 */
typedef struct tagWORD_LIST_T
{
long nLogProb;
unsigned short nIndex;
unsigned char byStart;
unsigned char reserved[1];
} WORD_LIST_T;
/*
 * Sentence state for handwriting (HW) input.
 * The leading members (OutBuffer, Nodes) must stay layout-compatible with
 * SENTENCE_T, so nothing may be inserted before or between them.
 */
typedef struct tagHW_SENTENCE_T
{
//////////////////////////////////
// ---- begin: part shared with SENTENCE_T ----
OUTBUFFER OutBuffer;
#ifdef PATTREE_POSBIGRAM
// with POS bigram support there is one node per input position and POS type
SENTENCE_NODE_T Nodes[PATTREE_MAX_HW_INPUT_LEN][PATTREE_MAX_POSTYPE];
#else
SENTENCE_NODE_T Nodes[PATTREE_MAX_HW_INPUT_LEN];
#endif
// ---- end: the part above must be compatible with SENTENCE_T ----
//////////////////////////////////
// word candidates, one node per input position
HW_WORD_NODE_T Words[PATTREE_MAX_HW_INPUT_LEN];
// Candidates/Distances have the same length; presumably parallel arrays (TODO confirm)
unsigned short Candidates[PATTREE_MAX_HW_CAND_LEN];
unsigned short Distances[PATTREE_MAX_HW_CAND_LEN];
// prediction strings; nPredictNum is presumably the number of valid pointers (TODO confirm)
wchar_t *pwszPredict[PATTREE_MAX_PREDICTION_NUM];
int nPredictNum;
// Anonymous union: the two working buffers share the same storage, so only
// one of them can hold live data at a time.
union {
unsigned char abTraceBuffer[PATTREE_MAX_TRACE_LEN]; // working buffer for the HW engine
WORD_LIST_T aWordList[PATTREE_MAX_INDEXINLEXICON_NUM]; // working buffer holding all words of a node while adding a trace to the sentence / filling predictions
};
unsigned char abHWBuffer[PATTREE_MAX_HW_BUFFER_LEN];
//unsigned char byCharNum;
} HW_SENTENCE_T;
#endif
/***************************************************************HW SENTENCE END***********************************************************************************/
#pragma pack( pop, BEFORE_PATTREE_T )
/*********************************************************************************************/
/*********************************************************************************************/
/* Static dictionary: base accessor macros and functions */
/*
 * Bit-field helpers for the cCode member of an index entry (pInd):
 *   bit 7 (0x80) - tested by IND_IS_NONNULL
 *   bit 6 (0x40) - tested by IND_IS_CONFLICTED
 *   bits 0..5    - extracted by IND_GET_REALCODE
 * The IS_* macros yield the raw masked value (0x80 / 0x40 or 0), not 0/1.
 * pInd is evaluated exactly once by each macro.
 */
#define IND_IS_NONNULL(pInd) ( ((pInd)->cCode) & 0x80 )
#define IND_IS_CONFLICTED(pInd) ( ((pInd)->cCode) & 0x40 )
#define IND_GET_REALCODE(pInd) ( ((pInd)->cCode) & 0x3F )
/*
 * Accessors into the read-only dictionary blob (pDic->abData).
 * Every table is located by a byte offset stored in pDic->ReadOnlyHead.
 *
 * StaticDic_GetPinyinList(pDic, n)   - char* to the n-th pinyin string; the
 *                                      string position comes from a table of
 *                                      long offsets.
 * StaticDic_GetCharStrokeCode(pDic)  - unsigned short* to the stroke code table.
 * StaticDic_GetCharPinyinCode(pDic)  - unsigned short* to the pinyin code table.
 * StaticLexiconEntry_GetOwnPinyinCode(pLexicon)
 *                                    - unsigned short* located cPinyinCodeOffset
 *                                      bytes after the start of the entry.
 *
 * All macro arguments are parenthesized so that expressions such as `&dic`
 * or `pEntry + 1` expand correctly. pDic / pLexicon are evaluated more than
 * once, so do not pass expressions with side effects.
 */
#define StaticDic_GetPinyinList( pDic, n ) ( (char*)((pDic)->abData + ((long*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_anPinyinListOffset))[(n)]) )
#define StaticDic_GetCharStrokeCode( pDic ) ( (unsigned short*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_awCharStrokeCode) )
#define StaticDic_GetCharPinyinCode( pDic ) ( (unsigned short*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_awCharPinyinCode) )
#define StaticLexiconEntry_GetOwnPinyinCode(pLexicon) ( (unsigned short*)(((char*)(pLexicon)) + (pLexicon)->cPinyinCodeOffset) )
/*
 * Return the n-th pinyin code of a lexicon entry.
 *
 * If the entry stores its own pinyin codes (cPinyinNum != 0) the code is read
 * from that per-entry array. Otherwise it is looked up in the dictionary-wide
 * per-character table, indexed by the n-th character of the word minus
 * ReadOnlyHead.wCharCodeOffset.
 *
 * No range check is done on n.
 */
static unsigned short StaticLexiconEntry_GetPinyinCode( STATIC_LEXICON_ENTRY_T* pLexicon, STATIC_DIC_T *pDic, int n )
{
    unsigned short *pwCodes;

    if ( pLexicon->cPinyinNum == 0 ) {
        /* the entry has no private codes: use the shared per-character table */
        pwCodes = StaticDic_GetCharPinyinCode(pDic);
        return pwCodes[pLexicon->wszWord[n] - pDic->ReadOnlyHead.wCharCodeOffset];
    }

    pwCodes = StaticLexiconEntry_GetOwnPinyinCode(pLexicon);
    return pwCodes[n];
}
/*
 * Number of 4-bit fields in use in a 16-bit packed code x: the position
 * (1..4) of the highest non-zero nibble. Yields 1 even when x is 0.
 * x is evaluated up to three times.
 */
#define StaticDic_GetCharStrokeNum(x) ( ((x) & 0xF000) ? 4 : ( ((x) & 0x0F00) ? 3 : ( ((x) & 0x00F0) ? 2 : 1 ) ) )
/*
 * Index and lexicon accessors into the read-only blob (pDic->abData).
 *
 * *_Pinyin variants take their table offset from pDic->ReadOnlyHead; the
 * variants with a pSentence argument take it from the sentence
 * (pSentence->nDicOffset_*), which lets the sentence select the table.
 *
 * StaticDic_GetIndNum*      - unsigned short* to the index-count table.
 * StaticDic_GetInd*         - IND_T* for the given step, found through a
 *                             table of long byte offsets.
 * StaticDic_GetLexicon      - STATIC_LEXICON_ENTRY_T* for lexicon number n,
 *                             found through a table of long byte offsets.
 *
 * All macro arguments are parenthesized so pointer expressions expand
 * correctly. pDic is evaluated more than once; avoid side effects.
 */
#define StaticDic_GetIndNum_Pinyin( pDic ) ( (unsigned short*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_awIndNum_Pinyin) )
#define StaticDic_GetIndNum( pDic, pSentence ) ( (unsigned short*)((pDic)->abData + (pSentence)->nDicOffset_awIndNum) )
#define StaticDic_GetInd_Pinyin( pDic, step ) ( (IND_T*)((pDic)->abData + ((long*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_anIndOffset_Pinyin))[(step)]) )
#define StaticDic_GetInd( pDic, pSentence, step ) ( (IND_T*)((pDic)->abData + ((long*)((pDic)->abData + (pSentence)->nDicOffset_anIndOffset))[(step)]) )
#define StaticDic_GetLexicon( pDic, n ) ( (STATIC_LEXICON_ENTRY_T*)((pDic)->abData + ((long*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_anlexiconsOffset))[(n)]) )
/*
 * long* to the table of lexicon byte offsets inside the read-only blob.
 * The expansion uses the macro argument itself; previously it referred to a
 * variable named pSDic, which only compiled where the caller happened to have
 * a variable of that name in scope and silently ignored the argument.
 * pDic is evaluated twice.
 */
#define StaticDic_GetLexiconOffset( pDic ) ( (long*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_anlexiconsOffset) )
/*
 * Accessors into the second data blob, pDic->abData_User.
 *
 * Note that the first four macros have no _User suffix but still read from
 * abData_User: log probabilities, word indices and max-series log
 * probabilities of the base dictionary are stored there as well.
 *
 * Offsets come either from pDic->UserHead (nOffset_*) or, for the variants
 * with a pSentence argument, from the sentence (pSentence->nDicOffset_*).
 *
 * StaticDic_GetLexicon_User(pDic, n) expects a global lexicon number and
 * subtracts ReadOnlyHead.nLexiconNum before indexing the user offset table;
 * StaticDic_GetLexicon_User_AbsoluteIndex(pDic, n) indexes that table with n
 * directly. StaticDic_GetLexiconLogprob_User(pDic, n) does NOT subtract, so
 * the caller passes an index that is already relative to the user table.
 *
 * StaticDic_GetWordIndex_PinyinIndex_User points
 * nUSER_DIC_MAX_ENTRY * 2 * sizeof(unsigned short) bytes past the start of
 * the user pinyin word index table.
 *
 * All macro arguments are parenthesized so pointer and arithmetic expressions
 * expand correctly. pDic is evaluated more than once; avoid side effects.
 */
#define StaticDic_GetLexiconLogprob( pDic, n ) ( ((TYPE_PROB*)((pDic)->abData_User + (pDic)->UserHead.nOffset_aLexiconLogprob))[(n)] )
#define StaticDic_GetWordIndex_Pinyin( pDic ) ( (unsigned short*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awWordIndex_Pinyin) )
#define StaticDic_GetWordIndex( pDic, pSentence ) ( (unsigned short*)((pDic)->abData_User + (pSentence)->nDicOffset_awWordIndex) )
#define StaticDic_GetMaxSeriesLogProb( pDic, pSentence ) ( (TYPE_PROB*)((pDic)->abData_User + (pSentence)->nDicOffset_aMaxSeriesLogProb) )
#define StaticDic_GetByOffset_User( pDic, nOffset ) ( (pDic)->abData_User + (nOffset) )
#define StaticDic_GetIndNum_User( pDic, pSentence ) ( (unsigned short*)((pDic)->abData_User + (pSentence)->nDicOffset_awIndNum_User) )
#define StaticDic_GetWordIndex_User( pDic, pSentence ) ( (unsigned short*)((pDic)->abData_User + (pSentence)->nDicOffset_awWordIndex_User) )
#define StaticDic_GetMaxSeriesLogProb_User(pDic, pSentence) ( (TYPE_PROB*)((pDic)->abData_User + (pSentence)->nDicOffset_aMaxSeriesLogProb_User) )
#define StaticDic_GetInd_User( pDic, pSentence, step ) ( (IND_T*)((pDic)->abData_User + ((long*)((pDic)->abData_User + (pSentence)->nDicOffset_anIndOffset_User))[(step)]) )
#define StaticDic_GetLexicon_User( pDic, n ) ( (STATIC_LEXICON_ENTRY_T*)((pDic)->abData_User + ((long*)((pDic)->abData_User + (pDic)->UserHead.nOffset_anlexiconsOffset_User))[(n) - (pDic)->ReadOnlyHead.nLexiconNum]) )
#define StaticDic_GetLexiconLogprob_User( pDic, n ) ( ((TYPE_PROB*)((pDic)->abData_User + (pDic)->UserHead.nOffset_aLexiconLogprob_User))[(n)] )
#define StaticDic_GetWordIndex_Pinyin_User( pDic ) ( (unsigned short*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awWordIndex_Pinyin_User) )
#define StaticDic_GetWordIndex_PinyinIndex_User( pDic ) ( (unsigned char*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awWordIndex_Pinyin_User + (pDic)->UserHead.nUSER_DIC_MAX_ENTRY * 2 * sizeof(unsigned short) ) )
#define StaticDic_GetWordIndex_Stroke_User( pDic ) ( (unsigned short*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awWordIndex_Stroke_User) )
#define StaticDic_GetMaxSeriesLogProb_Pinyin_User( pDic ) ( (TYPE_PROB*)((pDic)->abData_User + (pDic)->UserHead.nOffset_aMaxSeriesLogProb_Pinyin_User) )
#define StaticDic_GetMaxSeriesLogProb_Stroke_User( pDic ) ( (TYPE_PROB*)((pDic)->abData_User + (pDic)->UserHead.nOffset_aMaxSeriesLogProb_Stroke_User) )
#define StaticDic_GetLexicon_User_AbsoluteIndex( pDic, n ) ( (STATIC_LEXICON_ENTRY_T*)((pDic)->abData_User + ((long*)((pDic)->abData_User + (pDic)->UserHead.nOffset_anlexiconsOffset_User))[(n)]) )
#define StaticDic_GetLexiconOffset_User( pDic ) ( (long*)((pDic)->abData_User + (pDic)->UserHead.nOffset_anlexiconsOffset_User) )
#define StaticDic_GetIndNum_Pinyin_User( pDic ) ( (unsigned short*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awIndNum_Pinyin_User) )
#define StaticDic_GetIndNum_Stroke_User( pDic ) ( (unsigned short*)((pDic)->abData_User + (pDic)->UserHead.nOffset_awIndNum_Stroke_User) )
#define StaticDic_GetInd_Pinyin_User( pDic, step ) ( (IND_T*)((pDic)->abData_User + ((long*)((pDic)->abData_User + (pDic)->UserHead.nOffset_anIndOffset_Pinyin_User))[(step)]) )
#define StaticDic_GetInd_Stroke_User( pDic, step ) ( (IND_T*)((pDic)->abData_User + ((long*)((pDic)->abData_User + (pDic)->UserHead.nOffset_anIndOffset_Stroke_User))[(step)]) )
//#define StaticDic_GetLexiconByWordID( pDic, pSentence, ID ) StaticDic_GetLexicon( pDic, StaticDic_GetWordIndex( pDic, pSentence )[ID] )
/*
 * Resolve a word ID of the current sentence to its lexicon entry.
 *
 * IDs below pSentence->nWordIndexNum are looked up in the base word index and
 * base lexicon. IDs at or above it are looked up in the user word index
 * (after subtracting nWordIndexNum) and the user lexicon.
 * The comparison is done on nWordIndexNum truncated to unsigned short.
 */
static STATIC_LEXICON_ENTRY_T* StaticDic_GetLexiconByWordID( STATIC_DIC_T *pDic, SENTENCE_T *pSentence, unsigned short ID )
{
    unsigned short wLexiconIndex;

    if ( ID < (unsigned short)pSentence->nWordIndexNum ) {
        wLexiconIndex = StaticDic_GetWordIndex( pDic, pSentence )[ID];
        return StaticDic_GetLexicon( pDic, wLexiconIndex );
    }

    /* user dictionary range */
    wLexiconIndex = StaticDic_GetWordIndex_User( pDic, pSentence )[ID - pSentence->nWordIndexNum];
    return StaticDic_GetLexicon_User( pDic, wLexiconIndex );
}
/*
 * Return a pointer to the word-index slot that belongs to a word ID.
 *
 * IDs below pSentence->nWordIndexNum address the base word index table; all
 * other IDs address the user word index table at (ID - nWordIndexNum).
 * The returned pointer refers into the dictionary data, not to a copy.
 */
static unsigned short * StaticDic_GetLexiconIndexByWordID( STATIC_DIC_T *pDic, SENTENCE_T *pSentence, unsigned short ID )
{
    unsigned short *pwTable;

    if ( ID < (unsigned short)pSentence->nWordIndexNum ) {
        pwTable = StaticDic_GetWordIndex( pDic, pSentence );
        return pwTable + ID;
    }

    pwTable = StaticDic_GetWordIndex_User( pDic, pSentence );
    return pwTable + (ID - pSentence->nWordIndexNum);
}
/*
 * Return a pointer to the max-series log probability slot selected by
 * pInd->wWordIDFirst.
 *
 * Word IDs below pSentence->nWordIndexNum use the base table; all others use
 * the user table at (wWordIDFirst - nWordIndexNum).
 */
static TYPE_PROB * StaticDic_GetMaxSeriesLogProbByIND( STATIC_DIC_T *pDic, SENTENCE_T *pSentence, IND_T *pInd )
{
    TYPE_PROB *pTable;

    if ( pInd->wWordIDFirst < (unsigned short)pSentence->nWordIndexNum ) {
        pTable = StaticDic_GetMaxSeriesLogProb( pDic, pSentence );
        return pTable + pInd->wWordIDFirst;
    }

    pTable = StaticDic_GetMaxSeriesLogProb_User( pDic, pSentence );
    return pTable + (pInd->wWordIDFirst - pSentence->nWordIndexNum);
}
/*
 * Return the lexicon entry for a global lexicon number.
 *
 * Numbers below ReadOnlyHead.nLexiconNum belong to the base lexicon; all
 * others are passed to StaticDic_GetLexicon_User, which rebases them itself.
 */
static STATIC_LEXICON_ENTRY_T* StaticDic_GetLexiconByIndex( STATIC_DIC_T *pDic, unsigned short nIndex )
{
    if ( nIndex < (unsigned short)pDic->ReadOnlyHead.nLexiconNum ) {
        return StaticDic_GetLexicon(pDic, nIndex);
    }

    return StaticDic_GetLexicon_User(pDic, nIndex);
}
/*
 * Return a pointer to the log probability of a global lexicon number.
 *
 * Numbers below ReadOnlyHead.nLexiconNum address the base table; all others
 * address the user table after subtracting nLexiconNum (the user log
 * probability macro does not rebase on its own).
 */
static TYPE_PROB* StaticDic_GetLexiconLogprobByIndex( STATIC_DIC_T *pDic, unsigned short nIndex )
{
    if ( nIndex < (unsigned short)pDic->ReadOnlyHead.nLexiconNum ) {
        return &StaticDic_GetLexiconLogprob(pDic, nIndex);
    }

    return &StaticDic_GetLexiconLogprob_User(pDic, nIndex - pDic->ReadOnlyHead.nLexiconNum);
}
/*
 * Return a pointer to the log probability of the word with the given
 * sentence-level word ID: the ID is first mapped to its lexicon number, which
 * is then mapped to its log probability slot.
 */
static TYPE_PROB* StaticDic_GetLexiconLogprobByWordID( STATIC_DIC_T *pDic, SENTENCE_T *pSentence, unsigned short wWordID )
{
    unsigned short *pwLexiconIndex;

    pwLexiconIndex = StaticDic_GetLexiconIndexByWordID(pDic, pSentence, wWordID);
    return StaticDic_GetLexiconLogprobByIndex(pDic, *pwLexiconIndex);
}
/*
 * Return the index array for the given step, taken from the user dictionary
 * when nIsUserDic is non-zero and from the base dictionary when it is zero.
 */
static IND_T* StaticDic_GetInd_Spec( const STATIC_DIC_T *pDic, SENTENCE_T *pSentence, int step, int nIsUserDic )
{
    if ( nIsUserDic != 0 ) {
        return StaticDic_GetInd_User( pDic, pSentence, step );
    }

    return StaticDic_GetInd( pDic, pSentence, step );
}
/*
 * Return the index-count table, taken from the user dictionary when
 * nIsUserDic is non-zero and from the base dictionary when it is zero.
 */
static unsigned short* StaticDic_GetIndNum_Spec( const STATIC_DIC_T *pDic, SENTENCE_T *pSentence, int nIsUserDic )
{
    if ( nIsUserDic != 0 ) {
        return StaticDic_GetIndNum_User( pDic, pSentence );
    }

    return StaticDic_GetIndNum( pDic, pSentence );
}
/*
 * Further accessors into the read-only blob (pDic->abData).
 *
 * StaticDic_GetPOSBigram / StaticDic_GetPOSUnigram - TYPE_PROB* tables located
 *     by ReadOnlyHead.nOffset_aPOSBigram / nOffset_aPOSUnigram.
 * StaticDic_GetENIndNum / StaticDic_GetENInd - same header offsets as the
 *     *_Pinyin accessors (nOffset_awIndNum_Pinyin, nOffset_anIndOffset_Pinyin)
 *     but the entries are interpreted as EN_IND_T.
 *     NOTE(review): presumably an English dictionary reuses the pinyin header
 *     slots -- confirm against the dictionary builder.
 *
 * All macro arguments are parenthesized so pointer expressions expand
 * correctly. pDic is evaluated more than once; avoid side effects.
 */
#define StaticDic_GetPOSBigram( pDic ) ( (TYPE_PROB*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_aPOSBigram) )
#define StaticDic_GetPOSUnigram( pDic ) ( (TYPE_PROB*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_aPOSUnigram) )
#define StaticDic_GetENIndNum( pDic ) ( (unsigned short*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_awIndNum_Pinyin) )
#define StaticDic_GetENInd( pDic, step ) ( (EN_IND_T*)((pDic)->abData + ((long*)((pDic)->abData + (pDic)->ReadOnlyHead.nOffset_anIndOffset_Pinyin))[(step)]) )
/*********************************************************************************************/
/*********************************************************************************************/
/* Basic helper functions */
/*
 * Duplicate a NUL-terminated string on the heap.
 *
 * str     - string to copy; may be NULL.
 * returns - newly allocated copy (the caller must free() it), or NULL when
 *           str is NULL or the allocation fails.
 *
 * The length is kept in size_t so that long strings are not truncated by a
 * conversion to int, and the malloc result is checked before it is written.
 */
static char* salloc( char* str )
{
    size_t nSize;
    char *pStr;

    if ( str == NULL )
        return NULL;

    nSize = strlen(str) + 1;    /* include the terminating NUL */
    pStr = malloc(nSize);
    if ( pStr != NULL )
        memcpy(pStr, str, nSize);
    return pStr;
}
/*
 * Build the version/feature word expected in a static dictionary file for the
 * given language.
 *
 * For PATTREE_DIC_CHS the word is PATTREE_BIN_VERSION with the STATIC flag,
 * the POSBIGRAM flag (only when compiled with PATTREE_POSBIGRAM) and the
 * TYPE_PROB_CHAR flag. For every other language it is PATTREE_BIN_EN_VERSION
 * with the STATIC flag only. In both cases the UNICODE flag is added when
 * compiled with _UNICODE.
 */
static long GetStaticDicVersion( short nLanguage )
{
    long nVersion;

    if ( nLanguage != PATTREE_DIC_CHS ) {
        nVersion = PATTREE_BIN_EN_VERSION | PATTREE_DIC_STATIC;
    }
    else {
        nVersion = PATTREE_BIN_VERSION | PATTREE_DIC_STATIC;
#ifdef PATTREE_POSBIGRAM
        nVersion |= PATTREE_DIC_POSBIGRAM;
#endif
        nVersion |= PATTREE_DIC_TYPE_PROB_CHAR;
    }

    /* applies to both branches above */
#ifdef _UNICODE
    nVersion |= PATTREE_DIC_UNICODE;
#endif
    return nVersion;
}
/*
 * Return the kernel version word: PATTREE_KERNEL_VERSION combined with the
 * STATIC and TYPE_PROB_CHAR flags, plus the POSBIGRAM flag when compiled with
 * PATTREE_POSBIGRAM and the UNICODE flag when compiled with _UNICODE.
 */
long NEXTAP_GetKernelVersion(void)
{
    long nVersion = PATTREE_KERNEL_VERSION | PATTREE_DIC_STATIC;

#ifdef PATTREE_POSBIGRAM
    nVersion |= PATTREE_DIC_POSBIGRAM;
#endif
    nVersion |= PATTREE_DIC_TYPE_PROB_CHAR;
#ifdef _UNICODE
    nVersion |= PATTREE_DIC_UNICODE;
#endif

    return nVersion;
}
/* basic memory function end */
/*********************************************************************************************/
/*
 * Keypad digit <-> internal code conversion.
 *
 * PINYINDIGI2CODE(c, code)  - '2'..'9' -> 0..7, anything else -> -1.
 * STROKEDIGI2CODE(c, code)  - '1'..'5' -> 0..4, anything else -> -1.
 * DIGI2CODE(c, code, mode)  - uses the pinyin mapping when mode equals
 *                             PATTREE_CHS_PINYIN, the stroke mapping otherwise.
 * STROKECODE2DIGI(code, c)  - inverse of the stroke mapping: c = code + '1'.
 *
 * The statement macros deliberately expand to a plain { } block (not
 * do { } while (0)): DIGI2CODE places the inner macros directly before
 * `else` without a semicolon, and existing call sites may do the same.
 * Because of that, do not write `DIGI2CODE(...);` as the body of an `if`
 * that is followed by `else`.
 *
 * All arguments are parenthesized so that expressions can be passed safely.
 * c is evaluated up to three times; avoid side effects.
 */
#define DIGI2CODE(c,code,mode) \
{ \
if ( (mode) == PATTREE_CHS_PINYIN ) \
PINYINDIGI2CODE(c,code) \
else \
STROKEDIGI2CODE(c,code) \
}
#define PINYINDIGI2CODE(c,code) \
{ \
if ( '2' <= (c) && '9' >= (c) ) \
(code) = (c)-'2'; \
else \
(code) = -1; \
}
#define STROKEDIGI2CODE(c,code) \
{ \
if ( '1' <= (c) && '5' >= (c) ) \
(code) = (c)-'1'; \
else \
(code) = -1; \
}
#define STROKECODE2DIGI(code,c) ( (c) = (code) + '1' )
/*
 * Phone keypad letter tables.
 * DIGI2ALPHABET[k]    - letters on key ('2' + k), padded with 0 when the key
 *                       has only three letters; keys with four letters fill
 *                       the row completely, so rows are NOT NUL-terminated
 *                       strings in general.
 * DIGI2ALPHABETNUM[k] - number of letters in DIGI2ALPHABET[k].
 * ALPHABET2DIGI[i]    - key digit character for the letter ('a' + i).
 */
static const char DIGI2ALPHABET[][4] = { {'a', 'b', 'c', 0}, {'d', 'e', 'f', 0}, {'g', 'h', 'i', 0}, {'j', 'k', 'l', 0}, {'m', 'n', 'o', 0}, {'p', 'q', 'r', 's'},
{'t', 'u', 'v', 0}, {'w', 'x', 'y', 'z'} };
static const int DIGI2ALPHABETNUM[] = { 3, 3, 3, 3, 3, 4, 3, 4 };
static const char ALPHABET2DIGI[26] = { '2', '2', '2', '3', '3', '3', '4', '4', '4', '5', '5', '5', '6', '6', '6', '7', '7', '7', '7', '8', '8', '8', '9', '9', '9', '9' };
//////////////////////////////////////////////////////////////////
// create dic
/*
 * Convert one pinyin reading of a lexicon entry to its keypad digit string.
 *
 * pLexicon     - lexicon entry whose word is converted.
 * pDic         - dictionary that supplies the pinyin strings.
 * pszDigi      - output buffer; receives the NUL-terminated digit string.
 *                The caller must make it large enough, no bound is checked.
 * bPinyinIndex - which reading to use; reading k occupies the pinyin codes
 *                k * wordLen .. k * wordLen + wordLen - 1 of the entry.
 * returns      - number of digits written (without the terminating NUL).
 *
 * A letter followed by ':' is a two-character unit in the pinyin string
 * (presumably "u:" for u-umlaut -- TODO confirm against the pinyin list).
 * Unless the preceding letter is j, q or x the unit is emitted as the key of
 * 'v'; after j/q/x the key of the letter itself is used.
 *
 * Two defects of the earlier version are fixed here:
 *  - the third comparison tested pszCur[0] instead of the preceding character,
 *    so the 'x' case could never match;
 *  - pszCur[-1] was read even when the unit was the first character of the
 *    pinyin string, i.e. one byte before the string. Now a unit at the start
 *    is treated as having no preceding letter.
 *
 * NOTE(review): pinyin characters are assumed to be 'a'..'z'; anything else
 * would index ALPHABET2DIGI out of range.
 */
static int StaticDic_Pinyin2Digi( STATIC_LEXICON_ENTRY_T *pLexicon, STATIC_DIC_T *pDic, char *pszDigi, unsigned char bPinyinIndex )
{
    int i, nWordLen, nLen;
    int nPos;
    char *pszStart;
    char *pszCur;
    char cPrev;

    pszDigi[0] = 0;
    nWordLen = ch_strlen(pLexicon->wszWord);
    nLen = 0;
    nPos = bPinyinIndex * nWordLen;
    for ( i = 0; i < nWordLen; i ++ ) {
        pszStart = StaticDic_GetPinyinList( pDic, StaticLexiconEntry_GetPinyinCode( pLexicon, pDic, nPos+i ) );
        pszCur = pszStart;
        while ( *pszCur ) {
            if ( pszCur[1] == ':' ) {
                /* never look in front of the first character of the string */
                cPrev = ( pszCur > pszStart ) ? pszCur[-1] : 0;
                if ( !(cPrev == 'j' || cPrev == 'q' || cPrev == 'x') ) {
                    pszDigi[nLen++] = ALPHABET2DIGI['v' - 'a'];
                }
                else
                    pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
                pszCur += 2;    /* skip the letter and the ':' marker */
            }
            else {
                pszDigi[nLen++] = ALPHABET2DIGI[pszCur[0] - 'a'];
                pszCur ++;
            }
        }
    }
    pszDigi[nLen] = 0;
    return nLen;
}
/*
 * Convert the stroke codes of a lexicon entry to a digit string.
 *
 * For every character of the word the 16-bit packed stroke code is read from
 * the dictionary's stroke table (indexed by character minus
 * ReadOnlyHead.wCharCodeOffset). Its 4-bit fields are emitted from the lowest
 * to the highest as the character ('0' + field), stopping at the first zero
 * field or after four fields.
 *
 * pszDigi receives the NUL-terminated result; the caller must make it large
 * enough, no bound is checked. Returns the number of digits written.
 */
static int StaticDic_LexiconStrokeCode2Digi( STATIC_LEXICON_ENTRY_T *pLexicon, STATIC_DIC_T *pDic, char *pszDigi )
{
    unsigned short *pwStrokeTable;
    unsigned short wPacked, wField;
    int nChars, nOut;
    int iChar, iField;

    pszDigi[0] = 0;
    pwStrokeTable = StaticDic_GetCharStrokeCode(pDic);
    nChars = ch_strlen(pLexicon->wszWord);
    nOut = 0;

    for ( iChar = 0; iChar < nChars; iChar++ ) {
        wPacked = pwStrokeTable[pLexicon->wszWord[iChar] - pDic->ReadOnlyHead.wCharCodeOffset];

        /* consume the packed code one 4-bit field at a time, lowest first */
        for ( iField = 0; iField < 4; iField++ ) {
            wField = wPacked & 0x000F;
            if ( wField == 0 )
                break;
            pszDigi[nOut++] = wField + '0';
            wPacked >>= 4;
        }
    }

    pszDigi[nOut] = 0;
    return nOut;
}
void PATTREE_FreeStaticDic( void* pDicBuffer )
{
STATIC_DIC_T *pDic;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -