📄 pattree.c
字号:
long nWordIndexNum_Stroke;
long nOffset_awIndNum_Stroke;
long nOffset_anIndOffset_Stroke;
// p-o-s
long nPOSNum;
//词性数目
long nOffset_aPOSBigram;
// 词性Bigram数组的偏移量,数值也Log整型化,原数值按如下公式计算:(-wPOSBigram) * pDic->fLogProbScale,在实际计算中只采用(-wPOSBigram)
long nOffset_aPOSUnigram;
// 词性Uigram数组的偏移量,数值也Log整型化,原数值按如下公式计算:(-wPOSUnigram) * pDic->fLogProbScale,在实际计算中只采用(-wPOSUnigram)
long nPunctuationPOSIndex;
// 标点符号的词性码,标点符号用于整句优化中,假设整句一定以标点符号结束。
long nCompoundPOSIndex;
// 用户自定义词的词性码。
// handwriting dic
long nOffset_HandwritingDic;
// 手写字典的偏移量,手写字典由第3方提供
long nReserved1[8];
unsigned short awCommonSingleCharIndex[PATTREE_MAX_COMMON_SINGLE_CHAR];
// 常见字索引数组,索引对应词条动态数组中的索引。这个数组用于在没有任何联想时,给出常用字作为联想
wchar_t awszSymbol[PATTREE_MAX_SYMBOL_NUM][2];
} STATIC_READONLY_DIC_HEAD_T;
/* Static dictionary: three header structures followed by pointers to the
   dictionary payload bytes.
   The commented-out declarations inside the struct are NOT members; they are
   the original author's list of the arrays grouped under "dynamic data" and
   "rewritable data".
   NOTE(review): presumably those arrays live in the bytes behind abData /
   abData_User and are located through the offset fields of the headers --
   confirm against the dictionary loader. */
typedef struct tagSTATIC_DIC_T
{
STATIC_DIC_HEAD_T Head;
STATIC_READONLY_DIC_HEAD_T ReadOnlyHead;
STATIC_USER_DIC_HEAD_T UserHead;
unsigned char *abData;
// NOTE(review): name suggests a flag telling whether abData was allocated
// by this module (and must be freed by it) -- confirm.
int nIs_Data_AllocBuffer;
unsigned char *abData_User;
// NOTE(review): same as nIs_Data_AllocBuffer, but for abData_User -- confirm.
int nIs_Data_User_AllocBuffer;
/* dynamic data*/
//unsigned short awCharPinyinCode[nCharNum];
//unsigned short awCharStrokeCode[nCharNum];
//long anPinyinListOffset[nPinyinNum];
//char azPinyinList[];
//long anlexiconsOffset[nLexiconNum];
//STATIC_LEXICON_ENTRY_T aLexicons[];
//unsigned short awIndNum_Pinyin[nMaxIndLen_Pinyin];
//long anIndOffset_Pinyin[nMaxIndLen_Pinyin];
//IND_T aInd_Pinyin[];
//unsigned short awIndNum_Stroke[nMaxIndLen_Stroke];
//long anIndOffset_Stroke[nMaxIndLen_Stroke];
//IND_T aInd_Stroke[];
//TYPE_PROB aPOSBigram[nPOSNum][nPOSNum];
//TYPE_PROB aPOSUnigram[nPOSNum];
//char abHandWritingDic;
/* rewritable data */
//TYPE_PROB aLexiconLogprob[nLexiconNum];
//unsigned short awWordIndex_Pinyin[nWordIndexNum_Pinyin];
//TYPE_PROB aMaxSeriesLogProb_Pinyin[nWordIndexNum_Pinyin];
//unsigned short awWordIndex_Stroke[nWordIndexNum_Stroke];
//TYPE_PROB aMaxSeriesLogProb_Stroke[nWordIndexNum_Stroke];
//TYPE_PROB aLexiconLogprob_User[nUSER_DIC_MAX_ENTRY+1];
//long anLexiconsOffset_User[nUSER_DIC_MAX_ENTRY+1];
//unsigned short awWordIndex_Pinyin_User[nUSER_DIC_MAX_ENTRY*2];
//unsigned char abWordIndex_PinyinIndex_User[nUSER_DIC_MAX_ENTRY*2]; // which of the word's pinyin readings (by ordinal) this WordIndex entry corresponds to
//TYPE_PROB aMaxSeriesLogProb_Pinyin_User[nUSER_DIC_MAX_ENTRY*2];
//unsigned short awIndNum_Pinyin_User[nMaxIndLen_Pinyin];
//long anIndOffset_Pinyin_User[nMaxIndLen_Pinyin+1];
//unsigned short awWordIndex_Stroke_User[nUSER_DIC_MAX_ENTRY];
//TYPE_PROB aMaxSeriesLogProb_Stroke_User[nUSER_DIC_MAX_ENTRY];
//unsigned short awIndNum_Stroke_User[nMaxIndLen_Stroke];
//long anIndOffset_Stroke_User[nMaxIndLen_Stroke+1];
//STATIC_LEXICON_ENTRY_T aLexicons_User[];
//IND_T aInd_Pinyin_User[];
//IND_T aInd_Stroke_User[];
} STATIC_DIC_T;
/***************************************basic datastructure END*****************************************/
/***************************************Dynamic Dic*********************************************/
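// The dynamic dictionary structures are used when generating (building) the dictionary. Compared with the static dictionary they are simpler and easier to build, but they require dynamically allocated memory and are therefore not suitable for embedded use.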
#ifdef _WINDOWS
#ifdef _WIN32
typedef struct tagSORTABLE_LEXICON_ENTRY_T
// Structure used for sorting entries by digit string and by word frequency.
{
// Digit string of the entry (sort key).
char *szDigiString;
// Word-frequency value of the entry (sort key).
// NOTE(review): the name indicates a log-probability -- confirm scale.
TYPE_PROB LogProb;
// NOTE(review): presumably the entry's index in the lexicon array before
// sorting -- confirm against the code that fills this structure.
unsigned short wIndex;
} SORTABLE_LEXICON_ENTRY_T;
/* One lexicon entry of the dynamic (build-time) dictionary.
   NOTE(review): the field descriptions below are inferred from the member
   names only; nothing in this part of the file states them -- confirm. */
typedef struct tagLEXICON_ENTRY
{
// Presumably the word itself as a wide-character string.
wchar_t *pwszWord;
// Presumably the pinyin codes of the word; cPinyinNum looks like its count.
unsigned short *pwPinyinCode;
// Presumably the (log-scaled) probability of the word.
TYPE_PROB LogProb;
// POS (part of speech)
// Up to PATTREE_MAX_CHARPOSNUM part-of-speech codes; cPOSNum looks like the
// number of slots in use.
unsigned char acPOS[PATTREE_MAX_CHARPOSNUM];
// Presumably parallel to acPOS: one log-probability per part-of-speech code.
unsigned long anPOSLogProb[PATTREE_MAX_CHARPOSNUM];
char cPOSNum;
char cPinyinNum;
} LEXICON_ENTRY_T;
/* Index structure of the dictionary. The dictionary is stored as a tree in
   which each node represents a lexicon entry or the leading part of one. */
typedef struct tagDIC_IND_T
{
/* Nodes of the dictionary tree. Each array Ind[i] holds the nodes for all
   characters that occupy position i in the character sequence of some word
   of the dictionary. An array of nodes is reached through the nIndex member
   (of the IND_T struct) of a node in the previous array, so a word must be
   walked from its first character to its last.
   Example: Ind[1] holds the indices for the second character of words; the
   nIndex member of an Ind[1][] element points into the next array, Ind[2].
   The number of elements in each array Ind[i] is given by the matching
   count entry (the original comment calls it nIndNum[i]; the member
   declared below is wIndNum).
   NOTE: the element count of Ind[0] is always PATTREE_MAX_CODE_NUM, and
   Ind[0] is not reached through a previous array (none exists) but through
   the code of the first character of a word.
*/
IND_T *Ind[PATTREE_MAX_IND_LEN];
/* Element count of each Ind array. The original comment adds that "the last
   one, nIndNum[PATTREE_MAX_IND_LEN], is the total number of the valid
   strWord".
   NOTE(review): index PATTREE_MAX_IND_LEN would be one past the end of this
   array as declared; the comment may predate a size change or mean the last
   valid element -- confirm before relying on it. */
unsigned short wIndNum[PATTREE_MAX_IND_LEN];
} DIC_IND_T;
/* Dynamic (build-time) dictionary. It owns pointers to the lexicon entries,
   two index trees with their word-index tables, the pinyin list, per-character
   tables and the part-of-speech statistics.
   NOTE(review): the original has no comments on this struct; the grouping
   notes below are inferred from member names and from the parallel
   *_Pinyin / *_Stroke naming used by the static dictionary -- confirm. */
typedef struct tagDIC_T
{
// Lexicon entries and their count.
LEXICON_ENTRY_T *pLexicons;
unsigned short nLexiconNum;
long nTotalCount;
// NOTE(review): presumably the offset and scale used when converting
// probabilities to integer log values -- confirm.
long nLogProbOffset;
double fLogProbScale;
// First index tree with its word-index and max-series-log-prob tables.
DIC_IND_T *pDicFullDigi;
unsigned short nActualDigiNum;
unsigned short *pwWordIndex;
TYPE_PROB *pMaxSeriesLogProb;
unsigned short nWordIndexNum;
// Pinyin string list and its length.
char **ppPinyinList;
int nPinyinNum;
// Second index tree ("_2"), same shape as the first one.
// NOTE(review): possibly the stroke-input counterpart of the first tree -- confirm.
DIC_IND_T *pDicFullDigi_2;
unsigned short nActualDigiNum_2;
unsigned short *pwWordIndex_2;
TYPE_PROB *pMaxSeriesLogProb_2;
unsigned short nWordIndexNum_2;
// Per-character tables.
int nCharNum;
unsigned short wCharCodeOffset;
char *pcCharStrokeNum;
char **ppCharStrokeList;
unsigned short *pwCharPinyinCode;
// Part-of-speech bigram table (PATTREE_MAX_POSTYPE x PATTREE_MAX_POSTYPE,
// stored flat) and unigram table.
long anPOSBigram[PATTREE_MAX_POSTYPE*PATTREE_MAX_POSTYPE];
long anPOSUnigram[PATTREE_MAX_POSTYPE];
// Indices of common single characters.
unsigned short awCommonSingleCharIndex[PATTREE_MAX_COMMON_SINGLE_CHAR];
} DIC_T;
// Table of 20 common single Chinese characters, one TCHAR string per entry.
// NOTE(review): presumably the source for awCommonSingleCharIndex, in which
// case the entry count must not exceed PATTREE_MAX_COMMON_SINGLE_CHAR -- confirm.
static const TCHAR CHS_COMMON_SINGLE_CHAR[][3] = { _T("的"), _T("在"), _T("和"), _T("了"), _T("是"), _T("为"), _T("中"), _T("不"), _T("有"), _T("人"),
_T("上"), _T("对"), _T("大"), _T("一"), _T("与"), _T("地"), _T("等"), _T("以"), _T("新"), _T("到") };
// Symbol table for Chinese input (12 entries).
static const TCHAR CHS_SYMBOL[][3] = { _T(","), _T("。"), _T("?"), _T("!"), _T(" "), _T("“"), _T("”"), _T(":"), _T(";"), _T("‘"), _T("’"), _T("@") };
// Symbol table for English input (9 entries).
static const TCHAR EN_SYMBOL[][2] = { _T(","), _T("."), _T("?"), _T("!"), _T("\""), _T(":"), _T(";"), _T("'"), _T("@") };
#endif // _WIN32 (inner conditional)
#endif // _WINDOWS (outer conditional)
/***************************************Dynamic Dic END*********************************************/
/***************************************Sentence*********************************************/
// The structures below are the working buffers of the input method. They exist because the input method is (in principle) not allowed to
// allocate dynamic memory; at run time it works only inside one pre-allocated block of memory -- the SENTENCE_T structure.
// Current-word state. The current word is either the first word of whole-sentence input (no other word follows it, i.e. it is also the
// last word) or the last word. Prediction in this state allows unrestricted forward association.
// In this struct each comment follows the member it describes.
typedef struct tagCUR_STATE
{
unsigned short nIndex[PATTREE_MAX_WORD_LEN];
// Patricia tree index for each digit level. E.g. nIndex[3] is, for the level-3 digit (counting from level 0) of the current word's
// digit string, the index into the level-3 array of the IND_T structure.
unsigned short nIndex_User[PATTREE_MAX_WORD_LEN];
// Same as the member above, except that it refers to the user dictionary.
char szDigi[PATTREE_MAX_WORD_LEN];
// Digit string.
unsigned char nDigiNum;
// Number of digits in the digit string.
unsigned char byBegin;
// Index, within the digit string held by SENTENCE_T, of the first digit of the current word.
unsigned char byPredictNum;
// Number of prediction entries.
char reserved2[3];
// Padding for structure byte alignment.
unsigned short wCandNum;
// Number of candidate entries.
unsigned short wUnifiedCandNum;
// Number of unified-candidate entries. Unified candidates merge predictions and candidates: the first 2 are the first 2 candidates,
// the remaining predictions and candidates are ordered by word frequency.
unsigned short awWordIdCand[PATTREE_MAX_CAND_NUM];
// WordIDs of the candidates.
short nStepPredict[PATTREE_MAX_PREDICTION_NUM];
// For each prediction, the level of its digit string, i.e. the digit-string length minus 1. Example: after entering "96" ("wo"),
// one prediction is "我们" ("women"), so this entry is 4.
unsigned short nIndexPredict[PATTREE_MAX_PREDICTION_NUM];
// For each prediction, the Patricia tree index of its digit string.
unsigned short wWordIdPredict[PATTREE_MAX_PREDICTION_NUM];
// For each prediction, its WordID.
unsigned short awUnifiedCandNo[PATTREE_MAX_UNIFIED_CAND_NUM];
// Used for unified candidates. When the high byte is 0, the low byte is an index into the candidates array; when the high byte is 1,
// it is an index into the predictions array.
} CUR_STATE;
typedef struct tagNEXTPREDICT
// Used for "next" prediction: the known input is a few Chinese characters, and possible words are predicted from them.
// Example: for "中", predict "国", "华", etc.; in effect the prediction is derived from the possible words "中国" and "中华".
// In this struct each comment follows the member it describes.
{
unsigned short nLexcionEntry[PATTREE_MAX_PREDICTION_NUM];
// Original index of each predicted word.
unsigned char byPredictNum;
// Number of predictions.
} NEXTPREDICT;
typedef struct tagSENTENCE_NODE_T
// A node of the Viterbi algorithm.
// In this struct each comment follows the member it describes.
{
unsigned char byStartStep;
// Start position of the digit string within the Sentence (0-based).
unsigned char byEndStep;
// End position of the digit string within the Sentence.
unsigned short nWordIndex;
// Original index of the word.
long nFreezedLogProb;
// If the user has selected a word at this position, that word is pinned here; a very large value is stored as the word's frequency.
long nMaxLogProb;
// Accumulated value in the Viterbi algorithm.
struct tagSENTENCE_NODE_T *pPreNode;
// Previous node.
} SENTENCE_NODE_T;
/* Working buffer of the input method for one sentence: output buffer,
   Viterbi nodes, reverse-string tree indices, the current / previous / next
   word prediction state, and the dictionary offsets in effect for the active
   input mode. Each comment inside the struct follows the member it describes. */
typedef struct tagSENTENCE_T
{
OUTBUFFER OutBuffer;
#ifdef PATTREE_POSBIGRAM
SENTENCE_NODE_T Nodes[PATTREE_MAX_INPUT_LEN][PATTREE_MAX_POSTYPE];
// When the Viterbi algorithm uses part-of-speech bigrams, the Markov model is first-order and each node has as many states as there
// are part-of-speech types (PATTREE_MAX_POSTYPE), so PATTREE_MAX_POSTYPE best solutions are stored at every node.
// Each digit of the digit string is one node, giving PATTREE_MAX_INPUT_LEN nodes in total.
#else
SENTENCE_NODE_T Nodes[PATTREE_MAX_INPUT_LEN];
// Without part of speech the optimisation uses word unigrams directly, so each node needs only 1 best solution.
#endif
unsigned short nIndexOfInvertString[PATTREE_MAX_WORD_LEN];
// Suppose the digit string of the current Sentence is "2345678765432": nIndexOfInvertString[0] is the Patricia tree index of the
// last digit "2" (level 0), and nIndexOfInvertString[1] is the Patricia tree index of the last 2 digits "32" (level 1).
unsigned short nIndexOfInvertString_User[PATTREE_MAX_WORD_LEN];
// The only difference from the member above is that this one refers to the user-defined dictionary.
CUR_STATE CurWord;
// Prediction state for the current word.
CUR_STATE PreWord;
// Prediction state for the previous word.
NEXTPREDICT NextWord;
// Prediction state for the next word.
// The parameters below are actually stored in the dictionary (the STATIC_DIC_T structure); their values differ depending on the
// input mode (pinyin or stroke).
long nDicOffset_awWordIndex;
long nDicOffset_aMaxSeriesLogProb;
long nDicOffset_awIndNum;
long nDicOffset_anIndOffset;
short nMaxIndLen;
long nWordIndexNum;
long nDicOffset_awWordIndex_User;
long nDicOffset_aMaxSeriesLogProb_User;
long nDicOffset_awIndNum_User;
long nDicOffset_anIndOffset_User;
} SENTENCE_T;
/***************************************Sentence END*********************************************/
/***************************************************************EN DIC**********************************************************************************/
/* One index entry of the (English) dictionary; it corresponds to one
   character in the dictionary tree.
   When both __SYMBIAN32__ and __GCC32__ are defined, the type is declared
   packed and aligned to PATTREE_ALIGNBYTES; on other toolchains the compiler's
   default layout applies.
   NOTE(review): the per-member notes below are inferred from names only --
   confirm against the tree-walking code. */
typedef struct tagEN_IND_T
{
// Presumably a byte-quantised log-probability of this node.
unsigned char bLogProb;
// Presumably the byte-quantised maximum log-probability over the subtree/series.
unsigned char bMaxSeriesLogProb;
// Presumably the character code this node stands for.
char cCode;
unsigned char cIndex;
unsigned short nIndex;
// Presumably the index of the parent (previous-level) node.
unsigned short nPrevIndex;
}
#ifdef __SYMBIAN32__
#ifdef __GCC32__
__attribute__ ((__aligned__(PATTREE_ALIGNBYTES), __packed__))
#endif
#endif
EN_IND_T;
/* Working buffer for the English dictionary section: output buffer, an index
   array, and fixed-size text buffers for unified candidates, candidates and
   predictions.
   NOTE(review): the original has no comments here; this looks like the
   English-input counterpart of SENTENCE_T (both begin with an OUTBUFFER), and
   the member notes below are inferred from names -- confirm. */
typedef struct tagEN_SENTENCE_T
{
OUTBUFFER OutBuffer;
// Up to PATTREE_MAX_EN_ENTRY indices; nEntryNum is presumably the count in use.
long anIndex[PATTREE_MAX_EN_ENTRY];
int nEntryNum;
int nWordNum;
// Text buffers: one row of PATTREE_MAX_WORDEN_LEN TCHARs per entry.
TCHAR szUnifiedCandBuffer[PATTREE_CAND_NUM][PATTREE_MAX_WORDEN_LEN];
TCHAR szCandBuffer[PATTREE_CAND_NUM][PATTREE_MAX_WORDEN_LEN];
TCHAR szPredictBuffer[PATTREE_PREDICT_NUM][PATTREE_MAX_WORDEN_LEN];
// NOTE(review): CUR_STATE has a member of the same name whose high byte selects
// candidates vs. predictions; the same encoding may apply here -- confirm.
unsigned short awUnifiedCandNo[PATTREE_MAX_UNIFIED_CAND_NUM_EN];
// Per-prediction step and tree index, presumably paired by position.
short anStepPredict[PATTREE_MAX_PREDICTION_NUM];
long anIndexPredict[PATTREE_MAX_PREDICTION_NUM];
int nPredictNum;
int nUnifiedCandNum;
} EN_SENTENCE_T;
/***************************************************************EN DIC END***********************************************************************************/
/***************************************************************HW SENTENCE**********************************************************************************/
#ifdef PATTREE_HANDWRITING
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -