📄 pattree.c

📁 Symbian平台数字键盘手机输入法源码
💻 C
📖 第 1 页 / 共 5 页
字号:
	long			nWordIndexNum_Stroke;

	long			nOffset_awIndNum_Stroke;
	long			nOffset_anIndOffset_Stroke;

	// p-o-s 
	long			nPOSNum;
		//词性数目
	long			nOffset_aPOSBigram;
		// 词性Bigram数组的偏移量，数值也Log整型化，原数值按如下公式计算：(-wPOSBigram) * pDic->fLogProbScale，在实际计算中只采用(-wPOSBigram)
	long			nOffset_aPOSUnigram;
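		// Offset of the POS unigram array. Values are also stored as integer log values; the original value is computed as (-wPOSUnigram) * pDic->fLogProbScale, but the actual computation uses only (-wPOSUnigram)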
	long			nPunctuationPOSIndex;
		// 标点符号的词性码，标点符号用于整句优化中，假设整句一定以标点符号结束。
	long			nCompoundPOSIndex;
		// 用户自定义词的词性码。

	// handwriting dic
	long			nOffset_HandwritingDic;
		// 手写字典的偏移量，手写字典由第3方提供

	long			nReserved1[8];

	unsigned short	awCommonSingleCharIndex[PATTREE_MAX_COMMON_SINGLE_CHAR];
		// 常见字索引数组，索引对应词条动态数组中的索引。这个数组用于在没有任何联想时，给出常用字作为联想

	wchar_t			awszSymbol[PATTREE_MAX_SYMBOL_NUM][2];
} STATIC_READONLY_DIC_HEAD_T;

/*
 * Static dictionary object: the three header structures followed by pointers
 * to the raw dictionary data buffers.
 *
 * NOTE(review): the commented-out declarations inside the struct appear to
 * document the layout of the variable-length data held in those buffers
 * (array bounds refer to header fields) -- confirm against the loader code.
 */
typedef struct tagSTATIC_DIC_T
{
	STATIC_DIC_HEAD_T Head;
	STATIC_READONLY_DIC_HEAD_T ReadOnlyHead;
	STATIC_USER_DIC_HEAD_T UserHead;
	
	// Dictionary data buffer.
	unsigned char	*abData;
	// NOTE(review): presumably non-zero when abData was allocated by this module -- confirm.
	int				nIs_Data_AllocBuffer;
	// User dictionary data buffer.
	unsigned char	*abData_User;
	// NOTE(review): presumably non-zero when abData_User was allocated by this module -- confirm.
	int				nIs_Data_User_AllocBuffer;
	
	/* dynamic data*/

	//unsigned short			awCharPinyinCode[nCharNum];
	//unsigned short			awCharStrokeCode[nCharNum];
	//long						anPinyinListOffset[nPinyinNum];
	//char 					    azPinyinList[];

	//long						anlexiconsOffset[nLexiconNum];
	//STATIC_LEXICON_ENTRY_T	aLexicons[];

	//unsigned short			awIndNum_Pinyin[nMaxIndLen_Pinyin];
	//long						anIndOffset_Pinyin[nMaxIndLen_Pinyin];
	//IND_T						aInd_Pinyin[];
	
	//unsigned short			awIndNum_Stroke[nMaxIndLen_Stroke];
	//long						anIndOffset_Stroke[nMaxIndLen_Stroke];
	//IND_T						aInd_Stroke[];

	//TYPE_PROB					aPOSBigram[nPOSNum][nPOSNum];
	//TYPE_PROB					aPOSUnigram[nPOSNum];

	//char						abHandWritingDic;

	/* rewritable data */
	//TYPE_PROB					aLexiconLogprob[nLexiconNum];

	//unsigned short			awWordIndex_Pinyin[nWordIndexNum_Pinyin];
	//TYPE_PROB					aMaxSeriesLogProb_Pinyin[nWordIndexNum_Pinyin];

	//unsigned short			awWordIndex_Stroke[nWordIndexNum_Stroke];
	//TYPE_PROB					aMaxSeriesLogProb_Stroke[nWordIndexNum_Stroke];

	//TYPE_PROB					aLexiconLogprob_User[nUSER_DIC_MAX_ENTRY+1];
	//long						anLexiconsOffset_User[nUSER_DIC_MAX_ENTRY+1];
	
	//unsigned short			awWordIndex_Pinyin_User[nUSER_DIC_MAX_ENTRY*2];
	//unsigned char				abWordIndex_PinyinIndex_User[nUSER_DIC_MAX_ENTRY*2];  // which pinyin of the word this WordIndex entry corresponds to
	//TYPE_PROB					aMaxSeriesLogProb_Pinyin_User[nUSER_DIC_MAX_ENTRY*2];
	//unsigned short			awIndNum_Pinyin_User[nMaxIndLen_Pinyin];
	//long						anIndOffset_Pinyin_User[nMaxIndLen_Pinyin+1];
	
	//unsigned short			awWordIndex_Stroke_User[nUSER_DIC_MAX_ENTRY];
	//TYPE_PROB					aMaxSeriesLogProb_Stroke_User[nUSER_DIC_MAX_ENTRY];
	//unsigned short			awIndNum_Stroke_User[nMaxIndLen_Stroke];
	//long						anIndOffset_Stroke_User[nMaxIndLen_Stroke+1];

	//STATIC_LEXICON_ENTRY_T	aLexicons_User[];
	//IND_T						aInd_Pinyin_User[];
	//IND_T						aInd_Stroke_User[];
} STATIC_DIC_T;

/***************************************basic datastructure END*****************************************/

/***************************************Dynamic Dic*********************************************/
//动态词典部分用于生成词典时用。此处主要区别于静态词典的是结构简单，易调用生成结构，但需要动态分配内存。不适合嵌入式应用。
#ifdef _WINDOWS
#ifdef _WIN32

typedef struct tagSORTABLE_LEXICON_ENTRY_T
	// Structure used for sorting by digit string and by word frequency.
{
	// Digit string used as a sort key.
	char *szDigiString;
	// NOTE(review): presumably the log-probability (word frequency) sort key -- confirm.
	TYPE_PROB LogProb;
	// NOTE(review): presumably the index of the lexicon entry being sorted -- confirm.
	unsigned short wIndex;
} SORTABLE_LEXICON_ENTRY_T;

/* One lexicon entry of the dynamic (build-time) dictionary. */
typedef struct tagLEXICON_ENTRY
{
	// Wide-character text of the word.
	wchar_t *pwszWord;

	// NOTE(review): presumably cPinyinNum pinyin codes for this word -- confirm.
	unsigned short *pwPinyinCode;
	// NOTE(review): presumably the log-probability of the word -- confirm.
	TYPE_PROB LogProb;

	// POS
	// NOTE(review): acPOS and anPOSLogProb look like parallel arrays with
	// cPOSNum valid entries -- confirm against the code that fills them.
	unsigned char acPOS[PATTREE_MAX_CHARPOSNUM];
	unsigned long anPOSLogProb[PATTREE_MAX_CHARPOSNUM];
	char cPOSNum;

	char cPinyinNum;
} LEXICON_ENTRY_T;

/* Index of the dictionary, stored as a tree: each node represents a lexicon
   entry or the leading part of a lexicon entry. */
typedef struct tagDIC_IND_T
{
	/* Nodes of the dictionary tree. Each array (Ind[0] ~ Ind[PATTREE_MAX_IND_LEN - 1])
	   holds the nodes for all chars that occupy the same position within the
	   char sequence of the words in the dictionary. An array of nodes must be
	   reached through the nIndex member (IND_T struct) of a node in the previous
	   array; that is, a word has to be walked from its first char to its last.
	   For example: Ind[1] holds the index of the second char of a word, and the
	   nIndex member of Ind[1][] points to an index in the next array, Ind[2].
	   The number of nodes in each array Ind[i] is given by wIndNum[i].
	   NOTE: the size of Ind[0] is always PATTREE_MAX_CODE_NUM, and it is not
			 reached through a previous array (none exists) but is addressed
			 by the code of the first char of a word instead.
	*/
	IND_T	*Ind[PATTREE_MAX_IND_LEN];
	/* Number of nodes in each Ind array. The original comment states that the
	   last one, wIndNum[PATTREE_MAX_IND_LEN], is the total number of valid words.
	   NOTE(review): as declared, index PATTREE_MAX_IND_LEN is one past the end
	   of this array -- confirm whether the last valid element
	   (PATTREE_MAX_IND_LEN - 1) or a larger array was intended. */
	unsigned short	wIndNum[PATTREE_MAX_IND_LEN];
}	DIC_IND_T;

/*
 * Dynamic dictionary, used while generating the dictionary (this section is
 * compiled only when both _WINDOWS and _WIN32 are defined).
 *
 * NOTE(review): two parallel member groups exist, one unsuffixed and one with
 * the suffix "_2"; presumably they serve the two input modes (pinyin and
 * stroke) -- confirm which is which.
 */
typedef struct tagDIC_T
{
	// Lexicon entries and their count.
	LEXICON_ENTRY_T	*pLexicons;
	unsigned short	nLexiconNum;
	long			nTotalCount;
	// NOTE(review): presumably the parameters used to convert probabilities
	// to integer log values -- confirm.
	long			nLogProbOffset;
	double			fLogProbScale;

	// First index tree and its associated word-index tables.
	DIC_IND_T		*pDicFullDigi;
	unsigned short  nActualDigiNum;
	unsigned short  *pwWordIndex;
	TYPE_PROB		*pMaxSeriesLogProb;
	unsigned short	nWordIndexNum;

	// Pinyin string list and its length.
	char			**ppPinyinList;
	int				nPinyinNum;

	// Second index tree and its associated word-index tables.
	DIC_IND_T		*pDicFullDigi_2;
	unsigned short  nActualDigiNum_2;
	unsigned short  *pwWordIndex_2;
	TYPE_PROB		*pMaxSeriesLogProb_2;
	unsigned short	nWordIndexNum_2;

	// Per-character tables.
	// NOTE(review): presumably indexed by (char code - wCharCodeOffset) -- confirm.
	int				nCharNum;
	unsigned short	wCharCodeOffset;
	char			*pcCharStrokeNum;
	char			**ppCharStrokeList;

	unsigned short *pwCharPinyinCode;

	// POS bigram table (flattened PATTREE_MAX_POSTYPE x PATTREE_MAX_POSTYPE) and POS unigram table.
	long			anPOSBigram[PATTREE_MAX_POSTYPE*PATTREE_MAX_POSTYPE];
	long			anPOSUnigram[PATTREE_MAX_POSTYPE];

	// Indices of the common single characters.
	unsigned short	awCommonSingleCharIndex[PATTREE_MAX_COMMON_SINGLE_CHAR];
} DIC_T;

/* Table of 20 common single Chinese characters.
   NOTE(review): presumably the source of the entries in awCommonSingleCharIndex -- confirm. */
static const TCHAR CHS_COMMON_SINGLE_CHAR[][3] = { _T("的"), _T("在"), _T("和"), _T("了"), _T("是"), _T("为"), _T("中"), _T("不"), _T("有"), _T("人"), 
											_T("上"), _T("对"), _T("大"), _T("一"), _T("与"), _T("地"), _T("等"), _T("以"), _T("新"), _T("到") };
/* Symbol table for Chinese (CHS) input. */
static const TCHAR CHS_SYMBOL[][3] = { _T("，"), _T("。"), _T("?"), _T("!"), _T(" "), _T("“"), _T("”"), _T("："), _T("；"), _T("‘"), _T("’"), _T("@") };
/* Symbol table for English (EN) input. */
static const TCHAR EN_SYMBOL[][2] = { _T(","), _T("."), _T("?"), _T("!"), _T("\""), _T(":"), _T(";"), _T("'"), _T("@") };

#endif //_WIN32
#endif //_WINDOWS

/***************************************Dynamic Dic END*********************************************/

/***************************************Sentence*********************************************/
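// The structures below are the working buffers of the input method. They exist because the input method is (in principle) not allowed to allocate dynamic memory; at run time it works only with one pre-allocated block of memory -- the SENTENCE_T structure.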

// State of the current word. The current word is either the first word of the sentence being entered (no other word follows it, i.e. it is also the last word) or the last word; in this state prediction may extend forward without limit.
typedef struct tagCUR_STATE
{
	unsigned short nIndex[PATTREE_MAX_WORD_LEN];
		// Index in the Patricia tree for each digit level. E.g. nIndex[3] is the index, within the level-3 array of the IND_T structure, of the level-3 digit (levels count from 0) of the current word's digit string.
	unsigned short nIndex_User[PATTREE_MAX_WORD_LEN];
		// Same as the member above, except that it refers to the user dictionary.
	
	char szDigi[PATTREE_MAX_WORD_LEN];
		// Digit string
	unsigned char nDigiNum;
		// Number of digits in the digit string
	unsigned char byBegin;
		// Index, within the digit string of SENTENCE_T, of the first digit of the current word.

	unsigned char byPredictNum;
		// Number of prediction entries
	char reserved2[3];
		// Padding for structure byte alignment
	unsigned short wCandNum;
		// Number of candidate entries
	unsigned short wUnifiedCandNum;
		// Number of unified candidates. The unified list merges predictions and candidates: its first 2 entries are the first 2 candidates, the remaining predictions and candidates are ordered by word frequency.
	
	unsigned short awWordIdCand[PATTREE_MAX_CAND_NUM];
		// WordIDs of the candidates
	
	short nStepPredict[PATTREE_MAX_PREDICTION_NUM];
		// Level of the digit string of each prediction, i.e. the digit-string length minus 1. Example: after entering "96" ("wo"), one prediction is "我们" ("women"), for which this value is 4.
	unsigned short nIndexPredict[PATTREE_MAX_PREDICTION_NUM];
		// Index in the Patricia tree of the digit string of each prediction
	unsigned short wWordIdPredict[PATTREE_MAX_PREDICTION_NUM];
		// WordID of each prediction

	unsigned short awUnifiedCandNo[PATTREE_MAX_UNIFIED_CAND_NUM];
		// Used for the unified candidates. When the high byte is 0 the low byte is an index into the candidates array; when the high byte is 1 it is an index into the predictions array.
	
} CUR_STATE;

typedef struct tagNEXTPREDICT
	// Used for the "next" prediction mode: a few Chinese characters are already known and are used to predict the possible words. Example: for "中" predict "国", "华", etc.;
	// the prediction is in fact based on the possible words "中国" and "中华".
{
	unsigned short nLexcionEntry[PATTREE_MAX_PREDICTION_NUM];
		// Original index of the word

	unsigned char byPredictNum;
		// Number of predictions
} NEXTPREDICT;

/* A node of the Viterbi algorithm. */
typedef struct tagSENTENCE_NODE_T SENTENCE_NODE_T;

struct tagSENTENCE_NODE_T
{
	/* Start position of the digit string within the sentence (counted from 0). */
	unsigned char byStartStep;

	/* End position of the digit string within the sentence. */
	unsigned char byEndStep;

	/* Original index of the word. */
	unsigned short nWordIndex;

	/* If the user has selected a word at this position, that word is pinned
	   here; a very large value is stored as the frequency of the word. */
	long nFreezedLogProb;

	/* Accumulated value of the Viterbi algorithm. */
	long nMaxLogProb;

	/* Previous node. */
	SENTENCE_NODE_T *pPreNode;
};

/* Working buffer of the input method for one sentence. */
typedef struct tagSENTENCE_T
{
	OUTBUFFER OutBuffer;
	
#ifdef PATTREE_POSBIGRAM
	SENTENCE_NODE_T Nodes[PATTREE_MAX_INPUT_LEN][PATTREE_MAX_POSTYPE];
		// When the Viterbi algorithm runs on POS bigrams the corresponding Markov model is first-order and each node has as many states as there are POS types (PATTREE_MAX_POSTYPE), so PATTREE_MAX_POSTYPE best
		// solutions have to be stored at each node. Every digit of the digit string is one node, PATTREE_MAX_INPUT_LEN nodes in total.
#else
	SENTENCE_NODE_T Nodes[PATTREE_MAX_INPUT_LEN];
		// Without POS the optimisation uses the word unigram directly, so each node needs only 1 best solution.
#endif
	unsigned short nIndexOfInvertString[PATTREE_MAX_WORD_LEN];
		// Suppose the digit string of the current sentence is "2345678765432": nIndexOfInvertString[0] is the Patricia tree index of the last digit "2" (level 0),
		// nIndexOfInvertString[1] is the Patricia tree index of the last 2 digits "32" (level 1).
	unsigned short nIndexOfInvertString_User[PATTREE_MAX_WORD_LEN];
		// The only difference from the member above is that this one refers to the user-defined dictionary.

	CUR_STATE CurWord;
		// Current-word prediction
	CUR_STATE PreWord;
		// Previous-word prediction
	NEXTPREDICT NextWord;
		// Next-word prediction

	// These parameters are all actually stored in the dictionary (the STATIC_DIC_T structure); their values merely differ between input modes (pinyin or stroke).
	long			nDicOffset_awWordIndex;
	long			nDicOffset_aMaxSeriesLogProb;
	long			nDicOffset_awIndNum;
	long			nDicOffset_anIndOffset;
	short			nMaxIndLen;
	long			nWordIndexNum;

	long			nDicOffset_awWordIndex_User;
	long			nDicOffset_aMaxSeriesLogProb_User;
	long			nDicOffset_awIndNum_User;
	long			nDicOffset_anIndOffset_User;
} SENTENCE_T;

/***************************************Sentence END*********************************************/

/***************************************************************EN DIC**********************************************************************************/
/*
 * One index entry of the English dictionary; it corresponds to a single char
 * in the dictionary tree.
 *
 * On Symbian GCC builds the type is packed and aligned to PATTREE_ALIGNBYTES;
 * on every other toolchain the compiler's natural layout is used.
 *
 * NOTE(review): the member meanings are not stated here; bLogProb and
 * bMaxSeriesLogProb look like byte-sized log-probability values, cCode like
 * the char code, and nIndex / nPrevIndex like links to other tree entries --
 * confirm against the lookup code.
 */
typedef struct tagEN_IND_T
{
	unsigned char	bLogProb;
	unsigned char	bMaxSeriesLogProb;

	char			cCode;
	unsigned char	cIndex;
	unsigned short	nIndex;
	unsigned short	nPrevIndex;
}
#if defined(__SYMBIAN32__) && defined(__GCC32__)
__attribute__ ((__aligned__(PATTREE_ALIGNBYTES), __packed__))
#endif
EN_IND_T;

/*
 * Working buffer for English (EN) input.
 * NOTE(review): role inferred from the name and the "EN DIC" section banner;
 * the member notes below are likewise inferred from names -- confirm.
 */
typedef struct tagEN_SENTENCE_T
{
	OUTBUFFER OutBuffer;

	// Entry indices and the number of them in use (presumably nEntryNum -- confirm).
	long anIndex[PATTREE_MAX_EN_ENTRY];
	int nEntryNum;
	int nWordNum;
	// Text buffers for unified candidates, candidates and predictions; each row holds up to PATTREE_MAX_WORDEN_LEN TCHARs.
	TCHAR szUnifiedCandBuffer[PATTREE_CAND_NUM][PATTREE_MAX_WORDEN_LEN];
	TCHAR szCandBuffer[PATTREE_CAND_NUM][PATTREE_MAX_WORDEN_LEN];
	TCHAR szPredictBuffer[PATTREE_PREDICT_NUM][PATTREE_MAX_WORDEN_LEN];
	
	// NOTE(review): presumably encoded like CUR_STATE.awUnifiedCandNo (high byte selects candidates vs. predictions) -- confirm.
	unsigned short awUnifiedCandNo[PATTREE_MAX_UNIFIED_CAND_NUM_EN];

	// Per-prediction level and index, plus the prediction and unified-candidate counts.
	short anStepPredict[PATTREE_MAX_PREDICTION_NUM];
	long anIndexPredict[PATTREE_MAX_PREDICTION_NUM];
	int nPredictNum;
	int nUnifiedCandNum;
}	EN_SENTENCE_T;
/***************************************************************EN DIC END***********************************************************************************/

/***************************************************************HW SENTENCE**********************************************************************************/
#ifdef PATTREE_HANDWRITING
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -