⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dictionary.h

📁 中科院分词系统VC++版本
💻 H
字号:
//////////////////////////////////////////////////////////////////////
//ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
//             功能有:中文分词;词性标注;未登录词识别。
//             分词正确率高达97.58%(973专家评测结果),
//             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
//             处理速度为31.5Kbytes/s。
//著作权:  Copyright (C) 2002-2005 中科院计算所 职务著作权人:张华平 刘群
//遵循协议:自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the possession or use of this file requires 
 * a written license from the author.
 * Filename: Dictionary.h
 * Abstract:
 *           dictionary class definition
 *			 interface for the CDictionary class.
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-1-8
 *
 * Notes:
 *                
 * 
 ****************************************************************************/
#if !defined(AFX_DICTIONARY_H__80E88BC1_784E_4C96_868B_D7CD66DD6725__INCLUDED_)
#define AFX_DICTIONARY_H__80E88BC1_784E_4C96_868B_D7CD66DD6725__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#define CC_NUM  6768
//The number of Chinese Char,including 5 empty position between 3756-3761
#define WORD_MAXLENGTH 100
#define WT_DELIMITER 0
#define WT_CHINESE   1
#define WT_OTHER     2
#define CC_ID(c1,c2) ((unsigned char)(c1)-176)*94+((unsigned char)(c2)-161)
//The ID equation of Chinese Char 
#define CC_CHAR1(id) (id)/94+176
//The first char computed by the Chinese Char ID
#define CC_CHAR2(id) (id)%94+161
//The second char computed by the Chinese Char ID 

/*data structure for word segmentation and tag result*/
//Add in 2002-6-20
struct tagWordResult{
	char sWord[WORD_MAXLENGTH];
	//The word 
	int nHandle;
	//the POS of the word
	double  dValue;
	//The -log(frequency/MAX)
};
typedef struct tagWordResult WORD_RESULT,*PWORD_RESULT;

/*data structure for word item*/
struct tagWordItem{
	int nWordLen;
	char *sWord;
	//The word 
	int nHandle;
	//the process or information handle of the word
	int  nFrequency;
	//The count which it appear
};
typedef struct tagWordItem WORD_ITEM,*PWORD_ITEM;
/*data structure for dictionary index table item*/
struct tagIndexTable{
    int nCount;
	//The count number of words which initial letter is sInit
    PWORD_ITEM pWordItemHead;
	//The  head of word items
};
typedef struct tagIndexTable INDEX_TABLE;

/*data structure for word item chain*/
struct tagWordChain{
       WORD_ITEM data;
       struct tagWordChain *next;
};
typedef struct tagWordChain WORD_CHAIN,*PWORD_CHAIN;
/*data structure for dictionary index table item*/
struct tagModifyTable{
    int nCount;
	//The count number of words which initial letter is sInit
	int nDelete;
    //The number of deleted items in the index table
	PWORD_CHAIN pWordItemHead;
	//The  head of word items
};
typedef struct tagModifyTable MODIFY_TABLE,*PMODIFY_TABLE;

class CDictionary  
{
public:
	bool Optimum();
	bool Merge(CDictionary dict2,int nRatio);
	bool OutputChars(char *sFilename);
	bool Output(char *sFilename);
	int GetFrequency(char *sWord,  int nHandle);
	bool GetPOSString(int nPOS,char *sPOSRet);
	int GetPOSValue(char *sPOS);
	bool GetMaxMatch(char *sWord, char *sWordRet, int *npHandleRet);
	bool MergePOS(int nHandle);
	bool GetHandle(char *sWord,int *pnCount,int *pnHandle,int *pnFrequency);
	bool IsExist(char *sWord,int nHandle);
	bool AddItem(char *sWord,int nHandle,int nFrequency=0);
	bool DelItem(char *sWord,int nHandle);
	bool Save(char *sFilename);
	bool Load(char *sFilename,bool bReset=false);
	int  GetWordType(char *sWord);
	bool PreProcessing(char *sWord,int *nId,char *sWordRet,bool bAdd=false);
	CDictionary();
    virtual ~CDictionary();
	INDEX_TABLE   m_IndexTable[CC_NUM];
    PMODIFY_TABLE m_pModifyTable;
	//The data for modify  
protected:
	bool DelModified();
	bool FindInOriginalTable(int nInnerCode,char *sWord,int nHandle,int *nPosRet=0);
	bool FindInModifyTable(int nInnerCode,char *sWord,int nHandle,PWORD_CHAIN *pFindRet=0);
};

#endif // !defined(AFX_DICTIONARY_H__80E88BC1_784E_4C96_868B_D7CD66DD6725__INCLUDED_)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -