⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segment.h

📁 为自然语言处理领域的中文分词程序
💻 H
字号:
#ifndef _SEGMENT_H
#define _SEGMENT_H

#include <stdio.h>  
#include <stdlib.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
/* Upper bound on the number of files: 5. */
#define MAX_FILE_NUM  5

/* Integer boolean constants. */
#define TRUE   1
#define FALSE  0
/*
 * Mapping between a two-byte character (lead byte c1, trail byte c2) and a
 * dense integer id:
 *
 *     id = (c1 - 176) * 94 + (c2 - 161)
 *
 * Both bytes are converted to unsigned char first, so plain (possibly signed)
 * char arguments work. CC_CHAR1/CC_CHAR2 invert the mapping and give back the
 * lead and trail byte of an id.
 *
 * NOTE(review): 176 (0xB0), 161 (0xA1), 94 ids per lead byte and
 * CC_NUM == 72 * 94 are consistent with the GB2312 hanzi area -- confirm the
 * encoding the callers actually feed in.
 *
 * Every replacement list is fully parenthesized so that the macros can be used
 * inside larger expressions (e.g. 2 * CC_ID(a, b)) without precedence
 * surprises. Arguments are evaluated exactly once.
 */
#define CC_ID(c1, c2) \
	(((unsigned char)(c1) - 176) * 94 + ((unsigned char)(c2) - 161))
/* Lead (high) byte of the character with the given id. */
#define CC_CHAR1(id) ((id) / 94 + 176)
/* Trail (low) byte of the character with the given id. */
#define CC_CHAR2(id) ((id) % 94 + 161)
/* Number of distinct ids (used below as the size of the dictionary index). */
#define CC_NUM  6768
/* Maximum length of a word buffer. */
#define WORD_MAXLENGTH  100

/* Word-type (WT_*) codes: delimiter, Chinese, anything else. */
#define WT_DELIMITER  0
#define WT_CHINESE    1
#define WT_OTHER      2
/*
 * Character-type (CT_*) codes.
 *
 * Sentence begin is 1 and sentence end is 4. The remaining codes are expressed
 * as offsets from CT_SINGLE (5). Each derived value is parenthesized so it
 * stays a single operand when used in a larger expression
 * (2 * CT_DELIMITER is 12, not 2 * 5 + 1).
 */
#define CT_SENTENCE_BEGIN  1
#define CT_SENTENCE_END    4

#define CT_SINGLE     5
#define CT_DELIMITER  (CT_SINGLE + 1)
#define CT_CHINESE    (CT_SINGLE + 2)
#define CT_LETTER     (CT_SINGLE + 3)
#define CT_NUM        (CT_SINGLE + 4)
#define CT_INDEX      (CT_SINGLE + 5)
#define CT_OTHER      (CT_SINGLE + 12)
/*
 * Single-character suffixes, packed back to back in one string literal.
 * NOTE(review): the entries look like place-name suffixes (city, village,
 * island, river, ...) -- confirm against the code that consumes this table.
 * NOTE(review): presumably scanned one multi-byte character at a time in the
 * same double-byte encoding CC_ID assumes -- confirm the file encoding before
 * re-saving this header.
 */
#define POSTFIX_SINGLE "坝邦堡杯城池村单岛道堤店洞渡队法峰府冈港阁宫沟国海号河湖环集江奖礁角街井郡坑口矿里岭楼路门盟庙弄牌派坡铺旗桥区渠泉人山省市水寺塔台滩坛堂厅亭屯湾文屋溪峡县线乡巷型洋窑营屿语园苑院闸寨站镇州庄族陂庵町"
/*
 * Multi-character suffixes, written as a brace-enclosed initializer list of
 * string literals. The list ends with an empty string "" that serves as the
 * terminator entry.
 */
#define POSTFIX_MUTIPLE {"半岛","草原","城市","大堤","大公国","大桥","地区","帝国","渡槽","港口","高速公路","高原","公路","公园","共和国","谷地","广场","国道","海峡","胡同","机场","集镇","教区","街道","口岸","码头","煤矿","牧场","农场","盆地","平原","丘陵","群岛","沙漠","沙洲","山脉","山丘","水库","隧道","特区","铁路","新村","雪峰","盐场","盐湖","渔场","直辖市","自治区","自治县","自治州",""}                           
/*
 * Character set for English transliteration.
 * NOTE(review): presumably the characters that may appear in names
 * transliterated from English -- confirm against the consumer.
 */
#define  TRANS_ENGLISH   "·—阿埃艾爱安昂敖奥澳笆芭巴白拜班邦保堡鲍北贝本比毕彼别波玻博勃伯泊卜布才采仓查差柴彻川茨慈次达大戴代丹旦但当道德得的登迪狄蒂帝丁东杜敦多额俄厄鄂恩尔伐法范菲芬费佛夫福弗甫噶盖干冈哥戈革葛格各根古瓜哈海罕翰汗汉豪合河赫亨侯呼胡华霍基吉及加贾坚简杰金京久居君喀卡凯坎康考柯科可克肯库奎拉喇莱来兰郎朗劳勒雷累楞黎理李里莉丽历利立力连廉良列烈林隆卢虏鲁路伦仑罗洛玛马买麦迈曼茅茂梅门蒙盟米蜜密敏明摩莫墨默姆木穆那娜纳乃奈南内尼年涅宁纽努诺欧帕潘畔庞培佩彭皮平泼普其契恰强乔切钦沁泉让热荣肉儒瑞若萨塞赛桑瑟森莎沙山善绍舍圣施诗石什史士守斯司丝苏素索塔泰坦汤唐陶特提汀图土吐托陀瓦万王旺威韦维魏温文翁沃乌吾武伍西锡希喜夏相香歇谢辛新牙雅亚彦尧叶依伊衣宜义因音英雍尤于约宰泽增詹珍治中仲朱诸卓孜祖佐伽娅尕腓滕济嘉津赖莲琳律略慕妮聂裴浦奇齐琴茹珊卫欣逊札哲智兹芙汶迦珀琪梵斐胥黛"
/* Character set for Russian transliteration (same layout as TRANS_ENGLISH). */
#define  TRANS_RUSSIAN   "·阿安奥巴比彼波布察茨大德得丁杜尔法夫伏甫盖格哈基加坚捷金卡科可克库拉莱兰勒雷里历利连列卢鲁罗洛马梅蒙米姆娜涅宁诺帕泼普奇齐乔切日萨色山申什斯索塔坦特托娃维文乌西希谢亚耶叶依伊以扎佐柴达登蒂戈果海赫华霍吉季津柯理琳玛曼穆纳尼契钦丘桑沙舍泰图瓦万雅卓兹"
/* Character set for Japanese transliteration (same layout as TRANS_ENGLISH). */
#define  TRANS_JAPANESE  "安奥八白百邦保北倍本比滨博步部彩菜仓昌长朝池赤川船淳次村大代岛稻道德地典渡尔繁饭风福冈高工宫古谷关广桂贵好浩和合河黑横恒宏后户荒绘吉纪佳加见健江介金今进井静敬靖久酒菊俊康可克口梨理里礼栗丽利立凉良林玲铃柳隆鹿麻玛美萌弥敏木纳南男内鸟宁朋片平崎齐千前浅桥琴青清庆秋丘曲泉仁忍日荣若三森纱杉山善上伸神圣石实矢世市室水顺司松泰桃藤天田土万望尾未文武五舞西细夏宪相小孝新星行雄秀雅亚岩杨洋阳遥野也叶一伊衣逸义益樱永由有佑宇羽郁渊元垣原远月悦早造则泽增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜濑鸠筱" 
/* Transliteration-type codes: English = 0, Russian = 1, Japanese = 2. */
#define TT_ENGLISH   0
#define TT_RUSSIAN   1
#define TT_JAPANESE  2
/*句子分割符 "。!?:;…"*/
#define  SEPERATOR_C_SENTENCE "。!?:;…"
/*子句分割符 "、,()“”‘’"*/
#define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
#define  SEPERATOR_E_SENTENCE "!?:;"
#define  SEPERATOR_E_SUB_SENTENCE ",()\042'"
#define  SEPERATOR_LINK "\n\r  " 
#define SENTENCE_BEGIN "始##始"
#define SENTENCE_END "末##末"
/* Maximum number of words in one sentence. */
#define MAX_WORDS_PER_SENTENCE    120
/* Maximum number of unknown-word entries in one sentence. */
#define MAX_UNKNOWN_PER_SENTENCE  200
/*
 * Maximum number of positions per word (second dimension of the per-word
 * tables in struct Span).
 * NOTE(review): "POS" may mean part of speech rather than position -- confirm.
 */
#define MAX_POS_PER_WORD          20
/* Minimum word frequency. */
#define LITTLE_FREQUENCY          6
/* Maximum word frequency. */
#define MAX_FREQUENCE             2079997
/* Maximum sentence length. */
#define MAX_SENTENCE_LEN          620
/* Maximum number of words. */
#define MAX_WORDS                 650
/* Maximum number of segmentations. */
#define MAX_SEGMENT_NUM           10
/* Separator string placed between words. */
#define WORD_SEGMENTER            "@"
/*
 * MIN_PROBLEM selects the value of INFINITE_VALUE at preprocessing time:
 * 10000.00 when MIN_PROBLEM is 1, 0.00 otherwise.
 * NOTE(review): presumably 1 means the search minimizes a cost, so "infinity"
 * must be a large number -- confirm against the path-search code.
 */
#define MIN_PROBLEM 1

#if MIN_PROBLEM == 1
#  define INFINITE_VALUE 10000.00
#else
#  define INFINITE_VALUE 0.00
#endif

/* Tagged-word result: the word text, its part-of-speech handle and a weight. */
typedef struct tagWordResult {
	/* Word text, stored inline. */
	char sWord[WORD_MAXLENGTH];
	/* Part-of-speech handle. */
	int nHandle;
	/* Weight of this result. */
	double dValue;
} WORD_RESULT, *PWORD_RESULT;


/* Dictionary entry: word length, word text, part of speech and frequency. */
typedef struct tagWordItem {
	/* Length of the word. */
	int nWordLen;
	/* Word text (pointer; storage is owned elsewhere). */
	char *sWord;
	/* Part-of-speech handle. */
	int nHandle;
	/* Word frequency. */
	int nFrequency;
} WORD_ITEM, *PWORD_ITEM;
 
/* Index table: an entry count plus a pointer to the first word item. */
typedef struct tagIndexTable {
	/* Number of word items. */
	int nCount;
	/* Head pointer of the word items. */
	PWORD_ITEM pWordItemHead;
} INDEX_TABLE;
 
/* Singly linked list node holding one word item by value. */
typedef struct tagWordChain {
	/* Payload of this node. */
	WORD_ITEM data;
	/* Next node in the chain. */
	struct tagWordChain *next;
} WORD_CHAIN, *PWORD_CHAIN;

/* Modification table: entry count, deletion count and head of the word chain. */
typedef struct tagModifyTable {
	/* Number of entries. */
	int nCount;
	/* Number of deleted entries. */
	int nDelete;
	/* Head of the linked list of word items. */
	PWORD_CHAIN pWordItemHead;
} MODIFY_TABLE, *PMODIFY_TABLE;


/* A dictionary is an array of CC_NUM index tables. */
typedef struct Dictionary {
	/* One index table per slot; the array has CC_NUM slots. */
	INDEX_TABLE m_IndexTable[CC_NUM];
} *pDictionary;

/* Part-of-speech tagging context; nodes form a singly linked list. */
typedef struct tagContext {
	/* Integer key identifying the tagging. */
	int nKey;
	/* Context-dependency matrix (array of row pointers). */
	int **aContextArray;
	/* Frequency of each tag. */
	int *aTagFreq;
	/* Total frequency. */
	int nTotalFreq;
	/* Next context node. */
	struct tagContext *next;
} MYCONTEXT, *PMYCONTEXT;

/* Bigram context state used for part-of-speech tagging. */
typedef struct ContextState {
	/* Length of the tagging table. */
	int m_nTableLen;
	/* Symbol table. */
	int *m_pSymbolTable;
	/* Tagging context (see struct tagContext). */
	PMYCONTEXT m_pContext;
	/* Category. */
	int m_nCategory;
} *pContextState;

/* Linked-list node of the dynamic array: one cell addressed by column and row. */
typedef struct tagArrayChain {
	/* Column index of the cell. */
	unsigned int col;
	/* Row index of the cell. */
	unsigned int row;
	/* Numeric value stored in the cell. */
	double value;
	/* Position / POS value attached to the cell. */
	int nPOS;
	/* Length of the word. */
	int nWordLen;
	/* Word text. */
	char *sWord;
	/* Next node in the list. */
	struct tagArrayChain *next;
} ARRAY_CHAIN, *PARRAY_CHAIN;

/* Dynamic array, stored as a linked list of ARRAY_CHAIN cells. */
typedef struct DynamicArray {
	/* Column count. */
	unsigned int m_nCol;
	/* Row count. */
	unsigned int m_nRow;
	/* Row-first flag. NOTE(review): presumably selects row-major ordering -- confirm. */
	int m_bRowFirst;
	/* Head of the cell list. */
	PARRAY_CHAIN m_pHead;
} *pDynamicArray;

/* Queue element; elements are chained through `next`. */
typedef struct tagQueueElem {
	/* Parent value. NOTE(review): presumably the parent vertex -- confirm. */
	unsigned int nParent;
	/* Index of the element. */
	unsigned int nIndex;
	/* Weight. */
	double eWeight;
	/* Next element in the queue. */
	struct tagQueueElem *next;
} QUEUE_ELEMENT, *PQUEUE_ELEMENT;

/* Queue of QUEUE_ELEMENT nodes. */
typedef struct Queue {
	/* First element of the queue. */
	PQUEUE_ELEMENT m_pHead;
	/* Most recently accessed element. */
	PQUEUE_ELEMENT m_pLastAccess;
} *pQueue;

/* Tagging type: normal, person name, place name, transliterated person name. */
enum TAG_TYPE {
	TT_NORMAL       = 0,
	TT_PERSON       = 1,
	TT_PLACE        = 2,
	TT_TRANS_PERSON = 3
};

/* Per-sentence state of the part-of-speech / unknown-word tagger. */
typedef struct Span {
	/* Index of the current unknown word. */
	int m_nUnknownIndex;
	/* Unknown words, two ints per entry. */
	int m_nUnknownWords[MAX_UNKNOWN_PER_SENTENCE][2];
	/* Possibility (score) of each unknown word. */
	double m_dWordsPossibility[MAX_UNKNOWN_PER_SENTENCE];
	/* Context used for tagging. */
	pContextState m_context;
	/* Kind of unknown word being tagged. */
	enum TAG_TYPE m_tagType;
	/* Start position. */
	int m_nStartPos;
	/* Best tag chosen for each word of the sentence. */
	int m_nBestTag[MAX_WORDS_PER_SENTENCE];
	/* Text of each word of the sentence. */
	char m_sWords[MAX_WORDS_PER_SENTENCE][WORD_MAXLENGTH];
	/* Position of each word of the sentence. */
	int m_nWordPosition[MAX_WORDS_PER_SENTENCE];
	/* Candidate tags: one row per word, one column per candidate slot. */
	int m_nTags[MAX_WORDS_PER_SENTENCE][MAX_POS_PER_WORD];
	/* Best predecessor for each (word, candidate slot) pair. */
	char m_nBestPrev[MAX_WORDS_PER_SENTENCE][MAX_POS_PER_WORD];
	/* Current length. */
	char m_nCurLength;
	/* Frequency for each (word, candidate slot) pair. */
	double m_dFrequency[MAX_WORDS_PER_SENTENCE][MAX_POS_PER_WORD];
} *pSpan;

/* Segmentation graph built over the atoms of one sentence. */
typedef struct SegGraph {
	/* Number of atoms. */
	unsigned int m_nAtomCount;
	/* The graph itself, held in a dynamic array. */
	pDynamicArray m_segGraph;
	/* Atom text: one row per atom, up to MAX_SENTENCE_LEN rows. */
	char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH];
	/* Length of each atom. */
	int m_nAtomLength[MAX_SENTENCE_LEN];
	/* Position / POS value of each atom. */
	int m_nAtomPOS[MAX_SENTENCE_LEN];
} *pSegGraph;

/* State of the N-shortest-paths search. */
typedef struct NShortPath {
	/* Number of result paths. */
	int m_nResultCount;
	/* Edge costs, held in a dynamic array. */
	pDynamicArray m_apCost;
	/* Kind of value. */
	unsigned int m_nValueKind;
	/* Number of vertices. */
	unsigned int m_nVertex;
	/* Array of parent queues. */
	pQueue *m_pParent;
	/* Weight table (array of row pointers). */
	double **m_pWeight;
} *pNShortPath;

/* Unknown (out-of-vocabulary) word recognizer state. */
typedef struct UnknowWord {
	/* Tagger used to assign roles. */
	pSpan m_roleTag;
	/* Position / POS value. */
	int m_nPOS;
	/* Flag string marking unknown words (at most 10 bytes including any terminator). */
	char m_sUnknownFlags[10];
	/* Dictionary. */
	pDictionary m_dict;
} *pUnknowWord;

/* Word-segmentation state. */
typedef struct Segment {
	/* Word-position mapping table. */
	int *m_npWordPosMapTable;
	/* Number of words. */
	int m_nWordCount;
	/* Segmentation results (array of pointers to WORD_RESULT). */
	PWORD_RESULT *m_pWordSeg;
	/* Number of segmentations. */
	int m_nSegmentCount;
	/* Optimized graph, held in a dynamic array. */
	pDynamicArray m_graphOptimum;
	/* Segmentation graph. */
	pSegGraph m_graphSeg;
} *pSegment;

/* Top-level result object tying together segmentation, dictionaries and taggers. */
typedef struct Result {
	/* Possibility (score) of each result, up to MAX_SEGMENT_NUM entries. */
	double m_dResultPossibility[MAX_SEGMENT_NUM];
	/* Operation type. */
	int m_nOperateType;
	/* Output format. */
	int m_nOutputFormat;
	/* Segmenter. */
	pSegment m_Seg;
	/* Segmentation results (array of pointers to WORD_RESULT). */
	PWORD_RESULT *m_pResult;
	/* Core dictionary. */
	pDictionary m_dictCore;
	/* Bigram dictionary. */
	pDictionary m_dictBigram;
	/* Part-of-speech tagger. */
	pSpan m_POSTagger;
	/* Unknown-word recognizer for person names. */
	pUnknowWord m_uPerson;
	/* Unknown-word recognizer for transliterated person names. */
	pUnknowWord m_uTransPerson;
	/* Unknown-word recognizer for place names. */
	pUnknowWord m_uPlace;
	/* Smoothing parameter. */
	double m_dSmoothingPara;
	/* Number of results. */
	int m_nResultCount;
} *pResult;
/*
 * Public entry points. Only the declarations are visible in this header; the
 * notes below are inferred from names and must be confirmed against the
 * implementation.
 */

/* NOTE(review): presumably segments sSource using myResult, filters words
 * found in stopList and writes the output into RemovedResult -- confirm, and
 * confirm the required size of RemovedResult. */
void GetSegmentString(char *sSource, pResult myResult, char **stopList, char *RemovedResult);

/* NOTE(review): presumably initializes p with the given operation type and
 * output format -- confirm. */
void IResult(pResult p, int OperateType, int OutputFormat);

/* NOTE(review): presumably loads stop words from the file at stopListPath into
 * stopList; meaning of the int return value is not visible here -- confirm. */
int LoadStopList(char *stopListPath, char **stopList);

/* NOTE(review): presumably releases what IResult set up in p -- confirm. */
void UResult(pResult p);

/* NOTE(review): presumably strips surrounding whitespace from str; whether the
 * result aliases str is not visible here -- confirm. */
char *Trim(char *str);

#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -