📄 splitword.c

📁 另外一个中文分词程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/******************************************
	http://www.sqlet.com
	mail:199909@gmail.com
	
	中文分词测试版	
	author:linfj
	词典文件:sqlet.dict
*******************************************/
#include <stdio.h>
#include <string.h>
#include <malloc.h>

// Project-local boolean type: BOOL, and its alias bool, expand to plain char.
// NOTE(review): defining `bool` as a macro will clash if <stdbool.h> is ever
// included (and `bool` is a keyword in C23) - confirm before modernising.
#define BOOL				char
#define bool				BOOL
#define TRUE				1
#define FALSE				0
#define MAX_CWORD_LEN			10				//longest word, in bytes (strWord buffers hold MAX_CWORD_LEN+1 chars)
#define MAX_SWORD_LEN			256				//longest sentence
// Size of each dimension of the CH_DICT and SEG_LIST bucket tables.
#define	MAX_CDIM			90

//English stop words: words that are not to be indexed.
//The table holds the 26 lowercase letters, the ten digits, and a list of
//common English words, all in lowercase.
//NOTE(review): no code in the visible part of the file consults this table yet.
char *arrayEnglishStop[] = {
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 
"1", "2", "3", "4", "5", "6", "7", "8", "9", "0",  
"about", "above", "after", "again", "all", "also", "am", "an", "and", "any", "are", "as", "at", 
"back", "be", "been", "before", "behind", "being", "below", "but", "by", 
"can", "click", "do", "does", "done", "each", "else", "etc", "ever", "every", 
"few", "for", "from", "generally", "get", "go", "gone", "has", "have", "hello", "here", "how", 
"if", "in", "into", "is", "just", "keep", "later", "let", "like", "lot", "lots", "made", 
"make", "makes", "many", "may", "me", "more", "most", "much", "must", "my", "need", "no", "not", 
"now", "of", "often", "on", "only", "or", "other", "others", "our", "out", "over", "please", "put", 
"so", "some", "such", "than", "that", "the", "their", "them", "then", "there", "these", "they", 
"this", "try", "to", "up", "us", "very", "want", "was", "we", "well", "what", "when", "where", 
"which", "why", "will", "with", "within", "you", "your", "yourself"
};

//Chinese stop words: characters or words that do not need to be indexed
//when the dictionary index is built.
//NOTE(review): the byte values of these literals depend on the encoding this
//source file is saved in - presumably it must match the dictionary file; confirm.
char *arrayChineseStop[] = {
"的","吗","么","啊","说","对","在","和","是",
"被","最","所","那","这","有","将","会","与",
"於","于","他","她","它","您","为","欢迎"
};

//Full-width ASCII. All of these are to be converted to half-width ASCII
//characters; other symbols such as , . ; / | still have to be added later.
//Everything whose area code is 163 is to be converted to ASCII.
//163	！ ＂ ＃ ￥ ％ ＆ ＇ （ ） ＊ ＋ ， － ． ／ ０ １ ２ ３ ４ ５ ６ ７ ８ ９ ： ； ＜ ＝ ＞ ？ ＠ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ ［ ＼ ］ ＾ ＿ ｀ ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ ｛ ｜ ｝ ￣ 
//ASCII counterparts of the start of that row:  ! " # $ % & ' ( ) * + , - . / 0 1 2 ...
//The table currently lists only full-width a-z followed by the full-width
//full stop, then full-width A-Z followed by the full-width hyphen.
//NOTE(review): the elements are declared unsigned char * but initialised from
//plain string literals; compilers typically warn about the pointer signedness.
unsigned char *arrayWideAscii[] = {
"ａ","ｂ","ｃ","ｄ","ｅ","ｆ","ｇ","ｈ","ｉ","ｊ","ｋ","ｌ","ｍ","ｎ","ｏ","ｐ","ｑ","ｒ","ｓ","ｔ","ｕ","ｖ","ｗ","ｘ","ｙ","ｚ","．",
"Ａ","Ｂ","Ｃ","Ｄ","Ｅ","Ｆ","Ｇ","Ｈ","Ｉ","Ｊ","Ｋ","Ｌ","Ｍ","Ｎ","Ｏ","Ｐ","Ｑ","Ｒ","Ｓ","Ｔ","Ｕ","Ｖ","Ｗ","Ｘ","Ｙ","Ｚ","－"
};

//Punctuation marks (ASCII and Chinese). Take care with the three symbols
//+ - " : searching needs them to express conditions such as exclusive-or.
//The table below holds single ASCII punctuation characters only.

char arrayAsciiSymbol[] ={
'!','\\','*','(',')','-','_','+','=','{','}','[',']',':',';','\'','\"',',','<','>','.','?','/','|','@','#','$','%','^','&'
};

//BIG5与GB对照,把所有big5转成gb后进行处理
//$arrayBig5ToG = array ();
//UTF8的转换

//Chinese dictionary
//One node of a singly linked list of dictionary words (see CH_DICT).
//NOTE(review): the tag _WORD_NODE (underscore + uppercase) is an identifier
//reserved for the implementation in C.
typedef struct _WORD_NODE
{
	char	strWord[MAX_CWORD_LEN+1];	// NUL-terminated word, at most MAX_CWORD_LEN bytes
	// todo: arrays for two-, three-, four- and five-character words could be added to make lookups faster
	struct  _WORD_NODE	*nextWord;	// next node in the same list, NULL at the tail
}WORD_NODE;
//Segmentation result
//One node of a singly linked list of words produced by segmentation (see SEG_LIST).
typedef struct _SEG_NODE
{
	char	strWord[MAX_CWORD_LEN+1];	// NUL-terminated word, at most MAX_CWORD_LEN bytes
	struct	_SEG_NODE	*nextWord;	// next node in the same list, NULL at the tail
}SEG_NODE;

// Dictionary bucket table. addDictWord and searchWord index it as
// CH_DICT[first byte - 161][last byte - 161]; each cell heads a linked list
// of WORD_NODEs. As a file-scope object it starts out with all heads NULL.
struct _CH_DICT 
{
	WORD_NODE	*lstWord;	// head of the bucket's word list, NULL when empty
}CH_DICT[MAX_CDIM][MAX_CDIM];

// Segmentation result bucket table. addSegWord indexes it as
// SEG_LIST[first byte % MAX_CDIM][last byte % MAX_CDIM]; each cell heads a
// linked list of SEG_NODEs. As a file-scope object it starts out with all
// heads NULL.
struct _SEG_LIST 
{
	SEG_NODE	*lstWord;	// head of the bucket's word list, NULL when empty
}SEG_LIST[MAX_CDIM][MAX_CDIM];

/*同义词典
	咖啡馆,咖啡屋
	神六，神舟六号
synonymous
*/
/*
 * Strip leading and trailing whitespace from str, in place.
 *
 * Whitespace means ' ', '\r', '\n' and '\t'; whitespace between other
 * characters is kept. The surviving characters are moved to the start of the
 * buffer and re-terminated. Nothing beyond the new terminator is written, so
 * an empty string in a one-byte buffer is safe.
 *
 * str      writable NUL-terminated string; NULL is tolerated.
 * returns  str itself (NULL when str is NULL).
 */
char  *strTrim(char str[])
{
	size_t	head = 0;
	size_t	tail;

	if (str == NULL)
		return NULL;

	// skip the leading run of whitespace
	while (str[head] == ' ' || str[head] == '\r' ||
	       str[head] == '\n' || str[head] == '\t')
		head++;

	// tail is one past the last kept character; it never moves below head,
	// which covers empty and all-whitespace input
	tail = head + strlen(str + head);
	while (tail > head &&
	       (str[tail-1] == ' ' || str[tail-1] == '\r' ||
	        str[tail-1] == '\n' || str[tail-1] == '\t'))
		tail--;

	// source and destination overlap, hence memmove rather than memcpy
	if (head > 0)
		memmove(str, str + head, tail - head);
	str[tail - head] = '\0';

	return str;
}

int addDictWord(char *strWord , int len)
{
	unsigned char firstChar,lastChar;
	WORD_NODE* curLst;
	
	WORD_NODE* newWord ,*curTmp ;
	
	firstChar = strWord[0] ;
	lastChar  = strWord[len-1];
	
	if (firstChar < 161 || lastChar < 161 )	//非汉字或是汉字的全角符号
		  return -1;
	
	newWord = (WORD_NODE*)malloc(sizeof(WORD_NODE));
	if ( newWord == NULL)
		return -1;
	strcpy(newWord->strWord,strWord);
	newWord->nextWord = NULL;
	
	firstChar -= 161 ;
	lastChar  -= 161 ;
	curLst = CH_DICT[firstChar][lastChar].lstWord;

	if( curLst == NULL)		//reinit list;
	{
		CH_DICT[firstChar][lastChar].lstWord = newWord ;

		return 0;
	}
	curTmp = curLst ;
	while(curTmp -> nextWord != NULL )
	{
		curTmp = curTmp->nextWord;
	}

	curTmp -> nextWord = newWord ;
	
	return 0;
}

int addSegWord(unsigned char *strWord , int len)
{
	unsigned char firstChar,lastChar;
	SEG_NODE* curLst;
	
	SEG_NODE* newWord ,*curTmp ;
	
	firstChar = strWord[0] ;
	lastChar  = strWord[len-1];

	//查看是否已经存在	
	firstChar %= MAX_CDIM ;
	lastChar  %= MAX_CDIM ;
	curLst = SEG_LIST[firstChar][lastChar].lstWord;

	curTmp = curLst ;
	while(curTmp != NULL )
	{
//		if ( strcasecmp(curTmp->strWord,(char *)strWord) == 0)
		if ( strcmp(curTmp->strWord,(char *)strWord) == 0)
			 return 0;		//已经存在
		curTmp = curTmp->nextWord;
	}

	newWord = (SEG_NODE*)malloc(sizeof(SEG_NODE));
	if ( newWord == NULL)
		return -1;
	strcpy(newWord->strWord,(char *)strWord);
	newWord->nextWord = NULL;
	
	if( curLst == NULL)		//reinit list;
	{
		SEG_LIST[firstChar][lastChar].lstWord = newWord ;

		return 0;
	}
	curTmp = curLst ;
	while(curTmp -> nextWord != NULL )
	{
		curTmp = curTmp->nextWord;
	}

	curTmp -> nextWord = newWord ;
	
	return 0;
}

/*
 * Release every node of every list in CH_DICT and reset all bucket heads
 * to NULL.
 *
 * returns  TRUE, always.
 */
int freeDict()
{
	int row, col;

	for (row = 0; row < MAX_CDIM; row++)
	{
		for (col = 0; col < MAX_CDIM; col++)
		{
			WORD_NODE *node = CH_DICT[row][col].lstWord;

			while (node != NULL)
			{
				// remember the successor before the node is released
				WORD_NODE *next = node->nextWord;

				free(node);
				node = next;
			}
			CH_DICT[row][col].lstWord = NULL;
		}
	}
	return TRUE;
}

/*
 * Print and release the segmentation result held in SEG_LIST.
 *
 * Buckets are visited in index order. Each word is written to stdout followed
 * by '|', and a newline is written after the last word of every non-empty
 * bucket. Every node is freed and all bucket heads are reset to NULL.
 *
 * returns  TRUE, always.
 */
int freeSeg()
{
	int row, col;

	for (row = 0; row < MAX_CDIM; row++)
	{
		for (col = 0; col < MAX_CDIM; col++)
		{
			SEG_NODE *node = SEG_LIST[row][col].lstWord;

			while (node != NULL)
			{
				// remember the successor before the node is released
				SEG_NODE *next = node->nextWord;

				printf("%s|",node->strWord);
				if (next == NULL)
					printf("\n");

				free(node);
				node = next;
			}
			SEG_LIST[row][col].lstWord = NULL;
		}
	}
	return TRUE;
}

BOOL searchWord( unsigned char *strWord,int len )
{
	WORD_NODE *curLst,*curTmp;
	unsigned char firstChar,lastChar;
	
	firstChar = strWord[0] ;
	lastChar  = strWord[len-1];

	firstChar -= 161 ;
	lastChar  -= 161 ;
	curLst = CH_DICT[firstChar][lastChar].lstWord;

	curTmp = curLst;
	while ( curTmp != NULL ) 
	{
		if ( strcmp((char *)strWord,curTmp->strWord) == 0)
				return TRUE;
		curTmp = curTmp -> nextWord ;
	}
	return FALSE;
}

int segWord ( unsigned char *strText  , int iWordLen , BOOL bChinese )
{
	int i = 0 ,j = 0 , k = 0 , l = 0;
	unsigned char strChar[MAX_CWORD_LEN+1],strChar1[5],strChar2[5],strChar3[7];	
	BOOL	bFound = FALSE;
	
	i = iWordLen  ; 
	if ( FALSE == bChinese )
	{		//英文	
		//检查 是否在stop数组里
		addSegWord(strText,iWordLen);
		return 0;	
	}
12 下一页
💿 文件大小 832 K
👤 上传用户 feigo156
📂 所属分类多国语言处理
📄 代码行数 583 行
💻 语言类型 C语言
🏷️ 相关标签

#分 #程序
更多分资源 →
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -