📄 splitword.c
字号:
return FALSE;
}
/*
 * segWord - split one collected word into segments and hand each segment
 * to addSegWord().
 *
 * strText  : bytes of the word.
 * iWordLen : number of bytes in strText.
 * bChinese : FALSE -> the word is passed to addSegWord() unchanged.
 *            TRUE  -> the word is matched against the dictionary through
 *                     searchWord(), working backwards from its end.
 *
 * Chinese text is handled in 2-byte steps (one step per character, going by
 * the original comments).  For every position the longest window of at most
 * MAX_CWORD_LEN bytes is tried first, then shorter ones, down to a single
 * 2-byte character, which is always emitted as its own segment.  A final
 * left-over single byte is not processed.
 *
 * Always returns 0.
 */
int segWord ( unsigned char *strText , int iWordLen , BOOL bChinese )
{
    unsigned char window[MAX_CWORD_LEN+1]; /* candidate taken from the tail of the unprocessed text */
    unsigned char headPair[5];             /* first 4 bytes of an 8-byte window */
    unsigned char tailPart[5];             /* last 4 bytes of an 8-byte window; reused for the last 2 bytes */
    unsigned char headTriple[7];           /* first 6 bytes of an 8-byte window */
    BOOL hit = FALSE;
    int remaining = iWordLen;              /* bytes of strText not yet segmented */
    int span = 0;                          /* size in bytes of the current window */
    int consumed = 0;                      /* bytes emitted for the current window, 0 = nothing yet */
    int n = 0;

    if ( FALSE == bChinese ) {
        /* English word: no dictionary lookup, emitted as a whole. */
        addSegWord(strText, iWordLen);
        return 0;
    }

    while ( remaining > 1 ) {
        for ( span = MAX_CWORD_LEN ; span >= 2 ; span -= 2 ) {
            if ( remaining < span )
                continue;

            /* Copy the last `span` unprocessed bytes into the window. */
            for ( n = 0 ; n < span ; n ++ )
                window[n] = strText[remaining - span + n];
            window[span] = '\0';

            consumed = 0;
            switch ( span ) {
            case 8:
                /*
                 * Four characters: always test the first two and the last
                 * two on their own.  This avoids taking the last three
                 * characters as a word when the first character really
                 * belongs to a two-character word, and it also catches a
                 * window that is simply two two-character words.
                 */
                for ( n = 0 ; n < 4 ; n ++ ) {
                    headPair[n] = window[n];
                    tailPart[n] = window[4 + n];
                }
                headPair[4] = '\0';
                tailPart[4] = '\0';

                hit = searchWord(headPair , 4 );
                if ( TRUE == hit ) {
                    hit = searchWord(tailPart , 4 );
                    if ( TRUE == hit ) {
                        /* 2 + 2: both halves are words, emit them separately. */
                        addSegWord(headPair,4);
                        addSegWord(tailPart,4);
                        consumed = span;
                    }
                } else {
                    /* First pair is not a word: try the first three characters. */
                    for ( n = 0 ; n < 6 ; n ++ )
                        headTriple[n] = window[n];
                    headTriple[6] = '\0';

                    hit = searchWord(headTriple,6);
                    if ( TRUE == hit ) {
                        /* 3 + 1: emit the three-character word, then the last character alone. */
                        addSegWord(headTriple,6);
                        tailPart[0] = window[6];
                        tailPart[1] = window[7];
                        tailPart[2] = '\0';
                        addSegWord(tailPart,2);
                        consumed = span;
                    }
                }
                break;

            case 2:
                /* A single character is always emitted as-is. */
                addSegWord(window,2);
                consumed = span;
                break;

            default:
                /*
                 * Includes the three-character case (6 bytes), which has no
                 * special handling: per the original notes, the 1+2 / 2+1
                 * ambiguity is expected to be caught by the four-character
                 * case above.  Open question kept from the original: place
                 * names with and without a suffix (e.g. "Beijing" versus
                 * "Beijing City") should ideally be recognised as the same.
                 */
                break;
            }

            if ( 0 == consumed ) {
                /* No special case applied: look the whole window up in the dictionary. */
                hit = searchWord(window , span );
                if ( TRUE == hit ) {
                    addSegWord(window,span);
                    consumed = span;
                }
                /*
                 * TODO (kept from the original) when the window is not found:
                 *   1. check whether it consists only of foreign words
                 *   2. check whether it consists only of digits
                 *   3. if it starts with a surname, treat it as a person name
                 *   4. detect other kinds of names
                 */
            }

            if ( consumed > 0 ) {
                remaining -= consumed;
                break;
            }
        } /* end for span */
    } /* end while remaining > 1 */

    return 0;
}
/*
 * initSegList - reset every cell of the SEG_LIST table.
 *
 * Sets the lstWord pointer of all MAX_CDIM x MAX_CDIM entries to NULL.
 *
 * Returns 0.  (The original fell off the end of this int function without
 * a return statement, which is undefined behavior as soon as a caller
 * reads the result.)
 */
int initSegList()
{
    int i , j ;

    for ( i = 0 ; i < MAX_CDIM ; i ++ ) {
        for ( j = 0 ; j < MAX_CDIM ; j ++ ) {
            SEG_LIST[i][j].lstWord = (SEG_NODE *)NULL;
        }
    }
    return 0;
}
/*
 * loadDict - load the dictionary file into CH_DICT.
 *
 * strFilename : path of the dictionary file; each line is trimmed with
 *               strTrim() and treated as one entry.
 *
 * All CH_DICT cells are reset to NULL first, then every line whose trimmed
 * length is within 1..MAX_CWORD_LEN bytes is handed to addDictWord().
 * Lines outside that range are reported on stdout and skipped.
 *
 * Returns 0 on success, -1 when the file cannot be opened.
 */
int loadDict(char *strFilename)
{
    int len = 0 , i , j ;
    FILE *fpDict = NULL;
    char sLine[128];

    for ( i = 0 ; i < MAX_CDIM ; i ++ ) {
        for ( j = 0 ; j < MAX_CDIM ; j ++ ) {
            CH_DICT[i][j].lstWord = (WORD_NODE *)NULL;
        }
    }

    fpDict = fopen(strFilename,"r");
    if ( fpDict == NULL ) {
        printf("open dict file %s error !\n",strFilename);
        return -1;
    }

    /*
     * Loop on the fgets() result rather than on feof(): feof() only turns
     * true after a read has already failed, so testing it first processes
     * the previous buffer contents one extra time at end of file.
     */
    while ( fgets(sLine, sizeof(sLine), fpDict) != NULL ) {
        strTrim(sLine);
        len = (int)strlen(sLine);
        if ( len < 1 || len > MAX_CWORD_LEN ) {
            /* Empty or over-long entry: report it and do not add it. */
            printf("%s error!\n",sLine);
            continue;
        }
        addDictWord(sLine,len);
    }

    fclose(fpDict);
    return 0;
}
/*
 * isEnglishStop - report whether strWord is an English stop word.
 *
 * Placeholder: the argument is not examined and the result is always FALSE.
 * NOTE(review): the original comment mentions "arrayEnglishStop",
 * presumably the table this lookup is meant to use - confirm before
 * implementing.
 */
BOOL isEnglishStop(unsigned char *strWord)
{
    (void)strWord;  /* not consulted yet */
    return FALSE ;
}
/*
 * isAsciiSymbol - test whether cChar occurs in arrayAsciiSymbol.
 *
 * Returns TRUE when cChar equals one of the array's elements, FALSE
 * otherwise.
 *
 * The function is an ordinary external definition.  The original used a
 * bare `inline`, which in C99/C11 provides no external definition, so a
 * call the compiler chooses not to inline fails at link time.
 */
BOOL isAsciiSymbol (char cChar)
{
    /* Element count, correct whatever the element type of the table is. */
    const size_t count = sizeof(arrayAsciiSymbol) / sizeof(arrayAsciiSymbol[0]);
    size_t i;

    for ( i = 0 ; i < count ; i ++ ) {
        if ( cChar == arrayAsciiSymbol[i] )
            return TRUE;
    }
    return FALSE;
}
/*
 * segSentence - scan a text, cut it into words and pass each word to
 * segWord().
 *
 * strText : NUL-terminated input.  Bytes below 128 are handled as single
 *           ASCII characters; a byte of 128 or above is taken together with
 *           the following byte as one 2-byte character (the constants used
 *           look like a GB2312/GBK-style encoding - TODO confirm).
 * bSpace  : TRUE  -> a space or tab after Chinese text, and the full-width
 *                    space 0xA1 0xA1, end the current word.
 *           FALSE -> they are skipped, so a Chinese word may continue
 *                    across them.
 *           CR and LF after Chinese text are always skipped.
 *
 * A word ends at ASCII whitespace or an ASCII symbol (isAsciiSymbol), at a
 * 2-byte character whose first byte is below 176 (non-Hanzi, e.g.
 * punctuation), and at every switch between ASCII and 2-byte text.
 *
 * Words longer than MAX_SWORD_LEN bytes are truncated: bytes that no longer
 * fit are dropped while collecting.  The original wrote them past the end
 * of the buffer and only clamped the length when flushing.
 *
 * Always returns 0.
 */
int segSentence (char *strText ,BOOL bSpace )
{
    int iTextLen = strlen(strText) ;
    int iWordLen= 0 ;           /* bytes collected in strWord so far, never above MAX_SWORD_LEN */
    int i = 0 ;
    int iNexti = 0 ;
    BOOL bSep = FALSE ;         /* current character is a separator / ends a word */
    BOOL bChinese = FALSE ;     /* type of the previous effective character: FALSE = ASCII, TRUE = 2-byte */
    BOOL bFound = FALSE ;
    unsigned char strWord[MAX_SWORD_LEN + 1];   /* word being collected */
    unsigned char strChar[3];                   /* current 2-byte character */
    unsigned char cChar;                        /* current byte */

    memset(strWord,0,sizeof(strWord));
    strChar[0] = '\0';

    for ( i = 0 ; i < iTextLen ; i ++ ) {
        cChar = (unsigned char )strText[i];
        if (128 > cChar) {  /* ASCII byte */
            if ( ' ' == cChar || '\t' == cChar || '\r' == cChar || '\n' == cChar ){
                if ( TRUE == bChinese ) {   /* previous effective character was 2-byte */
                    if ( TRUE == bSpace && (' ' == cChar || '\t' == cChar))
                        bSep = TRUE ;
                    else
                        continue ;  /* Chinese words may be wrapped or spaced out: skip */
                } else {
                    bSep = TRUE ;
                }
            } else {
                bSep = isAsciiSymbol(cChar);    /* is it a separator symbol? */
            }

            if ( (TRUE == bSep || TRUE == bChinese ) && 0 < iWordLen ) {    /* a word ends here */
                strWord[iWordLen] = '\0';
                segWord(strWord,iWordLen,bChinese);
                iWordLen = 0 ;
            }

            /* Letters of the same word are collected together; excess bytes are dropped. */
            if ( FALSE == bSep && iWordLen < MAX_SWORD_LEN ) {
                strWord[iWordLen] = cChar;
                iWordLen ++ ;
            }
            bChinese = FALSE ;
        } else {    /* byte >= 128: first byte of a 2-byte character */
            if ( FALSE == bChinese && FALSE == bSep && 0 < iWordLen ) {
                /* ASCII text followed by a 2-byte character: the English word ends. */
                strWord[iWordLen] = '\0';
                bFound = isEnglishStop(strWord);
                if ( FALSE == bFound ) {
                    segWord(strWord,iWordLen,bChinese);
                }
                iWordLen = 0 ;
            }

            iNexti = i + 1 ;
            if ( iNexti < iTextLen ) {
                if ( 128 > ((unsigned char)strText[iNexti]) ) {
                    /* Lone byte >= 128 followed by ASCII: ignore it. */
                    continue ;
                }
            } else {
                /* Lone byte >= 128 at the very end of the text: stop scanning. */
                break;
            }

            strChar[0] = strText[i];
            strChar[1] = strText[i+1];
            strChar[2] = '\0';
            bChinese = TRUE ;
            i ++ ;  /* the second byte is consumed here, the loop adds the other step */

            if (strChar[0] == 0xa1 && strChar[1] == 0xa1 ) {   /* full-width space, hex A1 A1 */
                if ( TRUE == bSpace )
                    bSep = TRUE ;
                else
                    continue ;  /* run of full-width spaces: skip */
            } else if ( strChar[0] < 176 ) {    /* Chinese punctuation and other non-Hanzi characters */
                bSep = TRUE ;
            } else {
                bSep = FALSE;
            }

            if ( TRUE == bSep && 0 < iWordLen ) {
                strWord[iWordLen] = '\0';
                segWord(strWord,iWordLen,bChinese);
                iWordLen = 0 ;
            }

            /* Append the character only when both of its bytes still fit. */
            if ( FALSE == bSep && iWordLen + 2 <= MAX_SWORD_LEN ) {
                strWord[iWordLen++] = strChar[0];
                strWord[iWordLen++] = strChar[1];
            }
        }
    } /* end for i */

    if ( 0 < iWordLen ) {   /* a word is still pending */
        strWord[iWordLen] = '\0';
        segWord(strWord,iWordLen,bChinese);
        iWordLen = 0 ;
    }
    return 0;
}
/*
 * main - demo driver: loads the dictionary "sqlet.dict", segments one
 * built-in sample sentence and releases the segment list and dictionary.
 *
 * Returns 0 on success, 1 when the dictionary cannot be loaded (loadDict
 * has already printed the reason in that case).
 */
int main(int argc,char *argv[])
{
    char strText[] = ",毛泽东在北京,小明是中国人,我不想去北京市因为我觉得很没有意思的啊,只是做做样子让我瞧瞧烈火凤凰,常常白日做梦呢";

    (void)argc;     /* command-line arguments are not used */
    (void)argv;

    /* Without a dictionary the segmentation result is meaningless: stop here. */
    if ( loadDict("sqlet.dict") != 0 ) {
        return 1;
    }
    initSegList();
    printf("开始分词\n");
    segSentence (strText ,TRUE ) ;
    freeSeg();
    printf("结束分词\n");
    freeDict();
    return 0 ;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -