⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utility.cpp

📁 中科院分词系统VC++版本
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//////////////////////////////////////////////////////////////////////
//ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
//             功能有:中文分词;词性标注;未登录词识别。
//             分词正确率高达97.58%(973专家评测结果),
//             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
//             处理速度为31.5Kbytes/s。
//著作权:  Copyright (c) 2002-2005 中科院计算所 职务著作权人:张华平 刘群
//遵循协议:自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the possession or use of this file requires 
 * a written license from the author.
 * Filename: Utility.c
 * Abstract:
 *           Utility functions for Chinese Language Processing
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-1-8
 *
 * Notes:
 *                
 ****************************************************************************/
#include "stdafx.h"
#include "Utility.h"
#include <stdio.h>
#include <string.h>
/*********************************************************************
 *
 *  Func Name  : GB2312_Generate
 *
 *  Description:  Generate the GB2312 List file. One text line is written
 *                for every (first byte, second byte) pair with both bytes
 *                in 161..254, in the form "<b1><b2>,<b1>,<b2>": the two
 *                raw bytes followed by their decimal values.
 *
 *  Parameters : sFileName: the file name for the output GB2312 List;
 *                          the file is opened with mode "wt"
 *    
 *  Returns    : bool: true only if the file was opened, every line was
 *                     written and the file was closed without error;
 *                     false if sFileName is NULL or any step failed
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
bool GB2312_Generate(char *sFileName)
{
   FILE *fp;
   unsigned int i,j;
   bool bOk=true;

   if(sFileName==NULL)
	   return false;//no file name given
   if((fp=fopen(sFileName,"wt"))==NULL)
	   return false;//fail while opening the file
   for(i=161;bOk&&i<255;i++)
   {
	   for(j=161;bOk&&j<255;j++)
	   {
		   //%c and %d expect int arguments, so convert explicitly
		   if(fprintf(fp,"%c%c,%d,%d\n",(int)i,(int)j,(int)i,(int)j)<0)
			   bOk=false;//stop at the first write error
	   }
   }
   //fclose flushes buffered data, so a failure here means lost output
   if(fclose(fp)!=0)
	   bOk=false;
   return bOk;
}
/*********************************************************************
 *
 *  Func Name  : CC_Generate
 *
 *  Description:  Generate the Chinese Char List file. One text line is
 *                written for every pair with first byte in 176..254 and
 *                second byte in 161..254, in the form "<b1><b2>,<b1>,<b2>":
 *                the two raw bytes followed by their decimal values.
 *
 *  Parameters : sFileName: the file name for the output CC List;
 *                          the file is opened with mode "wt"
 *    
 *  Returns    : bool: true only if the file was opened, every line was
 *                     written and the file was closed without error;
 *                     false if sFileName is NULL or any step failed
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
bool CC_Generate(char *sFileName)
{
   FILE *fp;
   unsigned int i,j;
   bool bOk=true;

   if(sFileName==NULL)
	   return false;//no file name given
   if((fp=fopen(sFileName,"wt"))==NULL)
	   return false;//fail while opening the file
   for(i=176;bOk&&i<255;i++)
   {
	   for(j=161;bOk&&j<255;j++)
	   {
		   //%c and %d expect int arguments, so convert explicitly
		   if(fprintf(fp,"%c%c,%d,%d\n",(int)i,(int)j,(int)i,(int)j)<0)
			   bOk=false;//stop at the first write error
	   }
   }
   //fclose flushes buffered data, so a failure here means lost output
   if(fclose(fp)!=0)
	   bOk=false;
   return bOk;
}
/*********************************************************************
 *
 *  Func Name  : CC_Find
 *
 *  Description: Find a Chinese sub-string in the Chinese String.
 *               Only matches that start at an even byte offset are
 *               accepted, i.e. the string is treated as a sequence of
 *               two-byte characters. A match at an odd offset straddles
 *               two characters; it is skipped and the search continues.
 *
 *  Parameters :  string:Null-terminated string to search
 *
 * 			      strCharSet:Null-terminated string to search for
 *
 *  Returns    : char *: pointer to the first match at an even offset in
 *                       string, or NULL if there is none or if either
 *                       argument is NULL. An empty strCharSet matches at
 *                       offset 0, as with strstr.
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
 char *CC_Find( char *string, char *strCharSet)
{
   char *cp;

   if(string==NULL||strCharSet==NULL)
	   return NULL;
   cp=strstr(string,strCharSet);
   //A hit at an odd offset is not on a character boundary: keep looking
   //instead of giving up, since an aligned match may still follow.
   while(cp!=NULL&&(cp-string)%2==1)
   {
	  cp=strstr(cp+1,strCharSet);
   }
   return cp;
}
/*********************************************************************
 *
 *  Func Name  : charType
 *
 *  Description: Judge the type of sChar or (sChar,sChar+1)
 *               A first byte below 128 is classified on its own; for
 *               larger values the first byte selects the class, and the
 *               second byte is consulted only when the first byte is 163.
 *
 *  Parameters : sChar: pointer to the first byte of the character to
 *                      classify; sChar[1] is read only if sChar[0]==163
 *    
 *  Returns    : int : the type of char, one of CT_DELIMITER, CT_SINGLE,
 *                     CT_INDEX, CT_NUM, CT_LETTER, CT_CHINESE, CT_OTHER
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
int charType(unsigned char *sChar)
{
  const unsigned char cLead=sChar[0];

  if(cLead<128)
  {//single byte: "\042" is the double quote character
	 //NOTE: strchr also matches the terminating NUL, so a zero byte
	 //is reported as CT_DELIMITER
	 return strchr("\042!,.?()[]{}+=",(int)cLead)!=NULL?CT_DELIMITER:CT_SINGLE;
  }
  if(cLead==162)
	  return CT_INDEX;
  if(cLead==163)
  {//the second byte decides between number, letter and delimiter
	  const unsigned char cTrail=sChar[1];

	  if(cTrail>=176&&cTrail<=185)
		  return CT_NUM;
	  if((cTrail>=193&&cTrail<=218)||(cTrail>=225&&cTrail<=250))
		  return CT_LETTER;
	  return CT_DELIMITER;
  }
  if(cLead==161)
	  return CT_DELIMITER;
  if(cLead>=176&&cLead<=247)
      return CT_CHINESE;
  return CT_OTHER;
}
/*********************************************************************
 *
 *  Func Name  : GetCCPrefix
 *
 *  Description: Get the max Prefix string made up of Chinese Char.
 *               A Chinese char is taken to be a two-byte unit whose first
 *               byte is in 176..247; the second byte is not inspected.
 *               Only complete two-byte units are counted, so the result
 *               never exceeds the length of the sentence.
 *
 *  Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
 *    
 *  Returns    : the end of the sub-sentence, i.e. the byte length of the
 *               leading run of Chinese chars (always even); 0 if
 *               sSentence is NULL or does not start with a Chinese char
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
unsigned int  GetCCPrefix(unsigned char *sSentence)
{
   unsigned int nLen,nCurPos=0;

   if(sSentence==NULL)
	   return 0;
   nLen=(unsigned int)strlen((const char *)sSentence);
   //Require both bytes of the char to be inside the string: a lone lead
   //byte at the end must not push the result past the terminator.
   while(nCurPos+1<nLen&&sSentence[nCurPos]>175&&sSentence[nCurPos]<248)
   {
      nCurPos+=2;//Get next Chinese Char
   }
   return nCurPos;
}
/*********************************************************************
 *
 *  Func Name  : IsAllChinese
 *
 *  Description: Judge whether the string is all made up of Chinese Char.
 *               The string is walked two bytes at a time and the first
 *               byte of each pair must be in 176..247; the second byte
 *               of a pair is not inspected.
 *
 *  Parameters : sString: the null-terminated string to test
 *    
 *  Returns    : bool: true if every byte pair passes the check and no
 *                     byte is left over (an empty string gives true);
 *                     false otherwise, including odd-length strings
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllChinese(unsigned char *sString)
{
	const unsigned int nTotal=strlen((const char *)sString);
	unsigned int nPos;

	//Step over complete pairs only; stop at the first pair whose lead
	//byte falls outside the Chinese range.
	for(nPos=0;nPos+1<nTotal;nPos+=2)
	{
		if(sString[nPos]<=175||sString[nPos]>=248)
			break;
	}
	//Anything left unconsumed means a non-Chinese pair or a dangling byte
	return nPos>=nTotal;
}
/*********************************************************************
 *
 *  Func Name  : IsAllNonChinese
 *
 *  Description: Judge whether the string contains no Chinese Char.
 *               The string is scanned character by character: a byte
 *               above 128 starts a two-byte unit, any other byte is a
 *               single-byte unit. A unit whose first byte is in 176..247
 *               counts as a Chinese char.
 *
 *  Parameters : sString: the null-terminated string to test
 *    
 *  Returns    : bool: false as soon as a unit starting with a byte in
 *                     176..247 is met, true otherwise (an empty string
 *                     gives true)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllNonChinese(unsigned char *sString)
{
	const unsigned int nTotal=strlen((const char *)sString);
	unsigned int nPos=0;

	while(nPos<nTotal)
	{
		const unsigned char cLead=sString[nPos];

		if(cLead>175&&cLead<248)
			return false;//found a Chinese lead byte
		//skip the second byte of a two-byte unit without inspecting it
		nPos+=(cLead>128)?2:1;
	}
    return true;
}
/*********************************************************************
 *
 *  Func Name  : IsAllSingleByte
 *
 *  Description: Judge whether the string is all made up of Single Byte
 *               Char, i.e. every byte before the terminator is below 128
 *
 *  Parameters : sString: the null-terminated string to test
 *    
 *  Returns    : bool: true if no byte of the string is 128 or above
 *                     (an empty string gives true), false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllSingleByte(unsigned char *sString)
{
	const unsigned char *pCur;

	//Walk up to the terminator; the first byte with the high bit set
	//settles the answer.
	for(pCur=sString;*pCur!=0;++pCur)
	{
		if(*pCur>=128)
			return false;
	}
	return true;
}
/*********************************************************************
 *
 *  Func Name  : IsAllNum
 *
 *  Description: Judge the string is all made up of Num Char
 *              
 *
 *  Parameters : sString: the null-terminated string to test
 *    
 *  Returns    : bool: whether the string is judged to be made up of Num Char only
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllNum(unsigned char *sString)
{

	unsigned int nLen=strlen((const char *)sString),i=0;
	char sChar[3];
	sChar[2]=0;
	if(i<nLen)//Get prefix such as + -
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(!strstr("±+—-+",sChar))
		{
			i=0;
		}
	}
	while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
	{
		i+=2;
	}
	if(i<nLen)//Get middle delimiter such as .
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
		{//98.1%
			while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
			{
				i+=2;
			}
		}	
		else
		{
			i-=strlen(sChar);
		}
	}

	if(i>=nLen)
		return true;
	while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
	{//single byte number char
		i+=1;
	}
	if(i<nLen)//Get middle delimiter such as .
	{
		sChar[0]=sString[i++];
		if(sChar[0]<0)//Get first char
			sChar[1]=sString[i++];
		else
			sChar[1]=0;
		if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
		{//98.1%
			while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
			{
				i+=1;
			}
		}	
		else
		{
			i-=strlen(sChar);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -