pws.c

来自「这是用C写的中文分词程序」· C语言代码 · 共 323 行
323 行
/*    PonySE word segmenter    Copyright (C) 2007-2008 PonySE    This program is free software: you can redistribute it and/or modify    it under the terms of the GNU General Public License as published by    the Free Software Foundation, either version 3 of the License, or    (at your option) any later version.    This program is distributed in the hope that it will be useful,    but WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    GNU General Public License for more details.    You should have received a copy of the GNU General Public License    along with this program.  If not, see <http://www.gnu.org/licenses/>.*//** * PonySE word segmenter header file  * @file pws.h * @brief PonySE word segment header file * @version 0.0.2 * @author chengyan * @date 03/07/2008 0.0.2 change name to "PonySE word segment" and change all function * @date 03/07/2008 0.0.1 add function ws_get_words(), by chengyan * @date 12/27/2007 0.0.0 created, by chengyan *//* stdc */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <memory.h>/* STL */#include <vector>#include <map>#include <string>/* this project */#include "pws.h"#define LENGTH_LINEBUF     33#define MAX_LENGTH_KEYWORD 24#define LENGTH_KEYWORDBUF  25/** * word splitter spliting result */struct pws_result{	long length;		/* word list length */	long * wordid;		/* word id list, only words' identification */	const char ** words;	/* word details list */	/*	long * word_attr;	words attribute	*/};/* maximum length of loaded keyword */static int g_keyword_maxlength = 0;/* dictionary map */static std::map<std::string,long> g_map_dict;/* wordid -> word *//*static std::map<long, std::string> g_map_wordid;*//** * initialize PonySE word segmenter * @remark dictionary file format: * 	1<space>keyword1[\r]<\n> * 	2<space>keyword2[\r]<\n> * 	... 
*/int pws_initialize( const char * dict_path ){	char linebuf[LENGTH_LINEBUF];	FILE * fp = 0;	char * pos = 0, * pos_word = 0;	long word_id = 0;	int keyword_len = 0; /* keyword length */	if ( dict_path==0 || *dict_path==0 )		return -1;	if ( (fp=fopen(dict_path,"r")) == 0 )		return -2;	while ( 1 )	{		if ( fgets(linebuf,LENGTH_LINEBUF,fp) == 0 )			break;		pos = linebuf + 1;		pos_word = 0;		while ( *pos != 0 )		{			if ( *pos == 32 ) /* convert "1<space>keyword1[\r]<\n>" to "1<\0>keyword1[\r]<\n>" */			{				*pos = 0;				pos_word = pos+1;			}			if ( *pos=='\r' || *pos=='\n' ) /* clean last '\r' or '\n' */			{				*pos = 0;				keyword_len = pos - pos_word;				break;			}			pos++;		}		if ( pos_word==0 || *pos_word==0 )			continue;		if ( (word_id=atol(linebuf)) < 0 )			continue;		g_map_dict[pos_word] = word_id;		/* g_map_wordid[word_id] = pos_word; */		/* get keyword max length */		if ( g_keyword_maxlength < keyword_len )			g_keyword_maxlength = keyword_len;	}	fclose( fp );	return 0;	}/** * segment content to wordid array(saved in pws_result_obj) */void pws_segment( const char * content, long len, pws_result_t * pws_result_obj ){	char buf[LENGTH_KEYWORDBUF];	const char * pos = 0, * end_pos = 0;	short len_cpy = 0, len_have = 0;	std::map<std::string, long>::iterator iter_dict;	std::vector<long> vec_wordid;	/* set default value */	pws_result_obj->length = 0;	pws_result_obj->wordid = 0;	pws_result_obj->words = 0;	pos = content;	end_pos = pos + (long)len;	while ( pos < end_pos )	{		len_have = end_pos - pos;		len_cpy = len_have<g_keyword_maxlength ? 
len_have : g_keyword_maxlength;		memcpy( buf, pos, len_cpy );		while ( len_cpy > 1 )		{			*(buf+len_cpy) = 0;			iter_dict = g_map_dict.find( buf );			if ( iter_dict != g_map_dict.end() )			{				vec_wordid.push_back( iter_dict->second );				break;			}			len_cpy--;		}		pos += len_cpy;	}	if ( vec_wordid.empty() == true )		return;	pws_result_obj->length = vec_wordid.size();	pws_result_obj->wordid = (long *)malloc( sizeof(long) * pws_result_obj->length );	if ( pws_result_obj->wordid == 0 )	{		pws_result_obj->length = 0;		return;	}	std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid );}/** * segment content to wordid array and words string array(saved in pws_result_obj) */void pws_segment_full( const char * content, long len, pws_result_t * pws_result_obj ){	char buf[LENGTH_KEYWORDBUF];	const char * pos = 0, * end_pos = 0;	short len_cpy = 0, len_have = 0;	std::map<std::string, long>::iterator iter_dict;	std::vector<long> vec_wordid;	std::vector<const char *> vec_words;	/* set default value */	pws_result_obj->length = 0;	pws_result_obj->wordid = 0;	pws_result_obj->words = 0;	pos = content;	end_pos = pos + (long)len;	while ( pos < end_pos )	{		len_have = end_pos - pos;		len_cpy = len_have<g_keyword_maxlength ? 
len_have : g_keyword_maxlength;		memcpy( buf, pos, len_cpy );		while ( len_cpy > 1 )		{			*(buf+len_cpy) = 0;			iter_dict = g_map_dict.find( buf );			if ( iter_dict != g_map_dict.end() )			{				vec_wordid.push_back( iter_dict->second );				vec_words.push_back( (iter_dict->first).c_str() );				break;			}			len_cpy--;		}		pos += len_cpy;	}	if ( vec_wordid.empty() == true )		return;	pws_result_obj->length = vec_wordid.size();	pws_result_obj->wordid = (long *)malloc( sizeof(long) * pws_result_obj->length );	if ( pws_result_obj->wordid == 0 )	{		pws_result_obj->length = 0;		return;	}	pws_result_obj->words = (const char **)malloc( sizeof(char*) * pws_result_obj->length );	if ( pws_result_obj->words == 0 )	{		pws_result_obj->length = 0;		free ( pws_result_obj->wordid );		pws_result_obj->wordid = 0;		return;	}	std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid );	std::copy( vec_words.begin(), vec_words.end(), pws_result_obj->words );}/** * release PonySE word segmenter */void pws_release(){	g_map_dict.clear();}/** * create a pws_result_t object */pws_result_t * pws_res_create(){	pws_result_t * tmp_ret = (pws_result_t *)malloc( sizeof(pws_result_t) );	if ( tmp_ret != 0 )		memset( tmp_ret, 0, sizeof(pws_result_t) );	return tmp_ret;}/** * only clean a pws_result_t object */void pws_res_clean( pws_result_t * pws_result_obj ){	if ( pws_result_obj->wordid != 0 )	{		free( pws_result_obj->wordid );		pws_result_obj->wordid = 0;	}	if ( pws_result_obj->words != 0 )	{		free( pws_result_obj->words );		pws_result_obj->words = 0;	}	pws_result_obj->length = 0;}/** * free and destroy a pws_result_t object which created by function pws_res_create */void pws_res_free( pws_result_t ** pws_result_obj ){	if ( pws_result_obj != 0 )	{		pws_res_clean( *pws_result_obj );		free( *pws_result_obj );		*pws_result_obj = 0;	}}/** * get words' identification number with a ws_result_object */long pws_res_wordid( const pws_result_t * pws_result_obj, long ** wordid ){	*wordid = 
pws_result_obj->wordid;	return pws_result_obj->length;}/** * get words' identification number and word details with a pws_result_obj */long pws_res_words( const pws_result_t * pws_result_obj, long ** wordid, const char *** word ){	if ( pws_result_obj->words == 0 )		return 0;	*wordid = pws_result_obj->wordid;	*word   = pws_result_obj->words;	return pws_result_obj->length;}
pws.c - 源码说明

本页面展示了「这是用C写的中文分词程序」中的 pws.c 源码文件，采用 C语言编程语言编写，共 323 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与分词相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?