/* File: pws.c (title and font-size label left over from the web code viewer) */
/* PonySE word segmenter Copyright (C) 2007-2008 PonySE This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.*//** * PonySE word segmenter header file * @file pws.h * @brief PonySE word segment header file * @version 0.0.2 * @author chengyan * @date 03/07/2008 0.0.2 change name to "PonySE word segment" and change all function * @date 03/07/2008 0.0.1 add function ws_get_words(), by chengyan * @date 12/27/2007 0.0.0 created, by chengyan *//* stdc */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <memory.h>/* STL */#include <vector>#include <map>#include <string>/* this project */#include "pws.h"#define LENGTH_LINEBUF 33#define MAX_LENGTH_KEYWORD 24#define LENGTH_KEYWORDBUF 25/** * word splitter spliting result */struct pws_result{ long length; /* word list length */ long * wordid; /* word id list, only words' identification */ const char ** words; /* word details list */ /* long * word_attr; words attribute */};/* maximum length of loaded keyword */static int g_keyword_maxlength = 0;/* dictionary map */static std::map<std::string,long> g_map_dict;/* wordid -> word *//*static std::map<long, std::string> g_map_wordid;*//** * initialize PonySE word segmenter * @remark dictionary file format: * 1<space>keyword1[\r]<\n> * 2<space>keyword2[\r]<\n> * ... 
*/int pws_initialize( const char * dict_path ){ char linebuf[LENGTH_LINEBUF]; FILE * fp = 0; char * pos = 0, * pos_word = 0; long word_id = 0; int keyword_len = 0; /* keyword length */ if ( dict_path==0 || *dict_path==0 ) return -1; if ( (fp=fopen(dict_path,"r")) == 0 ) return -2; while ( 1 ) { if ( fgets(linebuf,LENGTH_LINEBUF,fp) == 0 ) break; pos = linebuf + 1; pos_word = 0; while ( *pos != 0 ) { if ( *pos == 32 ) /* convert "1<space>keyword1[\r]<\n>" to "1<\0>keyword1[\r]<\n>" */ { *pos = 0; pos_word = pos+1; } if ( *pos=='\r' || *pos=='\n' ) /* clean last '\r' or '\n' */ { *pos = 0; keyword_len = pos - pos_word; break; } pos++; } if ( pos_word==0 || *pos_word==0 ) continue; if ( (word_id=atol(linebuf)) < 0 ) continue; g_map_dict[pos_word] = word_id; /* g_map_wordid[word_id] = pos_word; */ /* get keyword max length */ if ( g_keyword_maxlength < keyword_len ) g_keyword_maxlength = keyword_len; } fclose( fp ); return 0; }/** * segment content to wordid array(saved in pws_result_obj) */void pws_segment( const char * content, long len, pws_result_t * pws_result_obj ){ char buf[LENGTH_KEYWORDBUF]; const char * pos = 0, * end_pos = 0; short len_cpy = 0, len_have = 0; std::map<std::string, long>::iterator iter_dict; std::vector<long> vec_wordid; /* set default value */ pws_result_obj->length = 0; pws_result_obj->wordid = 0; pws_result_obj->words = 0; pos = content; end_pos = pos + (long)len; while ( pos < end_pos ) { len_have = end_pos - pos; len_cpy = len_have<g_keyword_maxlength ? 
len_have : g_keyword_maxlength; memcpy( buf, pos, len_cpy ); while ( len_cpy > 1 ) { *(buf+len_cpy) = 0; iter_dict = g_map_dict.find( buf ); if ( iter_dict != g_map_dict.end() ) { vec_wordid.push_back( iter_dict->second ); break; } len_cpy--; } pos += len_cpy; } if ( vec_wordid.empty() == true ) return; pws_result_obj->length = vec_wordid.size(); pws_result_obj->wordid = (long *)malloc( sizeof(long) * pws_result_obj->length ); if ( pws_result_obj->wordid == 0 ) { pws_result_obj->length = 0; return; } std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid );}/** * segment content to wordid array and words string array(saved in pws_result_obj) */void pws_segment_full( const char * content, long len, pws_result_t * pws_result_obj ){ char buf[LENGTH_KEYWORDBUF]; const char * pos = 0, * end_pos = 0; short len_cpy = 0, len_have = 0; std::map<std::string, long>::iterator iter_dict; std::vector<long> vec_wordid; std::vector<const char *> vec_words; /* set default value */ pws_result_obj->length = 0; pws_result_obj->wordid = 0; pws_result_obj->words = 0; pos = content; end_pos = pos + (long)len; while ( pos < end_pos ) { len_have = end_pos - pos; len_cpy = len_have<g_keyword_maxlength ? 
len_have : g_keyword_maxlength; memcpy( buf, pos, len_cpy ); while ( len_cpy > 1 ) { *(buf+len_cpy) = 0; iter_dict = g_map_dict.find( buf ); if ( iter_dict != g_map_dict.end() ) { vec_wordid.push_back( iter_dict->second ); vec_words.push_back( (iter_dict->first).c_str() ); break; } len_cpy--; } pos += len_cpy; } if ( vec_wordid.empty() == true ) return; pws_result_obj->length = vec_wordid.size(); pws_result_obj->wordid = (long *)malloc( sizeof(long) * pws_result_obj->length ); if ( pws_result_obj->wordid == 0 ) { pws_result_obj->length = 0; return; } pws_result_obj->words = (const char **)malloc( sizeof(char*) * pws_result_obj->length ); if ( pws_result_obj->words == 0 ) { pws_result_obj->length = 0; free ( pws_result_obj->wordid ); pws_result_obj->wordid = 0; return; } std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid ); std::copy( vec_words.begin(), vec_words.end(), pws_result_obj->words );}/** * release PonySE word segmenter */void pws_release(){ g_map_dict.clear();}/** * create a pws_result_t object */pws_result_t * pws_res_create(){ pws_result_t * tmp_ret = (pws_result_t *)malloc( sizeof(pws_result_t) ); if ( tmp_ret != 0 ) memset( tmp_ret, 0, sizeof(pws_result_t) ); return tmp_ret;}/** * only clean a pws_result_t object */void pws_res_clean( pws_result_t * pws_result_obj ){ if ( pws_result_obj->wordid != 0 ) { free( pws_result_obj->wordid ); pws_result_obj->wordid = 0; } if ( pws_result_obj->words != 0 ) { free( pws_result_obj->words ); pws_result_obj->words = 0; } pws_result_obj->length = 0;}/** * free and destroy a pws_result_t object which created by function pws_res_create */void pws_res_free( pws_result_t ** pws_result_obj ){ if ( pws_result_obj != 0 ) { pws_res_clean( *pws_result_obj ); free( *pws_result_obj ); *pws_result_obj = 0; }}/** * get words' identification number with a ws_result_object */long pws_res_wordid( const pws_result_t * pws_result_obj, long ** wordid ){ *wordid = pws_result_obj->wordid; return 
pws_result_obj->length;}/** * get words' identification number and word details with a pws_result_obj */long pws_res_words( const pws_result_t * pws_result_obj, long ** wordid, const char *** word ){ if ( pws_result_obj->words == 0 ) return 0; *wordid = pws_result_obj->wordid; *word = pws_result_obj->words; return pws_result_obj->length;}
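/*
 * Keyboard shortcuts (help text left over from the web code viewer):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */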