// hzstrseg.cpp
// (Code-viewer page residue: a "Font size:" control label followed the file name.)
#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream> //for cout
#include <vector> //for v_tids[i]
#include <conio.h>
#include "Dict.h"
#include "index.cpp" //call index function
//#include "lexicon.h" //int freq
#include <ctime> //calculate program's run time
#define MaxWordLength 8 // maximum candidate word length: 8 bytes (i.e. 4 Chinese characters at 2 bytes each)
#define Separator " " // word-boundary marker appended after every emitted token
using namespace std;
Dict dict;// per the original note: builds an in-memory map and loads the dictionary contents into it
vector<int> v_tids; //store segment word id in lexicon
vector<string> v_terms; //store segment word
//int word_freq[LEXICON_LEN];
string SegmentHzStrMM (string s1)
{
    // Forward maximum matching over a run of double-byte characters.
    //
    // At each position the longest window (at most MaxWordLength bytes) is
    // tried against the dictionary and shortened by one double-byte character
    // (2 bytes) at a time until the dictionary accepts it or the window is no
    // longer than 2 bytes. Whatever candidate is left is emitted as a token.
    //
    // Returns the tokens joined with Separator (a Separator follows every
    // token). As a side effect, each token and the id reported for it by
    // dict.get_id() are appended to the globals v_terms and v_tids.
    string segmented;
    const string::size_type total = s1.length();
    string::size_type consumed = 0;

    while (consumed < total) {
        // Widest window allowed here: MaxWordLength bytes, or whatever is left.
        string::size_type window = total - consumed;
        if (window > MaxWordLength)
            window = MaxWordLength;
        string candidate = s1.substr(consumed, window);

        // Drop one double-byte character from the right until we have a word.
        bool found = dict.IsWord(candidate);
        while (!found && window > 2) {
            window -= 2;
            candidate.erase(window);
            found = dict.IsWord(candidate);
        }

        // Emit the token followed by the word-boundary marker.
        segmented += candidate;
        segmented += Separator;

        // Record the token and its lexicon id for the indexing step.
        v_tids.push_back(dict.get_id(candidate));
        v_terms.push_back(candidate);

        consumed += candidate.length();
    }
    return segmented;
}
// Returns the byte at `pos` as an unsigned value, or 0 when `pos` is at or past the end of `text`.
static unsigned char SentenceByteAt (const string &text, string::size_type pos)
{
    return pos < text.length() ? (unsigned char) text[pos] : (unsigned char) 0;
}

// Tells whether the double-byte symbol (lead, trail) must be emitted as a token of its own
// instead of being merged into a run of neighbouring symbols.
// NOTE(review): assuming GB2312, lead 161 (0xA1) rows 162-168 / 171-191 are CJK punctuation,
// quotes and brackets, and lead 163 (0xA3) covers the full-width forms of ! ( ) , : ; ? - confirm.
static bool IsStandaloneSymbol (unsigned char lead, unsigned char trail)
{
    if (lead == 161)
        return (trail >= 162 && trail <= 168) || (trail >= 171 && trail <= 191);
    if (lead == 163)
        return trail == 161 || trail == 168 || trail == 169 || trail == 172
            || trail == 186 || trail == 187 || trail == 191;
    return false;
}

// Segments one sentence (one line of text) and returns the tokens, each followed by Separator.
//
// The input is scanned left to right and split into three kinds of runs by the first byte:
//   * byte < 128        : single-byte (western) text;
//   * 128 <= byte < 176 : double-byte non-Hanzi symbols (punctuation etc.);
//   * byte >= 176       : double-byte Hanzi, handed to SegmentHzStrMM for dictionary matching.
//
// Differences from the previous implementation (all bug fixes):
//   * A trailing half character (odd byte count) no longer makes substr() throw
//     std::out_of_range; run ends are clamped to the string length.
//   * A leading ASCII space, or a leading full-width space (161,161), is skipped on its own.
//     Previously the whole run that followed it was silently discarded.
//   * CR / LF are copied through one byte at a time, so text after them is tokenised normally.
//   * The stand-alone punctuation test for lead byte 163 had a misplaced parenthesis, which made
//     trail bytes 168/169/186/187/191 break a symbol run for ANY lead byte.
//
// A disabled post-processing pass that tried to merge a number with a following
// year/month/day character used CString-style Mid()/Find() calls and never compiled here;
// it has been dropped. TODO: reimplement on std::string if that merging is wanted.
string SegmentSentenceMM (string s1)
{
    string s2;                                   // segmentation result, Separator-delimited
    const string::size_type dd = s1.length();
    string::size_type pos = 0;                   // start of the run being processed

    while (pos < dd) {
        const unsigned char ch = (unsigned char) s1[pos];
        string::size_type end = pos + 1;         // one past the last byte of the current run

        if (ch < 128) {
            // ---- single-byte (western) text ----
            if (ch == 10 || ch == 13) {
                s2 += s1[pos];                   // keep line breaks, no separator
            } else if (ch != 32) {
                // Extend over following single-byte characters up to a line break.
                // Embedded spaces stay inside the token, as before.
                while (end < dd && (unsigned char) s1[end] < 128
                       && s1[end] != 10 && s1[end] != 13)
                    ++end;
                s2 += s1.substr(pos, end - pos) + Separator;
            }
            // A plain space is consumed by itself and produces no output.
        } else if (ch < 176) {
            // ---- double-byte non-Hanzi symbols ----
            if (ch == 161 && SentenceByteAt(s1, pos + 1) == 161) {
                end = pos + 2;                   // full-width space: skip just this character
            } else {
                end = pos;
                while (end < dd) {
                    const unsigned char lead = (unsigned char) s1[end];
                    if (lead < 161 || lead >= 176
                        || IsStandaloneSymbol(lead, SentenceByteAt(s1, end + 1)))
                        break;
                    end += 2;                    // assumes whole double-byte characters
                }
                if (end == pos)
                    end = pos + 2;               // always take at least one character
                if (end > dd)
                    end = dd;                    // truncated trailing character
                s2 += s1.substr(pos, end - pos) + Separator;
            }
        } else {
            // ---- Hanzi run: dictionary-based maximum matching ----
            end = pos + 2;
            while (end < dd && (unsigned char) s1[end] >= 176)
                end += 2;
            if (end > dd)
                end = dd;                        // truncated trailing character
            s2 += SegmentHzStrMM(s1.substr(pos, end - pos));
        }

        pos = end;
    }
    return s2;
}
// Segments a text file line by line.
//
// Reads `FileName`, runs SegmentSentenceMM on every line and writes the result,
// one output line per input line, to "<FileName>.seg".
//
// If the input cannot be opened, or the output cannot be created, a message is
// written to cerr and the function returns without processing anything.
// Previously both failures were silent, and a missing input still left an empty
// ".seg" file behind.
void SegmentAFileMM (string FileName)
{
    ifstream fin(FileName.c_str());
    if (!fin) {
        cerr << "cannot open input file: " << FileName << endl;
        return;
    }

    const string outName = FileName + ".seg";
    ofstream fout(outName.c_str());
    if (!fout) {
        cerr << "cannot create output file: " << outName << endl;
        return;
    }

    string line;
    while (getline(fin, line)) {
        // '\n' instead of endl: no need to flush the file after every line.
        fout << SegmentSentenceMM(line) << '\n';
    }
    // Both streams are flushed and closed by their destructors.
}
// Segments the given data file and reports how long it took.
//
// The function itself returns nothing; the work is delegated to SegmentAFileMM.
// The elapsed time is taken from clock() before and after that call and printed
// to standard output in seconds.
void segment(const string filename)
{
    const clock_t started = clock();
    SegmentAFileMM(filename);
    const clock_t finished = clock();

    // Convert clock ticks to seconds before printing.
    const long double seconds = (long double)(finished - started) / CLOCKS_PER_SEC;

    fputs("done! use: ", stdout);
    cout << seconds << " s" << endl;
}
// Prints "<id> <term>" pairs, one per line, to standard output.
// The two lists are walked in lock-step; output stops at the end of the shorter one.
void print(vector<int> &v_tids, vector<string> &v_terms)
{
    const vector<int>::size_type rows =
        v_tids.size() < v_terms.size() ? v_tids.size() : v_terms.size();

    for (vector<int>::size_type row = 0; row < rows; ++row)
        cout << v_tids[row] << " " << v_terms[row] << endl;
}
// Program entry point: segment one file, dump the tokens, then index them as document 1.
//
// Steps:
//   1. read a file name from standard input and segment that file;
//   2. print every (id, term) pair collected in v_tids / v_terms
//      (per the original note, English words have no id);
//   3. copy the ids into a plain int array, echoing each one;
//   4. pass that array to the indexer with docid 1;
//   5. wait for a key press before exiting.
//
// A second pass (prompt again, segment, index as docid 2) was sketched here earlier
// but is disabled.
int main()
{
    string path;
    cout << "input filename: " << endl;
    cin >> path;

    segment(path);
    print(v_tids, v_terms);

    // The indexer takes a raw int array, so mirror v_tids into one.
    int t_count = v_tids.size();
    cout << "array count: " << t_count << endl;

    // NOTE(review): this array is never delete[]d. Whether wordleverindexing keeps
    // the pointer is not visible from this file - confirm before freeing it.
    int *termid = new int[t_count];
    int slot = 0;
    while (slot < t_count) {
        termid[slot] = v_tids[slot];
        cout << termid[slot] << " " << endl;
        ++slot;
    }

    // Word-level indexing of the single document.
    index indexer ;
    int docid = 1;
    indexer.wordleverindexing(docid, termid, t_count);

    cout << "index finish..." << endl;
    getch();   // keep the console window open until a key is pressed
    return 0;
}
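// ---- Code-viewer page residue (not part of the program) ----
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -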