📄 test.cpp
字号:
// Test.cpp : Defines the entry point for the console application.
//张静静
//s08070890
#include "stdafx.h"
// Dictionary.h
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <map>
using namespace std;
// Word dictionary built from the lexicon file "wordlexicon.txt".
// The first whitespace-delimited token of every non-blank line is a word.
class CDictionary
{
public:
    // Reads the lexicon file and stores the first token of each line.
    // If the file cannot be opened, prints a message to cerr and calls exit(-1).
    CDictionary();
    ~CDictionary();
    // Returns 1 if w is a dictionary word, 0 otherwise.
    int FindWord(const std::string& w) const;
private:
    // Dictionary words as keys (std::map, i.e. an ordered tree despite the
    // member's name); the mapped value is always "1" and is never read.
    std::map<std::string, std::string> wordhash;
};

// Loads the lexicon file into the dictionary.
CDictionary::CDictionary()
{
    std::ifstream infile("wordlexicon.txt"); // open the lexicon
    if (!infile.is_open()) // cannot continue without a lexicon
    {
        std::cerr << "Unable to open input file: " << "wordlexicon"
                  << " -- bailing out!" << std::endl;
        exit(-1);
    }
    std::string line;
    while (std::getline(infile, line, '\n'))
    {
        std::istringstream fields(line);
        std::string word;
        // Only insert when a token was actually extracted: a blank or
        // whitespace-only line must not add an empty or stale entry.
        if (fields >> word)
        {
            wordhash.insert(std::pair<std::string, std::string>(word, "1"));
        }
    }
}

// Nothing to release explicitly: the map member cleans up after itself.
CDictionary::~CDictionary()
{
}

// Looks w up in the dictionary; returns 1 when present, 0 when absent.
int CDictionary::FindWord(const std::string& w) const
{
    return wordhash.find(w) != wordhash.end() ? 1 : 0;
}
// Main program (main.cpp)
//#include "Dictionary.h"
# define MaxWordLength 10 // Maximum word length: 10 bytes (i.e. 5 Chinese characters)
# define Separator "/ " // Word-boundary marker appended after each segmented word
CDictionary WordDic; // Global dictionary instance; its constructor loads the lexicon file
// Segments s1 with backward (right-to-left) maximum matching.
// Starting from the end of the string, the longest suffix of at most
// MaxWordLength bytes that WordDic knows is taken as a word; when nothing
// matches, the candidate is shortened two bytes at a time until it is no
// longer than two bytes. Returns the words in their original order, each
// followed by Separator.
string SegmentSentence(string s1)
{
    string segmented; // accumulated result, built from right to left
    while (!s1.empty())
    {
        const int total = (int)s1.length();
        // Never look at more than MaxWordLength bytes at once.
        int candLen = (total > MaxWordLength) ? MaxWordLength : total;
        string candidate = s1.substr(total - candLen, candLen);
        // Drop one two-byte character from the left of the candidate until
        // it is a dictionary word or cannot be shortened any further.
        while (candLen > 2 && WordDic.FindWord(candidate) == 0)
        {
            candLen -= 2;
            candidate = s1.substr(total - candLen, candLen);
        }
        // Prepend the word and its boundary marker to the output.
        segmented = candidate + Separator + segmented;
        // Continue with whatever is left in front of the matched word.
        s1.resize(total - candLen);
    }
    return segmented;
}
// Returns true when the two-byte code (lead, trail) is one of the punctuation
// marks that must end a run of non-Hanzi double-byte symbols.
static bool IsBreakingPunctuation(unsigned char lead, unsigned char trail)
{
    if (lead == 161)
    {
        return (trail >= 162 && trail <= 168) || (trail >= 171 && trail <= 191);
    }
    if (lead == 163)
    {
        // All of these trail bytes belong to lead byte 163 only.
        return trail == 172 || trail == 161 || trail == 168 || trail == 169
            || trail == 186 || trail == 187 || trail == 191;
    }
    return false;
}

// Segments one sentence, handling non-Hanzi text itself and delegating runs
// of Hanzi to SegmentSentence.
//   - bytes < 128: a run of single-byte characters is emitted as one token;
//   - lead bytes in [128, 176): double-byte symbols/punctuation;
//   - lead bytes >= 176: Hanzi, two bytes per character.
// NOTE(review): these byte ranges look like GB2312/GBK -- confirm the encoding
// of the input files.
// Returns the tokens in order, each followed by Separator. Line breaks are
// copied through without a Separator; runs that start with a space (ASCII 32
// or the double-byte space 161,161) produce no output.
// A truncated double-byte character at the end of the input is emitted as-is
// instead of raising std::out_of_range.
string SegmentSentenceMM (string s1)
{
    string s2; // segmentation result
    while (!s1.empty())
    {
        const int dd = (int)s1.length();
        const unsigned char ch = (unsigned char)s1[0];
        int i; // number of leading bytes consumed in this round

        if (ch < 128) // single-byte (Western) characters
        {
            i = 1;
            // Extend the run up to the next double-byte character or line break.
            while (i < dd && (unsigned char)s1[i] < 128 && s1[i] != 10 && s1[i] != 13)
            {
                i++;
            }
            if (ch == 10 || ch == 13)
            {
                // Line feed / carriage return: copy through without a marker.
                s2 += s1.substr(0, i);
            }
            else if (ch != 32)
            {
                s2 += s1.substr(0, i) + Separator;
            }
            // NOTE(review): when the run starts with a space the whole run,
            // including any non-space characters after it, is dropped --
            // confirm this is intended.
        }
        else if (ch < 176) // double-byte symbols, punctuation, etc.
        {
            i = 0;
            // Group consecutive symbols, stopping before a punctuation mark.
            while (i < dd)
            {
                const unsigned char lead = (unsigned char)s1[i];
                if (lead < 161 || lead >= 176)
                {
                    break;
                }
                // A missing trail byte (truncated input) is treated as 0.
                const unsigned char trail = (i + 1 < dd) ? (unsigned char)s1[i + 1] : 0;
                if (IsBreakingPunctuation(lead, trail))
                {
                    break;
                }
                i += 2; // assumes whole two-byte characters
            }
            if (i == 0)
            {
                i = 2; // always consume at least one character
            }
            if (i > dd)
            {
                i = dd; // truncated trailing character: stay inside the string
            }
            const unsigned char second = (dd > 1) ? (unsigned char)s1[1] : 0;
            if (!(ch == 161 && second == 161)) // skip the double-byte space
            {
                s2 += s1.substr(0, i) + Separator;
            }
        }
        else // run of Hanzi
        {
            i = 2;
            while (i < dd && (unsigned char)s1[i] >= 176)
            {
                i += 2;
            }
            if (i > dd)
            {
                i = dd; // truncated trailing character: stay inside the string
            }
            s2 += SegmentSentence(s1.substr(0, i));
        }

        s1.erase(0, i); // continue with the unprocessed remainder
    }
    return s2;
}
// Counts word frequencies in s1 and writes them to "result.txt", one
// "<word> <count>" pair per line in key order.
// Words are found by forward maximum matching: at each position the longest
// prefix of at most MaxWordLength bytes that WordDic knows is taken; if none
// matches, a single character is taken (two bytes when the lead byte is
// >= 128 and a second byte exists, otherwise one byte).
// If "result.txt" cannot be opened, prints a message to cerr and calls exit(-1).
void getFreq(string s1) {
    map<string, int> result; // word -> number of occurrences
    const int len = (int)s1.length();

    int start = 0;
    while (start < len) {
        // Smallest piece that may be split off: never cut a two-byte
        // character in half.
        const int unit = ((unsigned char)s1[start] >= 128 && start + 1 < len) ? 2 : 1;

        int wordLen = len - start;
        if (wordLen > MaxWordLength) {
            wordLen = MaxWordLength;
        }
        // substr takes (position, count), so pass the candidate's length.
        string w = s1.substr(start, wordLen);
        while (wordLen > unit && WordDic.FindWord(w) == 0) {
            wordLen--;
            w = s1.substr(start, wordLen);
        }

        // operator[] creates the entry with 0 on first use; insert() would
        // leave an existing count unchanged.
        ++result[w];
        start += wordLen;
    }

    ofstream outfile1("result.txt"); // output file for the frequency table
    if (!outfile1.is_open())
    {
        cerr << "Unable to open result.txt"
             << "--bailing out!" << endl;
        exit(-1);
    }
    typedef map<string, int>::const_iterator CIT;
    for (CIT p = result.begin(); p != result.end(); ++p)
    {
        outfile1 << p->first << ' ' << p->second << '\n';
    }
}
// Entry point. Reads "test.txt" line by line, writes the segmentation of each
// line to "SegmentResult.txt", then passes the concatenated input (line
// breaks removed) to getFreq. Command-line arguments are not used.
// Returns 0 on success; calls exit(-1) if either file cannot be opened.
int main(int argc, char *argv[])
{
    (void)argc; // file names are fixed; arguments are ignored
    (void)argv;

    const char *inputName = "test.txt";
    ifstream infile(inputName); // open the input corpus
    if (!infile.is_open())
    {
        // Report the file that was actually opened; argv[1] may be null.
        cerr << "Unable to open input file: " << inputName
             << " -- bailing out!" << endl;
        exit(-1);
    }
    ofstream outfile1("SegmentResult.txt"); // segmentation output
    if (!outfile1.is_open())
    {
        cerr << "Unable to open file:SegmentResult.txt"
             << "--bailing out!" << endl;
        exit(-1);
    }

    string strtmp; // current input line
    string corpus; // all lines joined, used for the frequency count
    while (getline(infile, strtmp, '\n')) // split on the newline character
    {
        outfile1 << SegmentSentenceMM(strtmp) << endl;
        corpus += strtmp;
    }

    getFreq(corpus);
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -