// fenci.cpp
#include<iostream>
#include<string>
#include <fstream>
#include "order.h"
#define indexLength 100
using namespace std;
// One slot of the dictionary index table.
struct node
{
    std::string str;  // key word of the slot (a line taken from the dictionary file)
    int tag;          // byte offset into the dictionary file associated with the slot
};
node index[1000]; // dictionary index table with room for 1000 slots; iniIndex() fills it, getIndex()/separate() read it
// Builds the in-memory block index for the dictionary file at `path`.
//
// The dictionary is read line by line and grouped into blocks of indexLength
// lines. For block b (numbered from 1) index[b].str receives the last line of
// the block and index[b].tag the running byte offset just past that block.
// index[0] is never written here, so it keeps its zero-initialised value
// (empty key, offset 0), which makes it the entry for the start of the file.
// Every key stored in the index is also echoed to standard output.
//
// Offsets are accumulated as "line length + 2", i.e. a two-byte line
// terminator is assumed for every line.
//
// Returns one past the last slot written (always >= 1), or -1 when the file
// cannot be opened, contains a line that does not fit the read buffer, or
// has more blocks than the index table can hold.
int iniIndex(char path[])
{
    const int capacity = static_cast<int>(sizeof(index) / sizeof(index[0]));

    ifstream fs;
    fs.open(path);
    if (!fs)
    {
        cout << endl << "打开词库文件失败,请重新输入词库文件路径!" << endl;
        return -1;
    }

    int i = 1;             // next free slot; slot 0 stands for the file start
    int sum = 0;           // byte offset just past the last line consumed
    char buffer[100];
    std::string lastLine;  // last line that was actually read
    bool atEnd = false;

    while (!atEnd)
    {
        int count = 0;
        while (count < indexLength)
        {
            if (!fs.getline(buffer, sizeof(buffer)))
            {
                if (!fs.eof())
                {
                    // failbit without eof: the line did not fit into buffer.
                    // Every later read would fail too, so stop instead of
                    // looping forever.
                    cout << endl << "词库中存在过长的词条,索引建立失败!" << endl;
                    return -1;
                }
                atEnd = true;
                break;
            }
            lastLine = buffer;
            sum += static_cast<int>(lastLine.size()) + 2;
            count++;
        }

        // Nothing was read for this block (the file ended exactly on a block
        // boundary or is empty): do not create a phantom entry.
        if (count == 0)
            break;

        if (i >= capacity)
        {
            cout << endl << "词库条目过多,超出索引容量!" << endl;
            return -1;
        }

        index[i].str = lastLine;
        index[i].tag = sum;
        cout << lastLine << endl;
        i++;
    }
    return i;
}
// Binary-searches the global index table for `str` between slots Start and
// End and returns the slot whose file block should be scanned first.
//
// The range [low, high] is halved until fewer than two slots separate its
// ends; the lower end of the final range is returned.
int getIndex(std::string str, int Start, int End)
{
    int low = Start;
    int high = End;
    for (;;)
    {
        if (high - low < 2)
            return low;

        const int mid = (low + high) / 2;
        if (str < index[mid].str)
            high = mid;
        else
            low = mid;
    }
}
// Splits `str` into dictionary words using greedy longest-match and returns
// the words joined together, each followed by one space.
//
//   str        - text to segment; it is consumed two bytes at a time, so a
//                double-byte character encoding is assumed (TODO confirm)
//   dictionary - path of the dictionary file
//   Start, End - first and last slot of the global index table to search
//
// At each position the two-byte "start word" is looked up with getIndex(),
// the matching file block is scanned line by line, and the longest line that
// is a prefix of the remaining text wins. When nothing matches, the start
// word itself is emitted. If the last line scanned in a block still sorts at
// or before the text, the following block is scanned as well.
//
// NOTE(review): StringLength() is declared outside this file; it is used
// here as the length of a string in the same units substr() takes - confirm.
std::string separate(std::string str, char dictionary[], int Start, int End)
{
    int tag = 0;        // position in str of the first unsegmented byte
    int m = 0;          // extra blocks to skip past the one getIndex() chose
    std::string temp;   // longest match found so far at position tag
    std::string result;
    char buffer[256];

    // If the dictionary cannot be opened every read below fails, which
    // degrades to emitting one start word per step instead of hanging.
    ifstream fs;
    fs.open(dictionary);

    std::string StartWord = str.substr(tag, 2);
    while (StartWord != "")
    {
        // Fall back to the single start word when no longer match is found.
        if (temp == "")
            temp = StartWord;

        const int k = getIndex(StartWord, Start, End) + m;
        bool tryNextBlock = false;

        // Never step outside the slots the caller declared valid.
        if (k >= 0 && k <= End)
        {
            // A previous scan may have hit end-of-file; seekg() does not
            // clear failbit, so reset the state before repositioning.
            fs.clear();
            fs.seekg(index[k].tag, ios::beg);

            std::string current;
            bool exhausted = false;  // true once the file has no more lines
            for (int i = 0; i < indexLength; i++)
            {
                if (!fs.getline(buffer, sizeof(buffer)))
                {
                    exhausted = true;
                    break;
                }
                current = buffer;
                const std::string candidate = str.substr(tag, StringLength(current));
                if (current == candidate)
                {
                    if (StringLength(current) > StringLength(temp))
                        temp = current;
                }
                else if (current > candidate)
                    break;  // lines are sorted: nothing further can match
            }

            // The next block can only matter if this one ended on a line that
            // still sorts at or before the text and the file has more data.
            tryNextBlock = !exhausted
                && current <= str.substr(tag, StringLength(current));
        }

        if (tryNextBlock)
        {
            m++;
        }
        else
        {
            result += temp;
            result += " ";
            tag += StringLength(temp);
            // substr() throws when the position is past the end of str.
            if (tag > static_cast<int>(str.size()))
                break;
            StartWord = str.substr(tag, 2);
            m = 0;
            temp = "";
        }
    }
    return result;
}
int separateFile(ifstream &in, ofstream &out, char dictionary[], int Start, int End)
{
char buffer[3] = {'0','0', '\0'};
std::string input, output, temp;
while( !in.eof() )
{
in.get(buffer[0]);
//如果不是汉字字符,直接输出
while(buffer[0] >= 0 && buffer[0] <= 255 && !in.eof())
{
out << buffer[0];
in.get(buffer[0]);
}
if( in.eof() )
return 0;
in.get(buffer[1]);
temp = buffer;
if(temp >= "啊" && temp <= "齄")
input += temp;
else
{
if(input != "")
output = separate(input, dictionary, Start, End);
output += " " + temp + " ";
out << output ;
input = "";
output = "";
}
}
return 0;
}
// Reads one whitespace-delimited token from standard input into `dest`,
// storing at most capacity - 1 characters plus the terminating NUL.
// Returns false when nothing could be read (end of input or stream error).
static bool readPathToken(char dest[], std::streamsize capacity)
{
    cin.width(capacity);  // bounds the extraction; reset by the stream afterwards
    cin >> dest;
    return !cin.fail();
}

// Interactive driver: optionally sorts the dictionary via order(), builds the
// dictionary index, then asks for the input and output file paths and runs
// the segmentation.
//
// At every retry prompt an answer starting with '0' quits. Exit codes:
// 0 for normal completion, a sort failure or quitting at the dictionary
// prompt; -1 when quitting at the input-file prompt; -2 when quitting at the
// output-file prompt. Running out of input at a prompt is treated like
// quitting at that prompt.
int main()
{
    int i = 0;
    cout << "如果需要对词库进行排序操作,请输入1;如果词库已经排好序,请输入0:" << endl;
    if (!(cin >> i))
    {
        // Not a number: treat it as "already sorted" and drop the bad token
        // so the path prompts below can still read from the stream.
        i = 0;
        cin.clear();
        std::string discarded;
        cin >> discarded;
    }
    if (i == 1)
    {
        if (order() != 0)
        {
            cout << endl << "原始词库的排序出现错误!" << endl;
            return 0;
        }
    }

    char infile[256] = {0};
    char outfile[256] = {0};
    char dictionary[256] = {0};
    ifstream in;
    ofstream out;

    cout << endl << "请输入词库文件路径:" << endl;
    if (!readPathToken(dictionary, sizeof(dictionary)))
        return 0;
    int j = iniIndex(dictionary);
    while (j == -1)
    {
        cout << endl << "词库索引建立失败,请检查词库路径是否正确!请重新输入,退出0" << endl;
        if (!readPathToken(dictionary, sizeof(dictionary)) || dictionary[0] == '0')
            return 0;
        j = iniIndex(dictionary);
    }

    cout << endl << "请输入待分词文件路径:" << endl;
    if (!readPathToken(infile, sizeof(infile)))
        return -1;
    in.open(infile);
    while (!in)
    {
        cout << endl << "打开文件失败,请检查文件路径名是否正确!请重新输入,退出0" << endl;
        in.close();
        in.clear();
        if (!readPathToken(infile, sizeof(infile)) || infile[0] == '0')
            return -1;
        in.open(infile);
    }

    cout << endl << "请输入分词后文件输出路径:" << endl;
    if (!readPathToken(outfile, sizeof(outfile)))
        return -2;
    out.open(outfile);
    while (!out)
    {
        cout << endl << "在此处创建文件失败!请重新输入路径,退出0 " << endl;
        out.close();
        out.clear();
        if (!readPathToken(outfile, sizeof(outfile)) || outfile[0] == '0')
            return -2;
        out.open(outfile);
    }

    // iniIndex() returns one past the last slot it wrote, so the last valid
    // slot is j - 1.
    if (separateFile(in, out, dictionary, 0, j - 1) == 0)
        cout << endl << "文件分词成功!!" << endl;
    else
        cout << endl << "文件分词失败!!" << endl;

    in.close();
    out.close();
    return 0;
}
// (Keyboard-shortcut help from the web code viewer was here; it is not part of fenci.cpp.)