📄 fenci.cpp

📁 自己下载一个语料库
💻 CPP
字号:
#include<iostream>
#include<string>
#include <fstream>
#include "order.h"
#define indexLength 100
using namespace std;
//One entry of the dictionary index.
//iniIndex() stores a dictionary word in `str` and a running byte count of
//the dictionary file in `tag`.
struct node
{
	std::string str;	//dictionary word recorded for this entry
	int tag;		//byte count recorded together with the word
};

//Global dictionary index table: written by iniIndex(), read by getIndex()
//and separate().
//NOTE(review): POSIX <strings.h> declares a function named index(); if that
//header is pulled in transitively on the target toolchain this global name
//may collide with it - confirm before porting.
node index[1000];		//index slots built over the dictionary file

//初始化词库索引
//Build the block index of the dictionary file.
//
//Reads the file at `path` line by line. After every indexLength lines (and
//once more after a final, shorter block) the next slot of the global `index`
//array receives:
//  str - the last word read in that block, and
//  tag - the byte offset of the first line that follows the block.
//Slot 0 is never written here, so it keeps the zero-initialised value of the
//global array (empty word, offset 0) and stands for the start of the file.
//The last word of every block is also echoed to cout.
//
//Returns one past the last slot written (1 when the file has no lines), or
//-1 when the file cannot be opened or needs more slots than `index` has.
int iniIndex(char path[])
{
	//Derive the capacity from the array itself so the two cannot drift apart.
	const int capacity = static_cast<int>(sizeof(index) / sizeof(index[0]));

	//Binary mode: bytes are counted exactly as stored, so the offsets are
	//correct for both "\n" and "\r\n" line endings.
	std::ifstream fs(path, std::ios::in | std::ios::binary);
	if( !fs )
	{
		cout << endl << "打开词库文件失败，请重新输入词库文件路径!" << endl;
		return -1;
	}

	int i = 1;
	std::streamoff sum = 0;		//bytes consumed from the file so far
	std::string line, word;
	for(;;)
	{
		//Consume one block of at most indexLength lines.
		int count = 0;
		while( count < indexLength && std::getline(fs, line) )
		{
			sum += static_cast<std::streamoff>(line.size());
			//eofbit after a successful read means the line had no '\n'.
			if( !fs.eof() )
				sum += 1;
			//Drop the '\r' of a "\r\n" ending; it is not part of the word.
			if( !line.empty() && line[line.size() - 1] == '\r' )
				line.erase(line.size() - 1);
			word = line;
			count++;
		}
		if( count == 0 )
			break;		//nothing left to index

		if( i >= capacity )
		{
			//Writing another slot would run past the end of `index`.
			cout << endl << "词库过大，超出索引容量!" << endl;
			return -1;
		}
		index[i].str = word;
		index[i].tag = static_cast<int>(sum);
		cout << word << endl;
		i++;
	}
	return i;
}

//根据单个字，查找出相关单词在字库中的范围
//Pick the index slot at which the dictionary search for `str` should start.
//
//Bisects the global `index` array between slots Start and End, comparing
//`str` with the word stored in the middle slot, until the two bounds are
//fewer than two slots apart. The lower bound is returned.
int getIndex(std::string str, int Start, int End)
{
	int low = Start;
	int high = End;
	while( high - low > 1 )
	{
		const int mid = (low + high) / 2;
		if( str < index[mid].str )
			high = mid;	//str sorts before the middle word
		else
			low = mid;	//str sorts at or after the middle word
	}
	return low;
}

//将输入的字符串分为各个单词，str为初始字符串,path为字典路径
//Split `str` into words by longest match against the dictionary file.
//
//  str        - text to segment; it is consumed two bytes at a time
//               (presumably one character of a double-byte encoding such as
//               GBK - confirm against the data files)
//  dictionary - path of the sorted dictionary file, one word per line
//  Start, End - first and last slot of the global `index` array to use;
//               as filled by iniIndex(), slot End only marks the end of the
//               file, so dictionary blocks are numbered Start .. End-1
//
//Returns the words in order, each followed by two spaces. A character for
//which no longer dictionary word matches is emitted on its own.
//
//StringLength() is declared outside this section (presumably in order.h);
//it is used here as the byte length of its argument - confirm.
std::string separate(std::string str, char dictionary[], int Start, int End)
{
	std::string result;
	int tag = 0;	//offset in str of the first byte not yet emitted

	std::ifstream fs(dictionary);

	std::string StartWord = str.substr(tag, 2);
	while( StartWord != "" )
	{
		//temp holds the longest match found so far; the bare character is
		//the fallback when the dictionary offers nothing longer.
		std::string temp = StartWord;

		//Scan block after block until a dictionary word sorts after the
		//text, the dictionary runs out, or the file cannot be read.
		bool finished = !fs.is_open();
		for(int k = getIndex(StartWord, Start, End); !finished && k < End; k++)
		{
			//A previous scan may have hit end-of-file; without clear()
			//the seek and every later read would fail.
			fs.clear();
			fs.seekg(index[k].tag, std::ios::beg);

			std::string current;
			for(int i = 0; i < indexLength && !finished; i++)
			{
				if( !std::getline(fs, current) )
				{
					finished = true;	//end of dictionary or read error
					break;
				}
				//Tolerate "\r\n" dictionaries read on "\n" platforms.
				if( !current.empty() && current[current.size() - 1] == '\r' )
					current.erase(current.size() - 1);

				const std::string prefix = str.substr(tag, StringLength(current));
				if( current == prefix )
				{
					if( StringLength(current) > StringLength(temp) )
						temp = current;
				}
				else if( current > prefix )
					finished = true;	//sorted file: no later word can match
			}
		}

		result += temp;
		result += "  ";
		tag += StringLength(temp);
		StartWord = str.substr(tag, 2);
	}

	return result;
}

//Segment the whole of `in` and write the result to `out`.
//
//Bytes below 0x80 are copied through unchanged. Every other byte is read
//together with the following byte as one two-byte character. Characters
//between the literals "啊" and "齄" (presumably the Chinese-character range
//of the file's double-byte encoding - confirm) are collected into a run that
//is segmented with separate(); any other two-byte character is written on
//its own, surrounded by two spaces on each side.
//
//A pending run is always segmented and written before whatever follows it,
//and at end of input, so the output keeps the order of the input.
//
//dictionary, Start and End are passed on to separate() unchanged.
//Always returns 0.
int separateFile(ifstream &in, ofstream &out, char dictionary[], int Start, int End)
{
	std::string input;	//run of Chinese characters not yet segmented
	char lead;
	while( in.get(lead) )
	{
		//Compare as unsigned so the test does not depend on whether plain
		//char is signed on this platform.
		if( static_cast<unsigned char>(lead) < 0x80 )
		{
			if(input != "")
			{
				out << separate(input, dictionary, Start, End);
				input = "";
			}
			out << lead;
			continue;
		}

		char trail;
		if( !in.get(trail) )
		{
			//Input ends in the middle of a two-byte character: flush the
			//pending run and pass the lone byte through as it is.
			if(input != "")
			{
				out << separate(input, dictionary, Start, End);
				input = "";
			}
			out << lead;
			break;
		}

		std::string temp(1, lead);
		temp += trail;
		if(temp >= "啊" && temp <= "齄")
			input += temp;
		else
		{
			std::string output;
			if(input != "")
				output = separate(input, dictionary, Start, End);
			output += "  " + temp + "  ";
			out << output ;
			input = "";
		}
	}

	//Input ended while a run was still pending.
	if(input != "")
		out << separate(input, dictionary, Start, End);
	return 0;
}

//Read one whitespace-delimited word from cin into `buffer`.
//The field width limits the extraction to the size of the array, so an
//over-long input cannot overflow it. Returns false when nothing was read.
static bool readPathWord(char (&buffer)[256])
{
	cin.width(sizeof(buffer));
	return !(cin >> buffer).fail();
}

//True when `word` is exactly "0", the answer the prompts offer for quitting.
static bool isQuitCommand(const char *word)
{
	return word[0] == '0' && word[1] == '\0';
}

//Interactive driver: optionally sorts the dictionary through order(), builds
//the dictionary index, opens the input and output files and runs the
//segmentation.
//Returns 0 normally or when the user quits at the dictionary prompt, -1 when
//the user quits at the input-file prompt and -2 at the output-file prompt.
//A closed or failed cin at a prompt is treated like quitting at that prompt.
int main()
{
	int i = 0;
	cout << "如果需要对词库进行排序操作，请输入1；如果词库已经排好序，请输入0：" << endl;
	if( !(cin >> i) )
	{
		//Not a number: continue as if 0 had been entered and drop the rest
		//of the line so that it is not taken for the dictionary path.
		i = 0;
		cin.clear();
		cin.ignore(1024, '\n');
	}
	if(i == 1)
	{
		if( order() != 0)
		{
			cout << endl << "原始词库的排序出现错误！" << endl;
			return 0;
		}
	}

	char infile[256], outfile[256], dictionary[256];
	ifstream in;
	ofstream out;

	cout << endl << "请输入词库文件路径：" << endl;
	if( !readPathWord(dictionary) )
		return 0;

	int j = iniIndex(dictionary);
	while(j == -1)
	{
		cout << endl << "词库索引建立失败，请检查词库路径是否正确！请重新输入，退出0" << endl;
		if( !readPathWord(dictionary) || isQuitCommand(dictionary) )
			return 0;
		j = iniIndex(dictionary);
	}

	cout << endl << "请输入待分词文件路径：" << endl ;
	if( !readPathWord(infile) )
		return -1;
	in.open(infile);
	while(!in)
	{
		cout << endl << "打开文件失败，请检查文件路径名是否正确!请重新输入，退出0" << endl;
		in.close();
		in.clear();		//reset the error state before the next open()
		if( !readPathWord(infile) || isQuitCommand(infile) )
			return -1;
		in.open(infile);
	}

	cout << endl << "请输入分词后文件输出路径：" << endl;
	if( !readPathWord(outfile) )
		return -2;
	out.open(outfile);
	while(!out)
	{
		cout << endl << "在此处创建文件失败!请重新输入路径,退出0 " << endl;
		out.close();
		out.clear();	//reset the error state before the next open()
		if( !readPathWord(outfile) || isQuitCommand(outfile) )
			return -2;
		out.open(outfile);
	}

	//iniIndex() returns one past the last slot it wrote, hence j - 1.
	if(separateFile(in, out, dictionary, 0, j - 1) == 0)
		cout << endl << "文件分词成功！！" << endl;
	else
		cout << endl << "文件分词失败！！" << endl;

	in.close();
	out.close();

	return 0;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -