⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 order.h

📁 自己下载一个语料库
💻 H
字号:
#include<iostream>
#include<string>
#include<fstream>
#define Length 5000	//初始顺串长度
#define MAXSTRING "齄齄齄齄"
using namespace std;

int StringLength(std::string str)
{
	int i = 0;
	while(str[i] != '\0')
		i++;
	return i;
}

//快速排序
template <class Record>
class QuickSorter{
	private:
		int SelectPivot(int left, int right);		//选择轴值
		int Partition(Record Array[], int left, int right);	//分割,返回轴值位置
		void swap(Record Array[], int i, int j);
		bool lt(Record Array[], int i, Record j);
		bool gt(Record Arrya[], int i, Record j);
	public:
		void Sort(Record Array[], int left, int right);
};

template< class Record >
void QuickSorter<Record>::swap(Record Array[], int i, int j)
{
	Record temp = Array[i];
	Array[i] = Array[j];
	Array[j] = temp;
}

template< class Record >
bool QuickSorter<Record>::lt(Record Array[], int i, Record j)
{
	return Array[i] < j;
}

template< class Record >
bool QuickSorter<Record>::gt(Record Array[], int i, Record j)
{
	return Array[i] > j;
}

template<class Record>
void QuickSorter<Record>::Sort(Record Array[], int left, int right){
	if(right <= left)	//排序完成
		return;
	int pivot = SelectPivot(left, right);		//选择轴值
	swap(Array, pivot, right);					//分割前将轴值放到数组末端
	pivot = Partition(Array, left, right);		//进行分割,返回轴值位置
	Sort(Array, left, pivot - 1);				//对左边数组分割
	Sort(Array, pivot + 1, right);				//对右边数组分割
}

template<class Record>
int QuickSorter<Record>::SelectPivot(int left, int right){
	return (left + right) / 2;					//选择中间地址作为轴值
}

template<class Record>
int QuickSorter<Record>::Partition(Record Array[], int left, int right){
	Record Temp;	//临时变量
	int i = left;
	int j = right;
	Temp = Array[j];
	while(i != j){
		//i指针向右移动,直到找到大于等于轴值地记录
		while( (i < j) && lt(Array, i, Temp) )
			i++;
		if( i < j )
		{
			Array[j] = Array[i];
			j--;
		}
		//j指针向右移动,直到找到小于轴值地记录
		while( (i < j) && gt(Array, j, Temp) )
			j--;
		if(i < j)
		{
			Array[i] = Array[j];
			i++;
		}
	}
	Array[i] = Temp;
	return i;
}

template<class T>
class LoserTree{
	private:
		int MaxSize;		//最大选手数
		int n;				//当前选手数
		int LowExt;			//最底层外部节点数
		int offset;			//最底层外部节点之上地结点总数
		int* B;				//赢者数数组
		T* L;				//元素数组
		void Play(int p, int lc, int rc);
		int winner(T A[], int i, int j) { return (A[i] < A[j]) ? i : j ; };
		int loser(T A[], int i, int j) { return (A[i] >= A[j]) ? i : j ; };
	public:
		LoserTree(int Treesize);
		~LoserTree();
		//初始化败者树
		void Initialize(T A[], int size);
		//返回最终胜者索引
	    int Winner();
		//位置i的选手改变后重构败者树
		void RePlay(int i);
};

template<class T>
LoserTree<T>::LoserTree(int TreeSize){
	MaxSize = TreeSize;
	B = new int[MaxSize];
	n = 0;
}

template<class T>
LoserTree<T>::~LoserTree(){
	delete[] B;
}

template<class T>
int LoserTree<T>::Winner(){
	return (n) ? B[0] : 0 ;
}

/********************************
初始化败者树
size: 选手个数
A[]:  输入顺串
*********************************/
template<class T>
void LoserTree<T>::Initialize(T A[], int size ){
	if(size > MaxSize || size < 2){
		cout << "Bad Input!" << endl ;
		return;
	}

	//初始化成员变量
	n = size;
	L = A;

	int i, s;

	//计算 s=2^log(n-1);
	for(s = 1; 2*s <= n-1; s += s);
	
	LowExt = 2 * (n - s);
	offset = 2 * s - 1;

	for(i = 2; i <= LowExt; i += 2)
		Play( (i + offset)/2, i-1, i);
	if( n % 2 ){//n为奇数时,用L[LowExt+1]和其父节点比赛
		Play( n/2, B[(n-1)/2], LowExt+1);
		i = LowExt + 3;
	}
	for(; i <= n; i += 2)
		Play( (i-LowExt+n-1)/2, i-1, i);
}

/******************************
在内部节点B[p]处开始比赛
******************************/
template<class T>
void LoserTree<T>::Play(int p, int lc, int rc){
	B[p] = loser(L, lc, rc);	//败者索引放在B[p]中
	int temp1, temp2;
	temp1 = winner(L, lc, rc);	

	while(p > 1 && p % 2){// 右子女,需沿路径继续向上比赛
		temp2 = winner(L, temp1, B[p/2]);
		B[p/2] = loser(L, temp1, B[p/2]);
		temp1 = temp2;
		p /= 2;
	}

	B[p/2] = temp1;
}

/***************************
选手i的值改变后重新开始比赛
****************************/
template<class T>
void LoserTree<T>::RePlay(int i){
	if( i <= 0 || i > n ){
		cout << "out of bound!" << endl;
		return;
	}

	int p;
	if(i <= LowExt)
		p = (i + offset) / 2;
	else
		p = (i - LowExt + n - 1) / 2;

	//B[0]中始终保持胜者索引
	B[0] = winner(L, i, B[p]);
	//B[p]中保存败者索引
	B[p] = loser(L, i, B[p]);

	for(; (p/2) >= 1; p /= 2){
		int temp;
		temp = winner(L, B[p/2], B[0]);
		B[p/2] = loser(L, B[p/2], B[0]);
		B[0] = temp;
	}
}

//从文件Start位置开始,读取count组记录到数组a[]中,返回读取的数据量
int GetContent(ifstream &in, int Start, std::string *a, int count, int Total)
{
	int sum = 0, i = 0, j;
	char buffer[256];
	in.seekg(Start, ios::beg);
	// while(!in.eof() && i < count && sum < Total)
	while( i < count && sum < Total)          //当文件没有结束
	{
		in.getline(buffer, 100);        //读入一行到缓冲区
		a[i++] = buffer;
		j = 0;
		while(buffer[j] != '\0')
			j++;
		sum += j;
		sum += 2;
	}
	//if( in.eof() && sum >= Totoal )
	if( i < count && sum >= Total )
		a[i++] = MAXSTRING;
	return sum;
}

//对文件进行初始排序,结果生成n个初始顺串,将各个顺串的初始位置记录在index数组中,返回生成顺串的个数
//in为输入文件流,out为输出文件流
int iniDictionary(ifstream &in, ofstream &out, int index[])
{
	char buffer[256];
	std::string b[Length];
	int i, j = 0, k, insum = 0, outsum = 0;
	index[0] = 0;
	while( !in.eof() )
	{
		i = 0;
		
		in.seekg(insum, ios::beg);
		while( !in.eof() && i < Length)          //当文件没有结束
		{
			in.getline(buffer, 100);        //读入一行到缓冲区
			k = 0;
			while(buffer[k] != '\0')
				k++;
			insum += k;
			insum += 2;				//记录指针位置
			b[i] = buffer;			//将单词放入数组中			
			i++;			
		}

		QuickSorter<std::string> sort;		//对一组记录进行快速排序
		sort.Sort(b, 0, i-1);

		for(k = 0; k < i; k++)
		{
			out << b[k];
			out << '\n';
			outsum += StringLength(b[k]);
			outsum += 2;
		}
		index[++j] = outsum;					//记录一次排序完成后各个顺串的初始位置
	}
	
	in.close();
	out.close();
	return j;
}


//racer为待排序数组,SIZE为待排序顺串的个数
void multiMerge(ifstream &in, ofstream &out, int SIZE, int *index)
{
	int i, winner, count;	
	std::string **instr = new std::string*[SIZE+1];	//每个顺串读取一定地数据,进行归并排序,0为输出顺串
	for(i = 0; i <= SIZE; i++)
		instr[i] = new std::string[Length/SIZE];
	int *insum = new int[SIZE+1];			//记录各个顺串在文件中的起始位置
	int *inindex = new int[SIZE+1];			//每个顺串目前指针,0为输出顺串指针
	std::string *racer = new std::string[SIZE+1];
	
	for(i = 0; i <= SIZE; i++)	//初始化各个顺串的初始位置
	{
		insum[i] = index[i];
		inindex[i] = 0;
	}

	for(i = 1; i <= SIZE; i++)
	{	//初始化待排序地顺串
		count = GetContent(in, insum[i-1], &instr[i][0], Length/SIZE, index[i]-insum[i-1]);
		insum[i-1] += count;
		//初始化败者树的参赛者
		racer[i] = instr[i][0];
	}

	LoserTree<std::string> lt(SIZE);
	lt.Initialize(racer, SIZE);		//建立败者树
	
	winner = lt.Winner();	//取得最终胜者索引

	while( racer[winner] != MAXSTRING )
	{
		//将胜者写入输出缓冲区
		instr[0][inindex[0]++] = instr[winner][inindex[winner]++];

		//输出缓冲区满则将其输入到文件中
		if( inindex[0] >= Length/SIZE )	
		{
			for(i = 0; i < inindex[0]; i++)
			{
				out << instr[0][i] ;
				out << '\n';
			}
			//输出缓冲区指针归零
			inindex[0] = 0;			
		}
		if( inindex[winner] >= Length/SIZE )	//一个顺串归并完成
		{
			if(insum[winner - 1] > index[winner] )	//这个大顺串已经排序完成,放入标志
				racer[winner] = MAXSTRING;
			else
			{
				count = GetContent(in, insum[winner-1], &instr[winner][0], Length/SIZE, index[winner]-insum[winner-1]);
				insum[winner-1] += count;
				//改变参赛选手的值
				racer[winner] = instr[winner][0];
				//将顺串指针归零
				inindex[winner] = 0;
			}
		}
		else
			racer[winner] = instr[winner][inindex[winner]];	//取新的参赛者

		//重新比赛
		lt.RePlay(winner);
		winner = lt.Winner();
	}
	
	//排序完成后,输出缓存区中剩余的内容
	for(i = 0; i < inindex[0]; i++)
	{
		out << instr[0][i] ;
		out << '\n';
	}

	delete[] insum;
	delete[] inindex;
	delete[] racer;
	for(i = 0; i <= SIZE; i++)
		delete[] instr[i];
	delete []instr;
}

int order()
{
	int index[100];		//记录排序后各个数据块的地址
	int count;			//记录初始顺串的个数
	char infile[100], outfile[100];

	ifstream in;
	ofstream out;

	cout << endl << "请输入原始词库文件路径:" << endl;
	cin >> infile;
	in.open(infile);
	
	while(!in)
	{
		cout << endl << "此路径下没有此文件,打开文件失败!请重新输入文件路径,0退出" << endl;
		in.close();
		in.clear();
		cin >> infile;
		if(infile[0] == '0')
			return -1;
		in.open(infile);
	}
	
	cout << endl << "请输入初次排序后顺串文件的输出路径:" << endl;
	cin >> outfile;
	out.open(outfile, ios::app);	
	
	while(!out)
	{
		cout << endl << "在此路径下创建文件失败!请重新输入文件路径,0退出" << endl;
		out.close();
		out.clear();
		cin >> outfile;
		if(outfile[0] == '0')
			return -2;
		out.open(outfile);
	}
	
	count = iniDictionary(in, out, index);

	cout << endl << "顺串初始化成功!" << endl;

	in.close();
	out.close();
	in.clear();
	out.clear();

	in.open(outfile);
	cout << endl << "请输入最终排好序文件的输出路径:" << endl;
	cin >> outfile;
	out.open(outfile, ios::app);
	
	while(!out)
	{
		cout << endl << "在此路径下创建文件失败!请重新输入文件路径,0退出" << endl;
		out.close();
		out.clear();
		cin >> outfile;
		if(outfile[0] == '0')
			return -2;
		out.open(outfile);
	}
	
	multiMerge(in, out, count, index);

	cout << endl << "词库排序成功!" << endl;

	in.close();
	out.close();

	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -