⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 word.cpp

📁 一个可以将进行中文分词后的文档与标准文档进行比较的工具
💻 CPP
字号:
#include "stdafx.h"
#include "word.h"



/**
 * Word-level matching between an evaluated segmentation and a reference one.
 *
 * Both word lists are walked in parallel. When the two current words have the
 * same content the match counter is incremented and both cursors advance.
 * When they differ, the pair is logged, both cursors advance once, and then
 * whichever side has the smaller startPos keeps advancing until the two
 * startPos values are equal again (or one list is exhausted).
 *
 * Side effects:
 *   - the file named by `fileinfo` is (re)written with one line per mismatch:
 *     "<index in vr>\t<word in vr>\t<index in vo>\t<word in vo>"
 *   - "分词错误排序.txt" in the working directory is (re)written with every
 *     distinct "<word in vr>\t<word in vo>" pair preceded by its number of
 *     occurrences, highest count first.
 *
 * @param vr        words of the text under evaluation
 * @param vo        words of the reference (standard) text
 * @param fileinfo  path of the mismatch log file
 * @return number of compared positions at which both lists hold the same word
 */
int ComparePre(WordVector& vr, WordVector& vo, const char* fileinfo)
{
	int count = 0;

	// Unsigned cursors: they are only ever compared against size().
	size_t ir = 0, io = 0;

	WordInfo *pwir, *pwio;

	ofstream offf(fileinfo);

	ofstream offf_wrong("分词错误排序.txt");
	map<string,int> smap;	// "evaluated\treference" pair -> occurrence count

	while ( (ir < vr.size()) && (io < vo.size()) )
	{
		pwir = &vr[ir];  // evaluated text
		pwio = &vo[io];  // reference text

		if (pwir->content != pwio->content)
		{
			offf << ir << "\t" << pwir->content << "\t" << io << "\t" << pwio->content << "\n";
			smap[pwir->content + "\t" + pwio->content]++;

			ir++; io++;
			if (ir == vr.size() || io == vo.size()) break;

			pwir = &vr[ir];
			pwio = &vo[io];

			// Re-synchronise: advance the side that is behind until both
			// words start at the same position again.
			while (pwir->startPos != pwio->startPos)
			{
				if (pwir->startPos > pwio->startPos)
				{
					io++;
					if (io == vo.size()) break;
					pwio = &vo[io];
				}
				else
				{
					ir++;
					if (ir == vr.size()) break;
					pwir = &vr[ir];
				}
			}
		}
		else
		{
			count++;
			ir++; io++;
		}
	}

	offf.close();

	// Re-key the mismatch table by count so it can be listed by frequency.
	multimap<int,string> smulmap;
	for (map<string,int>::iterator its = smap.begin(); its != smap.end(); ++its)
	{
		smulmap.insert(multimap<int,string>::value_type(its->second, its->first));
	}

	// Walk from the highest count to the lowest. Reverse iterators are used
	// because end() must never be dereferenced and begin() must be included.
	for (multimap<int,string>::reverse_iterator itss = smulmap.rbegin(); itss != smulmap.rend(); ++itss)
	{
		offf_wrong << itss->first << "\t" << itss->second << "\n";
	}

	offf_wrong.close();

	return count;
}

void Compare(const char* filepath_cmp, const char* filepath_std, const char* filename_cmp, const char* filename_std, const char* curDirectory)
{
	ifstream fr(filepath_cmp);//评测文本
	ifstream fo(filepath_std);//标准文本
	string hispath = curDirectory;
	hispath += "\\";
	hispath += "history.txt";
	ofstream historyfile(hispath.c_str(),iostream::app);
	ofstream resultfile("result.txt");



	string sr,so;

	WordVector vr,vo;
	
	int pos = 0; //记录实际 汉字的起始位置
	while (fr>>sr){  //构造结果数组 
		WordInfo wi;
		wi.startPos = pos;

		if(sr[0] == '[')
		{
			sr = sr.substr(1);
		}
		int chineseLen = sr.find_first_of('/');
		if (chineseLen == string::npos)
		{
			chineseLen = sr.length();
			wi.content = sr;
		}
		else 
		{
			string ssr = sr.substr(0,chineseLen);
			wi.content = ssr;
		}

		pos += chineseLen;
		vr.push_back(wi);

	}
	


	string fileinfo;

	string filename1 = filename_cmp;
	//pos = filename1.find_last_of("\\");
	//fileinfo = filename1.substr(pos+1);
	//pos = filename1.find_last_of(".");
	fileinfo = filename1;//.substr(0,pos);

//	pos = filename1.find_last_of(".");
	//string filename2 = filename1.substr(0,pos);
	//string filename3 = filename1.substr(pos);
	filename1 = ".\\" + filename1 + "_word" + ".txt";

	

	ofstream oof(filename1.c_str());
	vector<WordInfo>::iterator itsv ;
	for(itsv = vr.begin(); itsv != vr.end(); itsv++)
	{
		WordInfo temps = *itsv;
		oof<<temps.content<<endl;
	}
	oof.close();	



	pos = 0; //记录实际 汉字的起始位置
	while (fo>>so){ //构造对照数组
		WordInfo wi;
		//wi.content = so;
		wi.startPos = pos;
		
		if(so[0] == '[')
		{
			so = so.substr(1);
		}

		int chineseLen = so.find_first_of('/');
		if (chineseLen == string::npos)
		{
			chineseLen = so.length();
			wi.content = so;
		}
		else 
		{
			string ssr = so.substr(0,chineseLen);
			wi.content = ssr;
		}
		pos += chineseLen;
		vo.push_back(wi);
		
	}

	string fileinfo1;

	filename1 = filename_std;

	//pos = filename1.find_last_of("\\");
	//fileinfo1 = filename1.substr(pos+1);
	//pos = filename1.find_last_of(".");
	fileinfo1 = filename1;//.substr(0,pos);
	fileinfo = ".\\" + fileinfo +"_" + fileinfo1 + ".txt";


	//pos = filename1.find_last_of(".");
	//filename2 = filename1.substr(0,pos);
	//filename3 = filename1.substr(pos);
	filename1 = ".\\" + filename1 + "_word" + ".txt";

	ofstream oof1(filename1.c_str());
	
	for(itsv = vo.begin(); itsv != vo.end(); itsv++)
	{
		WordInfo temps = *itsv;
		oof1<<temps.content<<endl;
	}
	oof1.close();	

	int sc = ComparePre(vr,vo,fileinfo.c_str());
	float s1 = (float)sc/(float)vr.size();
	float s2 = (float)sc/(float)vo.size();
	float s3 = 2*s1*s2/(s1+s2);
	
	resultfile<<"\n\n"<<filepath_cmp<<endl<<filepath_std<<endl
				<<"切分出的词语总数                           "<<vr.size()<<endl
				<<"标准结果中的词语总数                       "<<vo.size()<<endl
				<<"切分出的词语中出现在标准结果中的词语数     "<<sc<<endl
				<<"分词正确率                                 "<<s1*100<<"%"<<endl
				<<"分词召回率                                 "<<s2*100<<"%"<<endl
				<<"分词F值                                    "<<s3*100<<"%"<<endl;

	historyfile<<"\n\n"<<filepath_cmp<<endl<<filepath_std<<endl
				<<"切分出的词语总数                           "<<vr.size()<<endl
				<<"标准结果中的词语总数                       "<<vo.size()<<endl
				<<"切分出的词语中出现在标准结果中的词语数     "<<sc<<endl
				<<"分词正确率                                 "<<s1*100<<"%"<<endl
				<<"分词召回率                                 "<<s2*100<<"%"<<endl
				<<"分词F值                                    "<<s3*100<<"%"<<endl;
	

	resultfile.close();
	historyfile.close();
	fr.close();
	fo.close();


}






















⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -