word.cpp

来自「一个可以将进行中文分词后的文档与标准文档进行比较的工具」· C++ 代码 · 共 258 行

CPP

258 行

#include "stdafx.h"
#include "word.h"



// Walks the evaluated segmentation `vr` and the reference segmentation `vo`
// in parallel and counts the positions where both hold the same word.
//
// Parameters:
//   vr        words produced by the segmenter under evaluation
//   vo        words of the reference (gold-standard) segmentation
//   fileinfo  path of the report that receives one line per mismatch:
//             "<index in vr>\t<word in vr>\t<index in vo>\t<word in vo>"
//
// Side effects: besides `fileinfo`, writes "分词错误排序.txt" in the current
// directory, listing every distinct mismatching pair with its number of
// occurrences, most frequent first.
//
// Returns the number of word pairs whose content matched.
int ComparePre(WordVector& vr, WordVector& vo, const char* fileinfo)
{
	int count = 0;
	int ir = 0, io = 0;

	// Sizes are cached as int so the index comparisons below are not a
	// signed/unsigned mix.
	const int evalCount = static_cast<int>(vr.size());
	const int refCount  = static_cast<int>(vo.size());

	ofstream offf(fileinfo);
	ofstream offf_wrong("分词错误排序.txt");

	// "<evaluated word>\t<reference word>" -> number of occurrences.
	map<string,int> smap;

	while (ir < evalCount && io < refCount)
	{
		WordInfo* pwir = &vr[ir];  // evaluated text
		WordInfo* pwio = &vo[io];  // reference text

		if (pwir->content == pwio->content)
		{
			count++;
			ir++; io++;
			continue;
		}

		// Mismatch: log it and remember the pair for the frequency report.
		offf<<ir<<"\t"<<pwir->content<<"\t"<<io<<"\t"<<pwio->content<<endl;
		string tt = pwir->content + "\t" + pwio->content;
		smap[tt]++;

		ir++; io++;
		if (ir == evalCount || io == refCount) break;

		pwir = &vr[ir];
		pwio = &vo[io];

		// Re-synchronise: advance whichever side lags behind until both
		// words start at the same position again (or one side runs out, in
		// which case the outer loop condition ends the comparison).
		while (pwir->startPos != pwio->startPos)
		{
			if (pwir->startPos > pwio->startPos)
			{
				io++;
				if (io == refCount) break;
				pwio = &vo[io];
			}
			else
			{
				ir++;
				if (ir == evalCount) break;
				pwir = &vr[ir];
			}
		}
	}

	offf.close();

	// Re-key the mismatch table by occurrence count so it can be listed in
	// order of frequency.
	multimap<int,string> smulmap;
	for (map<string,int>::iterator its = smap.begin(); its != smap.end(); ++its)
	{
		smulmap.insert(multimap<int,string>::value_type(its->second, its->first));
	}

	// Most frequent first. Reverse iterators are required here: walking a
	// forward iterator from end() down to begin() dereferences end()
	// (undefined behaviour) and never visits the first element.
	for (multimap<int,string>::reverse_iterator itss = smulmap.rbegin();
	     itss != smulmap.rend(); ++itss)
	{
		offf_wrong<<itss->first<<"\t"<<itss->second<<endl;
	}

	offf_wrong.close();

	return count;
}

void Compare(const char* filepath_cmp, const char* filepath_std, const char* filename_cmp, const char* filename_std, const char* curDirectory)
{
	ifstream fr(filepath_cmp);//评测文本
	ifstream fo(filepath_std);//标准文本
	string hispath = curDirectory;
	hispath += "\\";
	hispath += "history.txt";
	ofstream historyfile(hispath.c_str(),iostream::app);
	ofstream resultfile("result.txt");



	string sr,so;

	WordVector vr,vo;
	
	int pos = 0; //记录实际　汉字的起始位置
	while (fr>>sr){  //构造结果数组 
		WordInfo wi;
		wi.startPos = pos;

		if(sr[0] == '[')
		{
			sr = sr.substr(1);
		}
		int chineseLen = sr.find_first_of('/');
		if (chineseLen == string::npos)
		{
			chineseLen = sr.length();
			wi.content = sr;
		}
		else 
		{
			string ssr = sr.substr(0,chineseLen);
			wi.content = ssr;
		}

		pos += chineseLen;
		vr.push_back(wi);

	}
	


	string fileinfo;

	string filename1 = filename_cmp;
	//pos = filename1.find_last_of("\\");
	//fileinfo = filename1.substr(pos+1);
	//pos = filename1.find_last_of(".");
	fileinfo = filename1;//.substr(0,pos);

//	pos = filename1.find_last_of(".");
	//string filename2 = filename1.substr(0,pos);
	//string filename3 = filename1.substr(pos);
	filename1 = ".\\" + filename1 + "_word" + ".txt";

	

	ofstream oof(filename1.c_str());
	vector<WordInfo>::iterator itsv ;
	for(itsv = vr.begin(); itsv != vr.end(); itsv++)
	{
		WordInfo temps = *itsv;
		oof<<temps.content<<endl;
	}
	oof.close();	



	pos = 0; //记录实际　汉字的起始位置
	while (fo>>so){ //构造对照数组
		WordInfo wi;
		//wi.content = so;
		wi.startPos = pos;
		
		if(so[0] == '[')
		{
			so = so.substr(1);
		}

		int chineseLen = so.find_first_of('/');
		if (chineseLen == string::npos)
		{
			chineseLen = so.length();
			wi.content = so;
		}
		else 
		{
			string ssr = so.substr(0,chineseLen);
			wi.content = ssr;
		}
		pos += chineseLen;
		vo.push_back(wi);
		
	}

	string fileinfo1;

	filename1 = filename_std;

	//pos = filename1.find_last_of("\\");
	//fileinfo1 = filename1.substr(pos+1);
	//pos = filename1.find_last_of(".");
	fileinfo1 = filename1;//.substr(0,pos);
	fileinfo = ".\\" + fileinfo +"_" + fileinfo1 + ".txt";


	//pos = filename1.find_last_of(".");
	//filename2 = filename1.substr(0,pos);
	//filename3 = filename1.substr(pos);
	filename1 = ".\\" + filename1 + "_word" + ".txt";

	ofstream oof1(filename1.c_str());
	
	for(itsv = vo.begin(); itsv != vo.end(); itsv++)
	{
		WordInfo temps = *itsv;
		oof1<<temps.content<<endl;
	}
	oof1.close();	

	int sc = ComparePre(vr,vo,fileinfo.c_str());
	float s1 = (float)sc/(float)vr.size();
	float s2 = (float)sc/(float)vo.size();
	float s3 = 2*s1*s2/(s1+s2);
	
	resultfile<<"\n\n"<<filepath_cmp<<endl<<filepath_std<<endl
				<<"切分出的词语总数                           "<<vr.size()<<endl
				<<"标准结果中的词语总数                       "<<vo.size()<<endl
				<<"切分出的词语中出现在标准结果中的词语数     "<<sc<<endl
				<<"分词正确率                                 "<<s1*100<<"％"<<endl
				<<"分词召回率                                 "<<s2*100<<"％"<<endl
				<<"分词F值                                    "<<s3*100<<"％"<<endl;

	historyfile<<"\n\n"<<filepath_cmp<<endl<<filepath_std<<endl
				<<"切分出的词语总数                           "<<vr.size()<<endl
				<<"标准结果中的词语总数                       "<<vo.size()<<endl
				<<"切分出的词语中出现在标准结果中的词语数     "<<sc<<endl
				<<"分词正确率                                 "<<s1*100<<"％"<<endl
				<<"分词召回率                                 "<<s2*100<<"％"<<endl
				<<"分词F值                                    "<<s3*100<<"％"<<endl;
	

	resultfile.close();
	historyfile.close();
	fr.close();
	fo.close();


}

word.cpp - 源码说明

本页面展示了「一个可以将进行中文分词后的文档与标准文档进行比较的工具」中的 word.cpp 源码文件，采用 C++ 编程语言编写，共 258 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与文档相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码：Ctrl + C

搜索代码：Ctrl + F

全屏模式：F11

增大字号：Ctrl + =

减小字号：Ctrl + -

显示快捷键：?