⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zwfc.cpp

📁 中文分词算法,效率很高,使用词典树状搜索进行单词切割,并提供扩充词库的函数!
💻 CPP
字号:
//---------------------------------------------------------------------------


#pragma hdrstop

#include "ZWFC.h"

//---------------------------------------------------------------------------
#pragma package(smart_init)
//---------------------------------------------------------------------------
// Default constructor.
// Marks every slot of the hash table as unused (-1), sizes H to 6768
// entries, opens fin/fout on the paths held in Sfilename/Dfilename (both
// declared outside this block) and finally loads the dictionary via LoadDic().
Dictiory::Dictiory()
{
	// -1 is the "no entry" marker; the rest of the file tests hash[..][..] >= 0.
	for(int row = 0; row < e1 - s1; ++row)
	{
		for(int col = 0; col < e2 - s2; ++col)
		{
			hash[row][col] = -1;
		}
	}

	// NOTE(review): 6768 is presumably the number of GB2312 hanzi code
	// positions (72 rows x 94 columns) - confirm against ZWFC.h.
	H.resize(6768);

	fin.open(Sfilename.c_str());
	fout.open(Dfilename.c_str());

	LoadDic();
}
//---------------------------------------------------------------------------
// Constructs a dictionary from explicit file names.
//   sfilename - path opened on fin (the dictionary read by LoadDic()).
//   dfilename - path opened on fout (the stream the other methods write to).
// The hash table is reset to -1 ("no entry") and H is sized to 6768 entries
// before the files are opened and the dictionary is loaded.
Dictiory::Dictiory(string sfilename,string dfilename)
{
	int row = 0;
	while(row < e1 - s1)
	{
		int col = 0;
		while(col < e2 - s2)
		{
			hash[row][col] = -1;
			++col;
		}
		++row;
	}

	H.resize(6768);

	fin.open(sfilename.c_str());
	fout.open(dfilename.c_str());

	LoadDic();
}
//---------------------------------------------------------------------------
// Binary-searches the bucket list H[k].W for the bucket whose first node's
// key has the same byte length as str. The search compares lengths only,
// never the text itself.
//   str - word suffix whose length is looked up.
//   k   - index of the head-character entry in H.
// Returns the bucket index inside H[k].W, or -1 when no bucket has that
// length (including when H[k].W is empty).
int Dictiory::BinarySearch(string str,int k)
{
	const string::size_type wanted = str.length();
	int lo = 0;
	int hi = H[k].W.size() - 1;	// becomes -1 for an empty list, so the loop is skipped

	while(lo <= hi)
	{
		const int mid = (lo + hi)/2;
		const string::size_type probed = H[k].W[mid]->key.size();

		if(probed < wanted)
			lo = mid + 1;
		else if(probed > wanted)
			hi = mid - 1;
		else
			return mid;
	}
	return -1;
}
//---------------------------------------------------------------------------
// Adds the word suffix str to the entry H[k].
// A suffix longer than anything recorded so far (H[k].size) gets a new
// bucket appended at the end of H[k].W and becomes the new recorded maximum;
// every other suffix is handed to InsertWord().
void Dictiory::AddWord(string str,int k)
{
	if(!(str.length() > H[k].size))
	{
		InsertWord(str,k);
		return;
	}

	// New longest suffix: appending keeps the list ordered by length.
	H[k].size = str.length();
	H[k].W.push_back(new Second(str));
}
//---------------------------------------------------------------------------
// Stores the word suffix str under entry H[k].
// If a bucket for this length already exists (found with BinarySearch) the
// new node is linked at the tail of that bucket's chain. Otherwise a new
// bucket is opened at the position that keeps H[k].W ordered by key length.
// No check for an already-present identical word is made.
void Dictiory::InsertWord(string str,int k)
{
	const int bucket = BinarySearch(str,k);

	if(bucket != -1)
	{
		// Existing bucket: walk to the last node and append.
		Second *tail = H[k].W[bucket];
		while(tail->next)
			tail = tail->next;
		tail->next = new Second(str);
		return;
	}

	// No bucket of this length yet: find the first bucket holding longer keys.
	const int last = H[k].W.size() - 1;
	const int len = str.length();
	int pos = 0;
	while(pos <= last&&len > H[k].W[pos]->key.size())
		pos++;

	// Grow by one slot and shift the tail right to open a gap at pos.
	H[k].W.resize(H[k].W.size() + 1);
	int dst = last + 1;
	while(dst > pos)
	{
		H[k].W[dst] = H[k].W[dst - 1];
		--dst;
	}
	H[k].W[pos] = new Second(str);
}
//---------------------------------------------------------------------------
// Reads the next line from fin and parses it as a non-negative decimal count.
// Leading blanks are skipped and parsing stops at the first character that
// is not a digit, so trailing garbage such as a '\r' left by CRLF line
// endings no longer corrupts the value.
// Returns the parsed number, or 0 when the line is empty, does not start
// with a digit, or could not be read.
int Dictiory::GetNum()
{
	char cstr[maxwordlen];
	cstr[0] = '\0';		// stay well-defined even if getline() stores nothing
	fin.getline(cstr,maxwordlen);

	const char *p = cstr;
	while(*p == ' '||*p == '\t')
		++p;

	int n = 0;
	for(; *p >= '0'&&*p <= '9'; ++p)
		n = n * 10 + (*p - '0');
	return n;
}
//---------------------------------------------------------------------------
void Dictiory::LoadDic()
{
	char cstr[maxwordlen];
	string str;
	int i,j,k = 0,wordnumber;
    while(fin.getline(cstr,maxwordlen))
    {
		i = (unsigned char)cstr[0] - s1;
		j = (unsigned char)cstr[1] - s2;
		hash[i][j] = k;		
		H[k].key = cstr;				
		wordnumber = GetNum();				
		for(i = 0; i < wordnumber;i++)
		{
             fin.getline(cstr,maxwordlen);             
			 str = cstr;
			 str = str.substr(2,str.length() - 2);			 
	    	 AddWord(str,k);
		}
		k++;
	}
}
//---------------------------------------------------------------------------
// Returns true when c, read as an unsigned byte, lies in [s1, e1) - the
// range this file treats as the lead byte of a dictionary character.
bool Dictiory::IsC(char c)
{
	const unsigned byte = unsigned((unsigned char)c);
	if(byte < s1)
		return false;
	return byte < e1;
}
//---------------------------------------------------------------------------
// Returns true when c, read as an unsigned byte, is a 7-bit value
// (0x00..0x7F), i.e. a single-byte character.
bool Dictiory::IsEc(char c)
{
	const unsigned byte = unsigned((unsigned char)c);
	return byte < 0X80;
}
//---------------------------------------------------------------------------
// Tells whether str is stored in bucket t of entry H[k].
// Walks the bucket's linked chain and compares each node's key with str.
//   str - word suffix to look for.
//   k   - index of the head-character entry in H.
//   t   - bucket index inside H[k].W (e.g. as returned by BinarySearch).
bool Dictiory::IsWord(string str,int k,int t)
{
	for(Second *node = H[k].W[t]; node; node = node->next)
	{
		if(node->key == str)
			return true;
	}
	return false;
}
//---------------------------------------------------------------------------
// Splits the leading run of bytes that IsC() rejects off the front of str.
// Bytes accepted by IsEc() advance the scan by one; any other rejected byte
// is treated as the first half of a two-byte character and advances it by two.
// When the run is non-empty it is pushed onto stk and removed from str;
// otherwise both arguments are left untouched.
void Dictiory::SkipNotChinese(string &str,stack<string> &stk)
{
	const string::size_type R = str.length();
	string::size_type L = 0;
	while(L < R&&!IsC(str[L]))
	{
		if(!IsEc(str[L]))
			L++;
		L++;
	}
	// A two-byte step taken on the very last byte overshoots the end; clamp
	// so the substr() calls below cannot throw std::out_of_range.
	if(L > R)
		L = R;
	if(L > 0)
	{
		stk.push(str.substr(0,L));
		str = str.substr(L);
	}
}
//---------------------------------------------------------------------------
// Segments the text file named by s and writes one token per line to fout.
//
// The input is read byte by byte through fcin and split into alternating
// runs:
//   * a run of bytes rejected by IsC() is written to fout unchanged;
//   * a run of two-byte characters (lead byte accepted by IsC()) is cut
//     into dictionary words.
// Each run and its byte length is also echoed to cout.
//
// Cutting a run: the whole pending string is tried as a word (its first
// character is looked up through hash[][], the rest through BinarySearch()
// and IsWord()). On failure the first character is set aside in sstr and the
// shorter remainder is tried. A remainder of a single character is emitted
// as it is. After every emitted token the set-aside prefix becomes the new
// pending string. Tokens are found back to front, so they are collected on
// a stack and popped to restore text order.
//
// A run that ends at end-of-file is segmented like any other, and the input
// stream is closed before returning so the function can be called again.
void Dictiory::SegmentWord(string s)
{
	stack<string> stk;
	string str,sstr;
	int i,j,startpos,endpos;
	char c;

	fcin.clear();		// drop eof/fail bits left by an earlier call
	fcin.open(s.c_str());

	// `more` is true while c holds a byte that has not been consumed yet.
	bool more = !fcin.read(&c,sizeof(char)).fail();
	while(more)
	{
		if(IsC(c))
		{
			// Collect the run of two-byte characters.
			str.clear();
			while(more&&IsC(c))
			{
				str += c;
				more = !fcin.read(&c,sizeof(char)).fail();
				if(more)
				{
					str += c;	// trail byte, taken without inspection
					more = !fcin.read(&c,sizeof(char)).fail();
				}
			}
			cout << str << " " << str.length() << endl;

			sstr.clear();
			startpos = 0,endpos = str.length();
			while(startpos < endpos)
			{
				if(str.length() <= 2)
				{
					// Single character left: emit it, then resume with the
					// prefix that was set aside.
					stk.push(str);
					if(!sstr.empty())
					{
						str = sstr;
						sstr.clear();
					}
					startpos += 2;
				}
				else
				{
					bool matched = false;
					// i is in range because IsC() accepted this lead byte;
					// the trail byte was never checked, so j is validated here.
					i = (unsigned char)str[0] - s1;
					j = (unsigned char)str[1] - s2;
					if(j >= 0&&j < e2 - s2&&hash[i][j] >= 0)
					{
						string word = str.substr(2,str.length() - 2);
						int in = BinarySearch(word,hash[i][j]);
						if((in != -1)&&IsWord(word,hash[i][j],in))
						{
							stk.push(H[hash[i][j]].key + word);
							startpos += str.length();
							str = sstr;
							sstr.clear();
							matched = true;
						}
					}
					if(!matched)
					{
						// Not a word: set the first character aside and retry
						// with the shorter remainder.
						sstr = sstr + str.substr(0,2);
						str = str.substr(2,str.length() - 2);
					}
				}
			}
			while(!stk.empty())
			{
				fout << stk.top() << endl;
				stk.pop();
			}
		}
		else
		{
			// Collect and emit the run of bytes that IsC() rejects.
			str.clear();
			while(more&&!IsC(c))
			{
				str += c;
				more = !fcin.read(&c,sizeof(char)).fail();
			}
			fout << str << endl;
			cout << str << " " << str.length() <<  endl;
		}
	}

	fcin.close();
	fcin.clear();
}
//---------------------------------------------------------------------------
void Dictiory::PrintDic()
	{
		for(int i = 0; i < e1 - s1;i++)
			for(int j = 0; j < e2 - s2;j++)
			{
				if(hash[i][j] >= 0)
				{
					fout << H[hash[i][j]].key << endl;
					for(int k = 0; k < H[hash[i][j]].W.size() ;k++)
					{
						Second *t = H[hash[i][j]].W[k];
						while(t)
						{
							fout << H[hash[i][j]].key;
							fout << t->key << endl;
							t = t->next;
						}
					}
				}
				fout << endl;
			}
	}
//---------------------------------------------------------------------------

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -