split.cpp

来自「自己写的简单分词程序」· C++ 代码 · 共 297 行

CPP

297 行

#include "Split.h"

// Constructs an empty splitter.
// Allocates the fixed-size working arrays (100 tokens, 100 results,
// 200 English sub-tokens, 100 English-run counters) and zeroes every counter.
Split::Split()
{
	// Nothing has been tokenised or segmented yet.
	nLength = 0;
	index = 0;
	nEngLength = 0;

	pStr = new string[100];
	pResult = new string[100];
	strEng = new string[200];

	// The trailing "()" value-initialises the array, so every run counter
	// starts at zero without an explicit loop.
	nEngStrNum = new int[100]();
}

// Splits a NUL-terminated byte string into tokens and appends them to pStr.
//
// A byte with the high bit set is treated as the lead byte of a double-byte
// character and is stored together with the byte that follows it as one
// token. A byte with the high bit clear starts an "English" token that
// extends until the next space, the next high-bit byte, or the end of the
// input. A space therefore ends up as the FIRST byte of the following token.
// For every maximal sequence of consecutive English tokens, the number of
// tokens in it is accumulated in nEngStrNum.
//
// Parameters:
//   Source - NUL-terminated input; NULL is treated as empty input.
// Returns:
//   pStr, the member token array. nLength is incremented once per stored
//   token.
//
// Tokenising stops silently once pStr is full. An incomplete double-byte
// character at the very end of the input (lead byte directly followed by the
// terminator) is dropped.
//
// NOTE(review): nLength keeps growing across calls, while the run counter
// used to index nEngStrNum restarts at 0 on every call - confirm that callers
// invoke this only once per Split object.
string* Split::SplitString(char *Source)
{
	// Number of slots in pStr; must match the allocation in the constructor.
	const int kMaxTokens = 100;

	if(Source == NULL)
	{
		return pStr;
	}

	// Two payload bytes plus terminator; lives on the stack so nothing leaks.
	char chTemp[3] = {0, 0, 0};

	int iPos = 0;      // index of the next unread byte in Source
	int iRun = 0;      // index of the current English run in nEngStrNum
	bool bEng = false; // true while the previous token was an English token

	while(Source[iPos] != 0 && nLength < kMaxTokens)
	{
		int j = iPos;

		// Test the high bit instead of "< 0" so the result does not depend
		// on whether plain char is signed on the target platform.
		if((Source[j] & 0x80) != 0)
		{
			// A double-byte character closes the English run before it.
			if(bEng)
			{
				iRun++;
			}
			bEng = false;

			if(Source[j+1] == 0)
			{
				// Lead byte without its partner: consuming two bytes here
				// would step past the terminator, so stop instead.
				break;
			}

			chTemp[0] = Source[j];
			chTemp[1] = Source[j+1];
			pStr[nLength] = chTemp;
			j = j + 1;
		}
		else
		{
			// Count the consecutive English tokens that make up this run.
			nEngStrNum[iRun] = nEngStrNum[iRun] + 1;
			bEng = true;

			pStr[nLength] = string(1, Source[j]);

			// Extend the token over the following single-byte characters,
			// stopping before a space, a high-bit byte or the terminator.
			while(Source[j+1] != ' ' && Source[j+1] != 0 && (Source[j+1] & 0x80) == 0)
			{
				j = j + 1;
				pStr[nLength] += Source[j];
			}
		}

		nLength++;
		iPos = j + 1;
	}

	return pStr;
}

int Split::getLength()
{
	return this->nLength;
}

// Forward maximum matching.
//
// Segments the tokens currently held in pStr[0 .. nLength) against a
// dictionary. Starting at token j, all remaining tokens are concatenated into
// one candidate string, which is then shortened from the right until one of
// the following holds:
//   * the candidate is found in the dictionary (exact match),
//   * a single character is left (one byte, or two bytes ending in a
//     high-bit byte),
//   * the candidate contains no high-bit byte at all; it is then handed to
//     FindInterpunction and each piece it produces becomes a result.
// Every accepted piece is appended to pResult, echoed on stdout followed by
// endl, and counted in index. A piece that is exactly " " is skipped.
//
// Parameters:
//   strArr - dictionary entries.
// Returns:
//   pResult, the member result array; getIndex() reports how many slots are
//   in use.
//
// Segmentation stops silently once pResult is full.
//
// NOTE(review): shortening removes two bytes when the last byte has its high
// bit set and one byte otherwise, which presumably assumes double-byte
// characters whose bytes both have the high bit set - confirm the input
// encoding.
string* Split::Seg(vector<string> strArr)
{
	// Number of slots in pResult; must match the allocation in the constructor.
	const int kMaxResults = 100;

	int j = 0; // index of the first token that has not been consumed yet
	while(j < nLength && index < kMaxResults)
	{
		// Build the candidate from every token that is still unconsumed.
		string strSource;
		for(int i = j; i < nLength; i++)
		{
			strSource += pStr[i];
		}
		if(strSource.empty())
		{
			// Only empty tokens are left; there is nothing more to segment.
			break;
		}

		// Shorten the candidate until it can be accepted. The dictionary is
		// searched in place: going through FindStrInDic would copy the whole
		// vector on every probe because it takes its arguments by value.
		bool bIsEngString = false;
		while(find(strArr.begin(), strArr.end(), strSource) == strArr.end())
		{
			const string::size_type nLen = strSource.length();
			// High-bit test instead of "< 0" so the result does not depend
			// on the signedness of plain char.
			const bool bLastIsHigh = (strSource[nLen - 1] & 0x80) != 0;

			// Down to one character: accept it even though it is not in the
			// dictionary.
			if(nLen == 1 || (nLen == 2 && bLastIsHigh))
			{
				break;
			}

			if(bLastIsHigh)
			{
				// Drop the trailing double-byte character.
				strSource.erase(nLen - 2);
				continue;
			}

			// The last byte is single-byte. Check whether the candidate is
			// made of single-byte characters only.
			bool bHasHigh = false;
			for(string::size_type n = 0; n < nLen; n++)
			{
				if((strSource[n] & 0x80) != 0)
				{
					bHasHigh = true;
					break;
				}
			}
			if(!bHasHigh)
			{
				// Pure English string: it is not segmented any further.
				bIsEngString = true;
				break;
			}

			// Mixed content: drop the trailing single-byte character.
			strSource.erase(nLen - 1);
		}

		if(bIsEngString)
		{
			// Separate words from punctuation; the pieces arrive in
			// strEng[0 .. nEngLength).
			FindInterpunction(strSource);
			for(int k = 0; k < nEngLength; k++)
			{
				if(index < kMaxResults)
				{
					pResult[index] = strEng[k];
					index++;
					cout<<strEng[k]<<endl;
				}
				strEng[k].erase();
			}
			nEngLength = 0;
		}
		else if(strSource != " ")
		{
			// The loop condition guarantees a free slot here.
			pResult[index] = strSource;
			cout<<strSource<<endl;
			index++;
		}

		// Advance by the number of tokens the accepted bytes cover. Counting
		// the tokens directly keeps j in step with pStr for every kind of
		// match (dictionary hit, single character, English run) and always
		// moves forward by at least one token, so the loop terminates.
		const string::size_type nConsumed = strSource.length();
		string::size_type nCovered = 0;
		int nAdvance = 0;
		while(j + nAdvance < nLength && nCovered < nConsumed)
		{
			nCovered += pStr[j + nAdvance].length();
			nAdvance++;
		}
		if(nAdvance == 0)
		{
			nAdvance = 1;
		}
		j += nAdvance;
	}

	return this->pResult;
}

// Reports whether `str` occurs in the dictionary `strArr`.
// The lookup is a linear scan that compares whole entries for equality.
// Returns true on an exact match, false otherwise.
bool Split::FindStrInDic(string str,vector<string> strArr)
{
	return find(strArr.begin(), strArr.end(), str) != strArr.end();
}

int Split::getIndex()
{
	return this->index;
}

// Returns true when `c` is one of the separator characters (punctuation
// marks and the space) that split an English string into words.
static bool IsInterpunction(char c)
{
	static const char kInterpunction[] = ", . ? / < > ; : ' \" [ { ] } \\ | ! @ # $ % ^ & * ( )";
	// strchr would also "find" the terminating NUL, so exclude it explicitly.
	return c != 0 && strchr(kInterpunction, c) != NULL;
}

// If an English string contains punctuation marks, takes them out.
//
// Splits `str` into words and separator characters and appends the pieces, in
// their original order, to strEng starting at slot nEngLength. Each word and
// each separator character becomes one entry, and nEngLength is advanced once
// per stored entry. A string without any word character (empty, or made of
// separators only) is stored unchanged as a single entry.
//
// Parameters:
//   str - the string to split.
// Returns:
//   strEng, the member array that received the pieces.
//
// Pieces that do not fit into strEng any more are dropped silently.
string* Split::FindInterpunction(string str)
{
	// Number of slots in strEng; must match the allocation in the constructor.
	const int kMaxEngTokens = 200;

	// Look for at least one word character.
	bool bHasWord = false;
	for(string::size_type n = 0; n < str.length(); n++)
	{
		if(!IsInterpunction(str[n]))
		{
			bHasWord = true;
			break;
		}
	}

	if(!bHasWord)
	{
		// Nothing to separate: keep the string as it is.
		if(nEngLength < kMaxEngTokens)
		{
			strEng[nEngLength] = str;
			nEngLength++;
		}
		return strEng;
	}

	// Walk the string itself rather than a strtok'ed copy: no scratch buffer
	// has to be sized or freed, and the position of every separator is known
	// exactly even when separators are leading or adjacent.
	string word;
	for(string::size_type n = 0; n < str.length() && nEngLength < kMaxEngTokens; n++)
	{
		if(!IsInterpunction(str[n]))
		{
			word += str[n];
			continue;
		}

		// A separator ends the word collected so far ...
		if(!word.empty())
		{
			strEng[nEngLength] = word;
			nEngLength++;
			word.erase();
		}
		// ... and is stored as an entry of its own.
		if(nEngLength < kMaxEngTokens)
		{
			strEng[nEngLength] = string(1, str[n]);
			nEngLength++;
		}
	}

	// Store the word that runs up to the end of the string.
	if(!word.empty() && nEngLength < kMaxEngTokens)
	{
		strEng[nEngLength] = word;
		nEngLength++;
	}

	return strEng;
}

split.cpp - 源码说明

本页面展示了「自己写的简单分词程序」中的 split.cpp 源码文件，采用 C++ 编程语言编写，共 297 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫开发者社区收录了大量与中文分词相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码：Ctrl + C

搜索代码：Ctrl + F

全屏模式：F11

增大字号：Ctrl + =

减小字号：Ctrl + -

显示快捷键：?