⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 chinese_get.cpp

📁 从网页中提取出汉字信息
💻 CPP
字号:
#include<iostream.h>
#include<string.h>
#include<fstream.h>
//Keyword character table: 11 distinct symbols, lower-case set followed by the
//upper-case set. check_key() returns (position % 11), so both cases of a letter
//map to the same column of state_switch:
//  t=0 i=1 l=2 e=3 h=4 1=5 2=6 p=7 <=8 >=9 /=10
//Declared as pointer-to-const because a string literal must not be bound to a
//plain char* (ill-formed since C++11); the table is only ever read.
const char *statekey="tileh12p<>/TILEH12P<>/";
//Characters that are never copied to the output (32 symbols + the terminating
//'\0', which judgedelchar() also scans).
char delchar[33]="0123456789&#@ %^=-\"<>/;:!(+)?,._";
//State transition matrix: state_switch[current state][column from check_key()].
//Original note: 0 is the start state; 7, 10, 12 and 15 are terminal states.
//NOTE(review): the table only has states 0..11, so "12" and "15" in that note
//cannot refer to rows of this matrix - confirm the intended terminal states.
//                          t  i  l  e  h  1  2  p  <  >  /
int state_switch[12][11]={ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0},   //state 0
                           { 2, 0, 0, 0, 7, 0, 0, 9, 1, 0,11},   //state 1
                           { 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0},   //state 2
                           { 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0},   //state 3
                           { 0, 0, 5, 0, 0, 0, 0, 0, 1, 0, 0},   //state 4
                           { 0, 0, 0, 6, 0, 0, 0, 0, 1, 0, 0},   //state 5
                           { 0, 0, 0, 0, 0, 0, 0, 0, 1,10, 0},   //state 6
                           { 0, 0, 0, 0, 0, 8, 8, 0, 1, 0, 0},   //state 7
                           { 0, 0, 0, 0, 0, 0, 0, 0, 1,10, 0},   //state 8
                           { 0, 0, 0, 0, 0, 0, 0, 0, 1,10, 0},   //state 9
                           {10,10,10,10,10,10,10,10, 1,10,10},   //state 10
                           { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0} }; //state 11
//Holds the scanning state used while copying selected characters of an input
//file to an output file (see operate()), plus small character/string helpers.
class character_pickde
{
    public:
		//Most recently read character. Two bytes are required: operate() fills
		//this buffer with istream::get(buf,2,delim), which stores one character
		//AND a terminating '\0'; a one-byte array was overrun on every read.
		char current_ch[2];
        int state;//current state (row index into state_switch)
		int befor_state;//state that was current before the latest character

        character_pickde(char,int);//constructor: initial character and initial state
		int check_key(char);//look the character up in the keyword table; -1 if absent
		int judgeabc(char);//1 if the character is an ASCII letter, else 0
		int judgedelchar(char);//1 if the character is listed in delchar, else 0
		char *exchange(int);//convert a number to its digit string (caller owns the buffer)
		char *string_join(char*,char*);//build "<prefix><digits>.txt" (caller owns the buffer)
		int judge_char(char*,char*);//1 if the first two bytes (one double-byte character) are equal
		void operate(int);//process the input file with the given number
};
//-------------------------------------------------------------------
//Constructor: a is stored as the current character, b becomes both the current
//and the previous state.
character_pickde::character_pickde(char a,int b)
	:state(b),befor_state(b)
{
	current_ch[0]=a;
}
//-------------------------------------------------------------------

//Look e up among the 22 characters of statekey.
//Returns the position of the first match modulo 11 (so the lower- and
//upper-case halves of the table share column numbers), or -1 when e is absent.
int character_pickde::check_key(char e)
{
	const void *hit=memchr(statekey,e,22);
	if(hit==0)
		return -1;
	return int(static_cast<const char*>(hit)-statekey)%11;
}
//--------------------------------------------------------------------
//Returns 1 when e is an ASCII letter ('a'-'z' or 'A'-'Z'), otherwise 0.
int character_pickde::judgeabc(char e)
{
	const bool is_lower=('a'<=e)&&(e<='z');
	const bool is_upper=('A'<=e)&&(e<='Z');
	return (is_lower||is_upper)?1:0;
}
//--------------------------------------------------------------------
//Returns 1 when e is one of the 33 bytes of delchar (the 32 listed symbols and
//the terminating '\0'), otherwise 0.
int character_pickde::judgedelchar(char e)
{
	const char *cursor=delchar;
	const char *const stop=delchar+33;
	while(cursor!=stop)
	{
		if(*cursor==e)
			return 1;
		++cursor;
	}
	return 0;
}
//----------------------------------------------------------------------
//Compare the first two bytes of a and b (one double-byte character each).
//Returns 1 when both bytes match, 0 otherwise; the second byte is only
//inspected when the first one matches.
int character_pickde::judge_char(char *a,char *b)
{
	return (a[0]==b[0]&&a[1]==b[1])?1:0;
}
//--------------------------------------------------------------------
//Convert a number into its decimal digit string.
//
//a       number to convert.
//returns a new[]-allocated buffer the caller must release with delete[].
//        It holds the digits, most significant first, padded on the right
//        with '#' up to a minimum of three characters, followed by '\0'
//        (e.g. 7 -> "7##", 42 -> "42#", 123 -> "123"). Callers that read only
//        the first three characters see exactly what they did before for
//        1..999. A negative number yields "###", as before.
//
//Fixes over the previous version:
// - numbers >= 1000 wrote past the end of the 3-char heap buffers;
// - the scratch buffer was leaked on every call;
// - 0 produced no digit at all ("###"); it now yields "0##".
char *character_pickde::exchange(int a)
{
	char reversed[24];//digits collected least-significant first; ample for any int
	int count=0;
	if(a==0)
		reversed[count++]='0';
	while(a>0)
	{
		reversed[count++]=char('0'+a%10);//decimal digits are contiguous in every C++ character set
		a/=10;
	}
	const int width=count<3?3:count;
	char *word=new char[width+1];
	int j=0;
	for(int i=count-1;i>=0;i--)//reverse into most-significant-first order
		word[j++]=reversed[i];
	while(j<width)
		word[j++]='#';
	word[width]='\0';
	return word;
}
//---------------------------------------------------------------------
//Build the file name "<a><digits>.txt".
//
//a       '\0'-terminated prefix (directory path).
//b       digit string as produced by exchange(): up to three characters are
//        used, stopping early at the first '#' padding character (or '\0').
//returns a new[]-allocated, '\0'-terminated string the caller must release
//        with delete[].
//
//Fixes over the previous version:
// - the buffer was one byte too small when b held three digits, so the
//   terminating '\0' was written past the end of the allocation;
// - the '#' search also scanned the prefix, so a '#' inside the directory
//   name truncated the path; only b is searched now;
// - strlen(a) was re-evaluated in every loop condition.
char *character_pickde::string_join(char *a,char *b)
{
	const size_t prefix_len=strlen(a);
	size_t digits=0;
	while(digits<3&&b[digits]!='#'&&b[digits]!='\0')
		digits++;
	char *stem=new char[prefix_len+digits+5];//+5 = ".txt" and its '\0'
	memcpy(stem,a,prefix_len);
	memcpy(stem+prefix_len,b,digits);
	memcpy(stem+prefix_len+digits,".txt",5);
	return stem;
}
//---------------------------------------------------------------------
void character_pickde::operate(int num_name)
{
	char *stemp=new char[3];
	char *cop=new char[2];
	for(int i=0;i<3;i++)
		stemp[i]=exchange(num_name)[i];
	char *filename1="E:\\中文web页面分类\\程序\\财经\\";
	char *filename2="E:\\中文web页面分类\\程序\\处理后的财经\\";
	ifstream infile(string_join(filename1,exchange(num_name)));
	ofstream outfile(string_join(filename2,exchange(num_name)));
	char e;
	int across=0,portrait=0;//横向和纵向坐标
	int check=0;
	int flag=0;
	while(!infile.eof())
	{
		infile.get(current_ch,2,char(-1));		
		e=current_ch[0];
		cop[1]=e;
		check=check_key(e);
		if(check!=-1)
			state=state_switch[state][check];
		if(befor_state==10)
			flag=1;
		if(flag==1&&judgeabc(e)==0&&judgedelchar(e)==0&&e!='\n')
		{
			outfile.write(current_ch,strlen(current_ch));
		}
		if(flag==1&&state==11)
		{
	        outfile.write("\n",strlen("\n"));
			flag=0;
		}
		befor_state=state;
		cop[0]=e;
	}
	infile.close();
	outfile.close();
}
//-----------------------------------------------------------------------
//Entry point: asks for the first file number and the number of files, then
//runs operate() on each file number in [start_n, start_n+file_number).
//Declared int (was void): standard C++ requires main to return int.
int main()
{
	int start_n=1,file_number=1;
	cout<<"请输入所要提取的开始文件和文件数:";
	cin>>start_n;
	cin>>file_number;
	character_pickde ch_pi('0',0);//initial character '0', initial state 0
	for(int i=start_n;i<start_n+file_number;i++)
	{
			ch_pi.operate(i);
	}
	cout<<"结束";
	return 0;
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -