⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.cpp

📁 从htm/html格式的网页文件中提取内容。将要提取内容的网页文件用鼠标拖入窗口
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//================================================================================
// CLASS   : Extract
// FUNCTION: 提取网页内容,去除html标签。
// AUTHOR  : 2006-05-21 Created by navy
// REFER   : 输入文件名(*.htm/html)
// NOTE    : Extract.cpp , implementation file
//================================================================================
#include "StdAfx.h"
#include "Extract.h"
#include "windows.h"

// Default constructor: seeds the strUseful member from the STR_USEFUL setting.
// Per the original author's note, STR_USEFUL is expected to be the tag list
// "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>" (it is defined outside this file).
Extract::Extract()
{
	this->strUseful = STR_USEFUL;
}

//--------------------------------------------------------------------------------
// FUNCTION: Process a single file.
// IN      : strFileName - name of the file to process.
// OUT     : a .txt file (per the original author's note; the work is done by
//           funProcess).
// AUTHOR  : 2006-05-24 Created by navy .
// NOTE    : Thin wrapper: the name is forwarded unchanged to funProcess, which
//           prefixes it with the strDirSource member when opening the file.
//--------------------------------------------------------------------------------
void Extract::funProcessFile(string strFileName)
{
	this->funProcess(strFileName);
}

//--------------------------------------------------------------------------------
// FUNCTION: Process every file in a directory whose extension is strExteName.
// IN      : strDirSour  - source directory (a backslash is appended here)
//           strDirDest  - destination directory; when empty, the source
//                         directory is used instead
//           strExteName - extension to match, without the leading dot
// OUT     : txt files (per the original author's note; written via funProcess).
// AUTHOR  : 2006-05-24 Created by navy .
// NOTE    : Sets the members strDirSource and strDirDestination.
//           Files of 3 MB or more are reported on the console and skipped.
//--------------------------------------------------------------------------------
void Extract::funProcessDirectory(string strDirSour,string strDirDest,string strExteName)
{
	// Files at or above this size are not processed.
	const unsigned long kMaxFileSize = 3UL * 1024UL * 1024UL;

	if (strDirDest.empty())
	{
		strDirDest = strDirSour;
	}
	strDirSource = strDirSour + "\\";
	strDirDestination = strDirDest + "\\";

	_finddata_t fdata;
	// The search handle must be kept as intptr_t: storing it in a `long`
	// truncates it on 64-bit Windows and breaks _findnext/_findclose.
	// strDirSource already ends with a separator, so the pattern contains a
	// doubled backslash; this is kept exactly as in the original.
	const intptr_t hFind = _findfirst((strDirSource + "\\*." + strExteName).c_str(), &fdata);
	if (hFind == -1)
	{
		// No matching files (or the search failed): nothing to do.
		return;
	}

	int nCount = 0;   // number of files actually handed to funProcess
	do
	{
		if (fdata.size < kMaxFileSize)
		{
			++nCount;
			cout << nCount << " : " << "processing " << fdata.name << "    ";
			funProcess(fdata.name);
		}
		else
		{
			cout << "********************************************************" << endl;
			cout << "* size over 3M !" << endl;
			cout << "* file name : " << fdata.name << "  did not process .  "  << endl;
			cout << "********************************************************" << endl;
		}
	} while (_findnext(hFind, &fdata) == 0);

	_findclose(hFind);
}

//--------------------------------------------------------------------------------
// FUNCTION: Open a web page file, strip its tags and extract the content.
// IN      : filename - name of the page file (*.htm/html); it is opened as
//           strDirSource + filename.
// OUT     : txt file (written by funOutput under the name produced by
//           funDestFilename).
// AUTHOR  : 2006-05-22 Created by navy .
// NOTE    : Progress dots are printed to the console while working.
//           The helper functions called here are defined elsewhere; the
//           per-step descriptions below are translated from the original
//           author's comments.
//--------------------------------------------------------------------------------
void Extract::funProcess(string filename)
{
	ifstream fin((strDirSource + filename).c_str());
	string strLine,strResult;
	int i = 1 , j = 1 ;   // i: line counter for progress output, j: count of 10000-line batches

	// Buffer that receives the destination file name. It is owned by a string,
	// so it is released on every return path (the original new char[100] was
	// never freed), and it grows with the input name instead of being fixed at
	// 100 bytes.
	// NOTE(review): assumes funDestFilename writes at most about
	// filename.size() plus a short suffix - confirm against its definition.
	string strResultName(filename.size() + 100, '\0');
	char *sResultFile = &strResultName[0];

	funDestFilename(filename.c_str(),sResultFile);
	cout << "." ;                      // progress dot

	if (!fin)
	{
		cout<<"Can't open the file !"<<endl ;
		return ;
	}

	while ( getline(fin,strLine) )
	{
		if (i%3000 == 0) cout << "." ;
		if (i++ == 10000)
		{
			cout <<" " << j++ << "W";
			i = 1 ;
		}
		funConvCapital(strLine);
		//funPreProcess(strLine);
		// Strip trailing carriage returns (\r) and line feeds (\n).
		// The emptiness check is essential: blank lines are common, and
		// indexing size()-1 on an empty string reads out of bounds.
		while (!strLine.empty() &&
		       ((strLine[strLine.size()-1] == '\r') || (strLine[strLine.size()-1] == '\n')))
		{
			strLine.erase(strLine.size()-1);
		}
		// Lines are joined without any separator.
		strResult+=strLine;
	}

	cout << "." ;                      // progress dot
	funDelete2(strResult,"SCRIPT",0);  // remove the content between <SCRIPT> and </SCRIPT>
	funDelete2(strResult,"STYLE",0);   // remove the content between <STYLE> and </STYLE>
	funDelete2(strResult,"XML",0);     // remove the content between <XML> and </XML>

	if ( STR_A ) funDelete2(strResult,"A",0);
	funDelete(strResult,"<A");         // turn <A ...> into <A>
	funDelete(strResult,"<P");         // turn <P ...> into <P>
	funDelete(strResult,"<TD");

	if ( STR_REPLACE_VERTICAL ) funReplace(strResult,"|"," ");
	if ( STR_SPACE )   funReplace(strResult," ",""); // optionally delete all spaces

	funDelUselessLink(strResult);      // delete useless links

	cout << "." ;                      // progress dot
	funKeepListLabel(strResult,0);     // 0 = start searching at the first character
	cout << "." ;                      // progress dot
	funReplace(strResult,"\t"," ");    // replace TAB with a space
	funReplace(strResult,"  "," ");    // replace two spaces with one
	cout << "." ;                      // progress dot
	funResult(strResult);
	cout << "." ;                      // progress dot
	funOutput(sResultFile);

	fin.close();
}


//--------------------------------------------------------------------------------
// FUNCTION: Split the processed text into pieces and store them in a vector.
// IN      : strResult - the processed page text
// OUT     : vecResult - cleared, then filled with the pieces found between
//           occurrences of "<BR"
// AUTHOR  : 2006-05-22 Created by navy .
// NOTE    : Pieces before a "<BR" are trimmed with funDelSideSpace and dropped
//           when empty. The text after the last "<BR" is stored untrimmed when
//           it is non-empty, exactly as the original implementation did.
//--------------------------------------------------------------------------------
void Extract::funResult(string strResult)
{
	const string strBreak = "<BR";

	// Per the original note: replaces strings that start with '&' and end with ';'.
	funDelBegAnd(strResult,0);
	vecResult.clear();

	// Walk the text with an offset rather than re-copying the remaining tail
	// after every marker; positions are kept as size_type and compared with
	// npos instead of being squeezed into an int.
	string::size_type nStart = 0;
	string::size_type nPos = strResult.find(strBreak, nStart);
	while (nPos != string::npos)
	{
		string strTmp = strResult.substr(nStart, nPos - nStart);
		funDelSideSpace(strTmp);       // strip leading/trailing spaces
		if ( !strTmp.empty() )
		{
			vecResult.push_back(strTmp);
		}
		nStart = nPos + strBreak.size();
		nPos = strResult.find(strBreak, nStart);
	}

	// Whatever follows the last marker.
	if (nStart < strResult.size())
	{
		vecResult.push_back(strResult.substr(nStart));
	}
}

//--------------------------------------------------------------------------------
// FUNCTION: Write vecResult to the output text file.
// IN      : filename - output file name; the file is created as
//           strDirDestination + filename
// OUT     : the result is saved in that file, one entry per line.
// AUTHOR  : 2006-05-22 Created by navy .
// NOTE    : An entry is skipped when it equals the immediately preceding entry
//           (after trimming) or when it consists only of the "·" bullet.
//           If the file cannot be opened an error is printed and nothing is
//           written.
//--------------------------------------------------------------------------------
void Extract::funOutput(char* filename)
{
	ofstream fout((strDirDestination + filename).c_str());
	if (!fout)
	{
		// Previously a failed open was ignored and "Finish !" was still reported.
		cout << "Can't open the file !" << endl;
		return;
	}

	string strTmp,strTmpOld;
	const int n = static_cast<int>(vecResult.size());

	for (int i = 0; i < n; i++)
	{
		strTmpOld = strTmp ;         // previous entry, kept for duplicate detection
		strTmp = vecResult[i] ;
		funDelSideSpace(strTmp);     // strip leading/trailing spaces
		if ((i > 0) && (strTmp == strTmpOld)) continue;
		if ( strTmp == "·" ) continue;
		// '\n' instead of endl: same file content without flushing on every line.
		fout << strTmp << '\n';
	}

	cout << "  Finish !" << endl;

	fout.close();
}

//--------------------------------------------------------------------------------
// FUNCTION: Convert selected lower-case tag prefixes / keywords to upper case.
// IN      : strLine - one line of the page; modified in place.
// OUT     : (none; the result is left in strLine)
// AUTHOR  : 2006-05-27 Created by navy .
// NOTE    : Each group is enabled by its STR_*_CAPITAL setting (defined outside
//           this file). The replacements are applied in the order listed.
//           funReplace is defined elsewhere - presumably it replaces every
//           occurrence of the second argument with the third.
//--------------------------------------------------------------------------------
void Extract::funConvCapital(string& strLine)
{
	// "<ti" / "</ti" (prefix of tags such as <title>)
	if (STR_TI_CAPITAL)  { funReplace(strLine,"<ti","<TI");  funReplace(strLine,"</ti","</TI"); }

	// "<tr" / "</tr"
	if (STR_TR_CAPITAL)  { funReplace(strLine,"<tr","<TR");  funReplace(strLine,"</tr","</TR"); }

	// "<td" / "</td"
	if (STR_TD_CAPITAL)  { funReplace(strLine,"<td","<TD");  funReplace(strLine,"</td","</TD"); }

	// "<p" - opening form only
	if (STR_P_CAPITAL)   { funReplace(strLine,"<p","<P"); }

	// "<br" - opening form only
	if (STR_BR_CAPITAL)  { funReplace(strLine,"<br","<BR"); }

	// "<di" / "</di" (prefix of tags such as <div>)
	if (STR_DI_CAPITAL)  { funReplace(strLine,"<di","<DI");  funReplace(strLine,"</di","</DI"); }

	// the bare word "xml", not anchored to '<'
	if (STR_XML_CAPITAL) { funReplace(strLine,"xml","XML"); }

	// "<a" and the complete closing tag "</a>"
	if (STR_A_CAPITAL)   { funReplace(strLine,"<a","<A");    funReplace(strLine,"</a>","</A>"); }

	// optionally remove spaces from the line
	if (STR_SPACE)       { funReplace(strLine," ",""); }

	// Always applied, not anchored to '<'.
	funReplace(strLine,"script","SCRIPT");
	funReplace(strLine,"style","STYLE");
}


//--------------------------------------------------------------------------------
// FUNCTION: Pre-process strLine: replace full-width spaces with ordinary spaces.
// IN      : strLine - one line, encoded in the system ANSI code page (CP_ACP);
//           modified in place.
// OUT     : (none; the result is left in strLine)
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : The line is converted to wide characters, the replacement is done
//           with the wide-string funReplace (defined elsewhere), and the text
//           is converted back. Buffers are sized from the lengths reported by
//           the conversion functions. The earlier fixed 1024/2048-element
//           arrays emptied any line that did not fit and could be overrun by
//           one element when the text filled them exactly.
//           If a conversion fails, strLine is left unchanged.
//--------------------------------------------------------------------------------
void Extract::funPreProcess(string& strLine)
{
	if (strLine.empty())
	{
		return;   // nothing to convert
	}

	// First call with a zero-sized output asks for the required length.
	const int nSrcLen  = static_cast<int>(strLine.size());
	const int nWideLen = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, NULL, 0);
	if (nWideLen <= 0)
	{
		return;
	}

	wstring wss(static_cast<wstring::size_type>(nWideLen), L'\0');
	if (MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, &wss[0], nWideLen) <= 0)
	{
		return;
	}

	// U+3000 is the full-width (ideographic) space; it is written as an escape
	// so the literal does not depend on the encoding of this source file.
	funReplace(wss,L"\x3000",L" ");

	if (wss.empty())
	{
		strLine = string();
		return;
	}

	const int nWide  = static_cast<int>(wss.size());
	const int nBytes = WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWide, NULL, 0, NULL, NULL);
	if (nBytes <= 0)
	{
		return;
	}

	string strOut(static_cast<string::size_type>(nBytes), '\0');
	if (WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWide, &strOut[0], nBytes, NULL, NULL) <= 0)
	{
		return;
	}
	strLine = strOut;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -