⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.cpp

📁 从htm/html格式的网页文件中提取内容。将要提取内容的网页文件用鼠标拖入窗口
💻 CPP
📖 第 1 页 / 共 2 页
字号:


//--------------------------------------------------------------------------------
// FUNCTION: Reduce every tag that starts with `separator` to the bare tag,
//           i.e. drop everything between the tag name and its closing '>'.
// IN      : strResult - text to process.
//           separator - tag prefix to look for, e.g. "<TD".
// OUT     : strResult is rewritten in place.
// AUTHOR  : 2006-05-21 Created by navy .
//           2006-05-25 Modified by navy .
// CALL    : funDelete(strResult,"<TD"); 
// NOTE    : Eg: <TD style="FONT-SIZE: 24pt" width="100%"> becomes <TD>
// NOTE    : The closing '>' is the first '>' at which the number of '<' and
//           '>' seen since the start of the tag is equal, so a tag that
//           contains other tags (e.g. <TD <BR>>) is consumed as a whole.
// NOTE    : Processing stops at the first tag that is never balanced; that
//           tag and everything after it are left unchanged.
// NOTE    : Implemented as a single left-to-right pass. There is no recursion
//           and no per-tag copy of the remaining text, so stack use and run
//           time do not grow with the number of tags.
//--------------------------------------------------------------------------------
void Extract::funDelete(string& strResult,string separator)
{
	string strOut;
	strOut.reserve(strResult.length());
	string::size_type nCopied = 0;               // strResult[0,nCopied) is already emitted

	for (;;)
	{
		const string::size_type nOpen = strResult.find(separator,nCopied);
		if (nOpen == string::npos) break;        // no further tag of this kind

		// Locate the '>' that balances the tag. nDepth is (#'<' - #'>') from
		// nOpen up to and including the current character; it is only tested
		// on a '>' because only a '>' can end the tag.
		string::size_type nClose = string::npos;
		long nDepth = 0;
		for (string::size_type i = nOpen; i < strResult.length(); ++i)
		{
			const char ch = strResult[i];
			if (ch == '<')
			{
				++nDepth;
			}
			else if (ch == '>')
			{
				--nDepth;
				if (nDepth == 0)
				{
					nClose = i;
					break;
				}
			}
		}
		if (nClose == string::npos) break;       // unbalanced: keep the rest as it is

		strOut.append(strResult,nCopied,nOpen - nCopied);
		strOut += separator;
		strOut += '>';
		nCopied = nClose + 1;
	}

	if (nCopied == 0) return;                    // nothing was rewritten
	strOut.append(strResult,nCopied,string::npos);
	strResult.swap(strOut);
}

//--------------------------------------------------------------------------------
// FUNCTION: Count the occurrences of `symbol` in strResult whose starting
//           index lies in the closed range [i, j].
// IN      : strResult - text to search.
//           i         - index at which the search starts.
//           j         - last index at which an occurrence may start.
//           symbol    - text to count.
// OUT     : Number of occurrences; 0 when there is none or when j < i.
// AUTHOR  : 2006-05-25 Created by navy .
// CALL    : int n = funCountSymbol(strResult,4,19,"<");   gives n=4 for
//               <TD><A<B<BR>>BC</B>>DD</A></TD>
//               0123456789012345678901234567890  
//                         1         2         3
// NOTE    : Only the start of an occurrence has to be inside the range; the
//           search for the next occurrence resumes one character after the
//           previous hit.
//--------------------------------------------------------------------------------
int Extract::funCountSymbol(string strResult,int i,int j,string symbol)
{
	int nHits = 0;
	// find() reports "not found" as npos, which is compared with -1 once it
	// has been stored in an int, as everywhere else in this file.
	for (int nAt = strResult.find(symbol,i);
	     (nAt != -1) && (nAt <= j);
	     nAt = strResult.find(symbol,nAt+1))
	{
		++nHits;
	}
	return nHits;
}

//--------------------------------------------------------------------------------
// FUNCTION: Remove every <separator ...> ... </separator> region, including
//           the opening and closing tags themselves.
// IN      : strResult - text to process.
//           separator - tag name without brackets, e.g. "SCRIPT".
//           nPos      - index at which the search for the first region starts.
// OUT     : strResult is rewritten in place.
// AUTHOR  : 2006-05-22 Created by navy .
//           2006-05-29 Modified by navy .
// NOTE    : Eg: abc<SCRIPT language=JavaScript>...</SCRIPT>abc becomes abcabc
// NOTE    : Nested regions are removed as one unit:
//           abc<SCRIPT><SCRIPT>...</SCRIPT>...</SCRIPT>abc becomes abcabc
//           A region ends at the first closing tag at which the numbers of
//           opening and closing tags seen since the region start are equal.
// NOTE    : Only tags that START at or before the candidate closing tag are
//           counted. Counting up to the END of the candidate also picked up a
//           closing tag placed directly behind it, which left a stray closing
//           tag for <A><A></A></A> and never terminated for <A></A></A>.
// NOTE    : Processing stops at the first region that is never closed; it and
//           everything after it are left unchanged.
// NOTE    : The opening tag is matched as the plain text "<"+separator, so a
//           longer tag name with the same prefix matches as well.
//--------------------------------------------------------------------------------
void Extract::funDelete2(string& strResult,string separator,int nPos)
{
	const string strOpen  = "<" + separator;
	const string strClose = "</" + separator + ">";

	string strOut;
	string::size_type nCopied = 0;               // strResult[0,nCopied) is handled
	// A negative nPos converts to a huge index, for which find() reports
	// "not found", so the text is then left untouched.
	string::size_type nFrom = static_cast<string::size_type>(nPos);

	for (;;)
	{
		const string::size_type nOpen = strResult.find(strOpen,nFrom);
		if (nOpen == string::npos) break;

		// Walk the closing tags in order. For each one, bring the count of
		// opening tags that start at or before it up to date, then compare.
		string::size_type nNextOpen = nOpen;
		string::size_type nClose = strResult.find(strClose,nOpen);
		unsigned long nOpens = 0;
		unsigned long nCloses = 0;
		while (nClose != string::npos)
		{
			++nCloses;
			while ((nNextOpen != string::npos) && (nNextOpen <= nClose))
			{
				++nOpens;
				nNextOpen = strResult.find(strOpen,nNextOpen + 1);
			}
			if (nOpens == nCloses) break;        // region is balanced here
			nClose = strResult.find(strClose,nClose + 1);
		}
		if (nClose == string::npos) break;       // never closed: keep the rest

		// Keep the text in front of the region, skip the region itself.
		strOut.append(strResult,nCopied,nOpen - nCopied);
		nCopied = nClose + strClose.length();
		nFrom = nCopied;
	}

	if (nCopied == 0) return;                    // nothing was removed
	strOut.append(strResult,nCopied,string::npos);
	strResult.swap(strOut);
}


//--------------------------------------------------------------------------------
// FUNCTION: Remove every <...> tag from strResult. A tag whose first three
//           characters occur in the member strUseful leaves a marker behind:
//           one space for "<TD", the text "<BR" for any other such tag.
// IN      : strResult - text to process.
//           nPosOld   - not used by this function.
// OUT     : strResult is rewritten in place.
// AUTHOR  : 2006-05-22 Created by navy .
//           2006-05-26 Modified by navy .
// NOTE    : A tag runs from '<' to the first '>' at which the numbers of '<'
//           and '>' seen since that '<' are equal, so <TABLE <BR>> is one tag.
//           If no such '>' exists, the first '>' after the '<' is used.
// NOTE    : The text in front of the tag and the text behind it are both
//           trimmed of spaces on every pass, before the marker is appended.
// NOTE    : strUseful is declared outside this file section; the commented-out
//           line in the body suggests its contents - confirm in the class.
// NOTE    : After a tag is removed, the next '>' is searched from the next
//           '<'. Searching it from the end of the kept prefix could find a
//           stray '>' in front of that '<', and the text between the two was
//           then emitted twice.
//--------------------------------------------------------------------------------
void Extract::funKeepListLabel(string& strResult,int nPosOld)
{
	//string strUseful = "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>"; 
	string strTemp1,strTemp2;
	string sepa;
	int nPos,nPos1,nPos2;
	int n1,n2;

	(void)nPosOld;                               // kept for interface compatibility only

	nPos1 = strResult.find("<"); 
	nPos2 = strResult.find(">",nPos1); 

	while ((nPos1 != -1) && (nPos2 != -1)) {
		sepa=strResult.substr(nPos1,3);          // "<" plus up to two characters of the tag
		nPos=strUseful.find(sepa);               // -1 when the tag leaves no marker
		n1 = funCountSymbol(strResult,nPos1,nPos2,"<");
		n2 = funCountSymbol(strResult,nPos1,nPos2,">");
		while (n1 != n2)
		{
			// Unbalanced so far: extend the tag to the next '>'. When there is
			// none, nPos2 becomes -1, both counts become 0 and the loop ends.
			nPos2 = strResult.find(">",nPos2+1);
			n1 = funCountSymbol(strResult,nPos1,nPos2,"<");
			n2 = funCountSymbol(strResult,nPos1,nPos2,">");
		}
		if(nPos2 == -1)                  // e.g. <TITLE>... - webcool</</TITLE>
		{
			// Never balanced: fall back to the first '>' after the '<'.
			nPos2 = strResult.find(">",nPos1); 
		}
		strTemp1 = strResult.substr(0,nPos1); 
		strTemp2 = strResult.substr(nPos2+1); 
		funDelSideSpace(strTemp1);
		funDelSideSpace(strTemp2);
		if ( nPos != -1 ) 
		{
			if (sepa == "<TD") strTemp1 += " " ;       
			//else if (sepa == "<DI") strTemp1 += " " ;  // on div: line break
			else strTemp1 += "<BR" ; 
		}
		strResult = strTemp1+strTemp2;
		// Continue behind the kept prefix, so an inserted "<BR" marker is not
		// mistaken for a tag.
		nPos1=strResult.find("<",strTemp1.length()); 
		// The closing '>' must follow the '<' just found (nPos1 == -1 yields -1).
		nPos2=strResult.find(">",nPos1); 
	}	
}


//--------------------------------------------------------------------------------
// FUNCTION: Build the output file name: the input name with its extension
//           replaced by ".txt".
// IN      : filename    - name of the web page file (*.htm / *.html).
//           sResultFile - destination buffer; it must have room for
//                         strlen(filename)+5 bytes.
// OUT     : sResultFile holds the NUL-terminated result.
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : Eg: "a.htm" -> "a.txt", "a.html" -> "a.txt", "a.HTML" -> "a.txt"
// NOTE    : The extension is whatever follows the last '.' of the last path
//           component. A name without an extension gets ".txt" appended.
// NOTE    : Nothing is written behind the terminating NUL of the result, and
//           a short or empty name no longer causes an access in front of
//           either buffer.
// NOTE    : A null sResultFile is ignored; a null filename yields "".
//--------------------------------------------------------------------------------
void Extract::funDestFilename(const char* filename,char* sResultFile) 
{
	if (sResultFile == NULL) return;
	if (filename == NULL)
	{
		sResultFile[0] = '\0';
		return;
	}

	// Last '.' of the name. If a path separator follows it, the dot belongs
	// to a directory name and the file itself has no extension.
	const char* pDot = strrchr(filename,'.');
	if ((pDot != NULL) &&
	    ((strchr(pDot,'/') != NULL) || (strchr(pDot,'\\') != NULL)))
	{
		pDot = NULL;
	}

	const size_t nStem = (pDot != NULL) ? static_cast<size_t>(pDot - filename)
	                                    : strlen(filename);

	// memmove rather than strcpy, so passing the same buffer for both
	// arguments is harmless.
	memmove(sResultFile,filename,nStem);
	memcpy(sResultFile + nStem,".txt",5);        // 4 characters plus the NUL
}

//--------------------------------------------------------------------------------
// FUNCTION: Replace the text strOld by strNew throughout strLine.
// IN      : strLine - one line of text.
//           strOld  - text to look for; an empty strOld leaves strLine as is.
//           strNew  - text to put in its place.
// OUT     : strLine is rewritten in place.
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : A match that only comes into existence through a replacement is
//           replaced as well, e.g. replacing "  " by " " turns any run of
//           spaces into a single space.
// NOTE    : When strNew itself contains strOld, that rule would never finish.
//           In that case every match of the original text is replaced exactly
//           once and the inserted text is not searched again.
// NOTE    : An empty strOld matches at every position and would never finish
//           either, so it is treated as "nothing to do".
//--------------------------------------------------------------------------------
void Extract::funReplace(string& strLine,string strOld,string strNew)
{
	const string::size_type nOldLen = strOld.length();
	if (nOldLen == 0) return;

	const bool bNewHoldsOld = (strNew.find(strOld) != string::npos);

	string::size_type nFrom = 0;
	string::size_type nPos;
	while ((nPos = strLine.find(strOld,nFrom)) != string::npos)
	{
		strLine.replace(nPos,nOldLen,strNew);
		if (bNewHoldsOld)
		{
			// Step over the inserted text, otherwise it would match again.
			nFrom = nPos + strNew.length();
		}
		else
		{
			// No match started in front of nPos, so a new match has to
			// overlap the replaced spot. It can start at most nOldLen-1
			// characters earlier, which gives the same result as searching
			// from the start again.
			nFrom = (nPos + 1 > nOldLen) ? (nPos + 1 - nOldLen) : 0;
		}
	}
}

//--------------------------------------------------------------------------------
// FUNCTION: Replace the text strOld by strNew throughout strLine.
//           Overload for wide-character strings.
// IN      : strLine - one line of text.
//           strOld  - text to look for; an empty strOld leaves strLine as is.
//           strNew  - text to put in its place.
// OUT     : strLine is rewritten in place.
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : A match that only comes into existence through a replacement is
//           replaced as well, e.g. replacing L"  " by L" " turns any run of
//           spaces into a single space.
// NOTE    : When strNew itself contains strOld, that rule would never finish.
//           In that case every match of the original text is replaced exactly
//           once and the inserted text is not searched again.
// NOTE    : An empty strOld matches at every position and would never finish
//           either, so it is treated as "nothing to do".
//--------------------------------------------------------------------------------
void Extract::funReplace(wstring& strLine,wstring strOld,wstring strNew)
{
	const wstring::size_type nOldLen = strOld.length();
	if (nOldLen == 0) return;

	const bool bNewHoldsOld = (strNew.find(strOld) != wstring::npos);

	wstring::size_type nFrom = 0;
	wstring::size_type nPos;
	while ((nPos = strLine.find(strOld,nFrom)) != wstring::npos)
	{
		strLine.replace(nPos,nOldLen,strNew);
		if (bNewHoldsOld)
		{
			// Step over the inserted text, otherwise it would match again.
			nFrom = nPos + strNew.length();
		}
		else
		{
			// No match started in front of nPos, so a new match has to
			// overlap the replaced spot. It can start at most nOldLen-1
			// characters earlier, which gives the same result as searching
			// from the start again.
			nFrom = (nPos + 1 > nOldLen) ? (nPos + 1 - nOldLen) : 0;
		}
	}
}

//--------------------------------------------------------------------------------
// FUNCTION: Strip the space characters from both ends of strLine.
// IN      : strLine - text to trim.
// OUT     : strLine is rewritten in place.
// AUTHOR  : 2006-05-22 Created by navy .
// NOTE    : Only ' ' is stripped; other characters at the ends are kept.
//           A line made up of spaces only becomes empty.
//--------------------------------------------------------------------------------
void Extract::funDelSideSpace(string& strLine)
{
	const string::size_type nHead = strLine.find_first_not_of(' ');
	if (nHead == string::npos)
	{
		// Empty, or nothing but spaces.
		strLine.erase();
		return;
	}

	// Cut the tail first, so nHead still refers to the same character.
	strLine.erase(strLine.find_last_not_of(' ') + 1);
	strLine.erase(0,nHead);
}

//--------------------------------------------------------------------------------
// FUNCTION: Replace the known "&name;" sequences by their plain-text form.
// IN      : strResult - text to process.
//           nPos      - index at which scanning starts.
// OUT     : strResult is rewritten in place.
// AUTHOR  : 2006-05-24 Created by navy .
//           2006-05-29 Modified by navy .
// NOTE    : 1."&nbsp;"-->" "   2."&lt;"-->"<"      3."&gt;"-->">"
//           4."&amp;"-->"&"    5."&quot;"-->"\""   6."&copy;"-->"(C)"
//           7."&reg;"-->"?"    8."&trade;"-->"TM"  9."&#8226;"-->"·"
// NOTE    : Any other "&...;" sequence is left as it is.
//           Eg: a&nbsp;&&nbsp;;;&nbsp; becomes "a & ;; "
// NOTE    : Scanning continues BEHIND the inserted text, so the result of one
//           replacement is never decoded a second time ("&amp;lt;" gives
//           "&lt;", not "<").
// NOTE    : Scanning ends as soon as no ';' follows the current '&'. An
//           unterminated name at the end of the text (e.g. "...&nbsp") is
//           left alone; treating it as a match made the text grow without end.
//--------------------------------------------------------------------------------
void Extract::funDelBegAnd(string& strResult,int nPos) // scanning starts at index nPos
{
	// Name between '&' and ';'  ->  replacement text.
	static const char* const kEntities[][2] =
	{
		{ "nbsp",  " "   },
		{ "lt",    "<"   },
		{ "gt",    ">"   },
		{ "amp",   "&"   },
		{ "quot",  "\""  },
		{ "copy",  "(C)" },
		{ "reg",   "?"   },
		{ "trade", "TM"  },
		{ "#8226", "·"   }
	};
	const unsigned int nEntityCount = sizeof(kEntities) / sizeof(kEntities[0]);

	// A negative nPos converts to a huge index, for which find() reports
	// "not found", so the text is then left untouched.
	string::size_type nAmp = strResult.find("&",static_cast<string::size_type>(nPos));
	while (nAmp != string::npos)
	{
		const string::size_type nSemi = strResult.find(";",nAmp);
		if (nSemi == string::npos) break;        // no terminator left at all

		const string strName = strResult.substr(nAmp + 1,nSemi - nAmp - 1);
		const char* pszText = 0;
		for (unsigned int i = 0; i < nEntityCount; ++i)
		{
			if (strName == kEntities[i][0])
			{
				pszText = kEntities[i][1];
				break;
			}
		}

		if (pszText == 0)
		{
			// Not a known name: try again from the next '&'.
			nAmp = strResult.find("&",nAmp + 1);
			continue;
		}

		const string strText(pszText);
		strResult.replace(nAmp,nSemi - nAmp + 1,strText);
		nAmp = strResult.find("&",nAmp + strText.length());
	}
}

//--------------------------------------------------------------------------------
// FUNCTION: Delete table rows that hold nothing but links, i.e. rows with no
//           content between <TR> and </TR> apart from <A>...</A> regions.
// IN      : strResult - text to process.
// OUT     : strResult is rewritten in place.
// AUTHOR  : 2006-05-27 Created by navy .
// NOTE    : Eg: a<TR><TD><A>bc</A></TD></TR><TR><TD>d</TD></TR>e is intended
//           to become a<TR><TD>d</TD></TR>e
// NOTE    : A row is deleted when a copy of it is empty after these steps:
//           funDelete2(copy,"A",0), funKeepListLabel(copy,0), removal of all
//           "<BR" and removal of all spaces. The row itself is never edited;
//           it is either kept unchanged or deleted as a whole.
// NOTE    : A row runs from "<TR" to the first "</TR>" BEHIND it. Taking the
//           first "</TR>" of the whole text gave a negative row length when a
//           "</TR>" stood in front of the first "<TR".
//--------------------------------------------------------------------------------
void Extract::funDelUselessLink(string& strResult) 
{
	const string::size_type nCloseLen = 5;       // length of "</TR>"

	string::size_type nRowBeg = strResult.find("<TR");
	while (nRowBeg != string::npos)
	{
		const string::size_type nRowEnd = strResult.find("</TR>",nRowBeg);
		if (nRowEnd == string::npos) break;      // unterminated row: stop
		const string::size_type nRowLen = nRowEnd - nRowBeg + nCloseLen;

		// Work on a copy: it only serves to decide whether the row is empty.
		string strRow = strResult.substr(nRowBeg,nRowLen);
		funDelete2(strRow,"A",0);                // drop <A>...</A> regions
		funKeepListLabel(strRow,0);
		funReplace(strRow,"<BR","");
		funReplace(strRow," ","");

		if (strRow.empty())
		{
			// Nothing but links: delete the row. The next row, if any, now
			// starts at or behind the same index.
			strResult.erase(nRowBeg,nRowLen);
			nRowBeg = strResult.find("<TR",nRowBeg);
		}
		else
		{
			nRowBeg = strResult.find("<TR",nRowEnd + nCloseLen);
		}
	}
}

// Destructor. The body is empty: no explicit cleanup is performed here.
Extract::~Extract()
{

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -