📄 extract.cpp
字号:
#include "StdAfx.h"
#include "Extract.h"
#include "windows.h"
#include ".\extract.h"
// Constructs the extractor and loads strUseful, the comma-separated list of
// tags that funKeepListLabel replaces with a separator instead of simply
// deleting.
Extract::Extract()
{
strUseful = STR_USEFUL ; // per the original note: "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>" (STR_USEFUL is defined outside this file)
}
//--------------------------------------------------------------------------------
// FUNCTION: Runs the whole extraction pipeline on one HTML document: normalises
//           tag case, removes SCRIPT/STYLE/XML (and optionally A) elements,
//           strips attributes and tags, decodes entities and flattens the text.
// IN      : filestring - complete text of a web page (*.htm/*.html).
// OUT     : filestring - overwritten with the extracted text.
// NOTE    : The steps are order dependent (e.g. tags are upper-cased before the
//           upper-case tag names are searched for).
//--------------------------------------------------------------------------------
void Extract::funProcess(string &filestring)
{
    // Work on a copy so filestring is only replaced once the pipeline is done.
    string strResult = filestring;

    funConvCapital(strResult);
    funDelete2(strResult,"SCRIPT",0);               // drop <SCRIPT>...</SCRIPT>
    funDelete2(strResult,"STYLE",0);                // drop <STYLE>...</STYLE>
    funDelete2(strResult,"XML",0);                  // drop <XML>...</XML>
    if ( STR_A ) funDelete2(strResult,"A",0);
    funDelete(strResult,"<A");                      // <A ...> becomes <A>
    funDelete(strResult,"<P");                      // <P ...> becomes <P>
    funDelete(strResult,"<TD");
    if ( STR_REPLACE_VERTICAL ) funReplace(strResult,"|"," ");
    // if ( STR_SPACE ) funReplace(strResult," ",""); // optional: delete every space (disabled)
    funDelUselessLink(strResult);                   // drop table rows that only hold links
    funKeepListLabel(strResult,0);                  // 0 = start from the first character

    // Whitespace normalisation. Both patterns are spelled with explicit
    // escapes/characters: if a pattern were equal to its replacement the
    // "replace until gone" loop in funReplace could never finish.
    funReplace(strResult,"\t"," ");                 // TAB -> single space
    funReplace(strResult,"  "," ");                 // runs of spaces -> one space (relies on funReplace rescanning)

    funDelBegAnd(strResult,0);                      // decode "&name;" entities

    // funReplaceReturnAndEnter hands back a buffer allocated with new[]; the
    // guard releases it on every path, including a throwing string assignment.
    struct FlatText
    {
        char* chars;
        explicit FlatText(char* p) : chars(p) {}
        ~FlatText() { delete[] chars; }
    };
    const FlatText flat(funReplaceReturnAndEnter(strResult.c_str()));

    if (flat.chars != NULL)
    {
        filestring = flat.chars;
    }
    else
    {
        // Assigning a null pointer to a std::string is undefined behaviour.
        filestring.clear();
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Upper-cases the tag names the rest of the pipeline searches for.
// IN      : strLine - text to normalise.
// OUT     : strLine - modified in place.
// NOTE    : Each rule is a plain text substitution applied in the order listed,
//           switched on or off by its STR_* setting. "xml", "script" and "style"
//           carry no '<', so they are replaced wherever they occur in the text.
//           The STR_SPACE rule deletes every space.
//--------------------------------------------------------------------------------
void Extract::funConvCapital(string& strLine)
{
    struct CaseRule
    {
        bool        enabled;
        const char* from;
        const char* to;
    };

    const CaseRule rules[] =
    {
        { (STR_TI_CAPITAL)  ? true : false, "<ti",    "<TI"    },
        { (STR_TI_CAPITAL)  ? true : false, "</ti",   "</TI"   },
        { (STR_TR_CAPITAL)  ? true : false, "<tr",    "<TR"    },
        { (STR_TR_CAPITAL)  ? true : false, "</tr",   "</TR"   },
        { (STR_TD_CAPITAL)  ? true : false, "<td",    "<TD"    },
        { (STR_TD_CAPITAL)  ? true : false, "</td",   "</TD"   },
        { (STR_P_CAPITAL)   ? true : false, "<p",     "<P"     },
        { (STR_BR_CAPITAL)  ? true : false, "<br",    "<BR"    },
        { (STR_DI_CAPITAL)  ? true : false, "<di",    "<DI"    },
        { (STR_DI_CAPITAL)  ? true : false, "</di",   "</DI"   },
        { (STR_XML_CAPITAL) ? true : false, "xml",    "XML"    },
        { (STR_A_CAPITAL)   ? true : false, "<a",     "<A"     },
        { (STR_A_CAPITAL)   ? true : false, "</a>",   "</A>"   },
        { (STR_SPACE)       ? true : false, " ",      ""       },
        { true,                             "script", "SCRIPT" },  // always applied
        { true,                             "style",  "STYLE"  }   // always applied
    };

    const unsigned int ruleCount = sizeof(rules) / sizeof(rules[0]);
    for (unsigned int idx = 0; idx < ruleCount; ++idx)
    {
        if (rules[idx].enabled)
        {
            funReplace(strLine, rules[idx].from, rules[idx].to);
        }
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Pre-processes one line: replaces every full-width (ideographic,
//           U+3000) space with an ordinary ASCII space.
// IN      : strLine - text in the system ANSI code page (CP_ACP).
// OUT     : strLine - converted in place; left unchanged if a code-page
//           conversion fails.
// NOTE    : The text is converted to UTF-16 so the two-byte space can be matched
//           as one character, then converted back. Buffers are sized from the
//           lengths the Win32 API reports, so lines of any length are handled
//           (fixed 1024/2048 element buffers overflow or fail on long input).
//--------------------------------------------------------------------------------
void Extract::funPreProcess(string& strLine)
{
    if (strLine.empty()) return;

    const int nSrcBytes = static_cast<int>(strLine.size());

    // First call: ask how many UTF-16 units the text needs.
    const int nWide = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcBytes, NULL, 0);
    if (nWide <= 0) return;

    wstring wide(static_cast<wstring::size_type>(nWide), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcBytes, &wide[0], nWide) != nWide) return;

    // Written as an escape so the pattern cannot be silently normalised into an
    // ASCII space (pattern == replacement would make the replace loop endless).
    funReplace(wide, L"\x3000", L" ");

    const int nWideLen = static_cast<int>(wide.size());

    // Same two-step pattern for the way back to the ANSI code page.
    const int nBytes = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), nWideLen, NULL, 0, NULL, NULL);
    if (nBytes <= 0) return;

    string narrow(static_cast<string::size_type>(nBytes), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), nWideLen, &narrow[0], nBytes, NULL, NULL) != nBytes) return;

    strLine.swap(narrow);
}
//--------------------------------------------------------------------------------
// FUNCTION: Strips the attributes from every tag that starts with `separator`.
// IN      : strResult - text to process; separator - tag start, e.g. "<TD".
// OUT     : strResult - modified in place.
// CALL    : funDelete(strResult,"<TD");
// NOTE    : Eg: <TD style="FONT-SIZE: 24pt" width="100%">  becomes  <TD>
//           The tag ends at the first '>' at which the number of '<' and '>'
//           seen since the tag start is equal, so brackets nested inside the
//           tag are skipped. Processing stops at the first tag that has no such
//           closing '>'; the rest of the text is left as it is.
//           Implemented as one forward pass (no recursion per tag, no
//           whole-document copies per step).
// NOTE(review): the match is a plain prefix match, so "<P" also matches tags
//           such as "<PRE" - confirm that this is intended.
//--------------------------------------------------------------------------------
void Extract::funDelete(string& strResult,string separator)
{
    string rebuilt;
    string::size_type copiedTo = 0;      // everything before this index is already in `rebuilt`
    bool changed = false;

    for (;;)
    {
        const string::size_type tagStart = strResult.find(separator, copiedTo);
        if (tagStart == string::npos) break;

        // Walk forward keeping the running balance of '<' minus '>'; the tag
        // closes at the first '>' that brings the balance back to zero.
        string::size_type tagEnd = string::npos;
        int balance = 0;
        for (string::size_type at = tagStart; at < strResult.length(); ++at)
        {
            const char ch = strResult[at];
            if (ch == '<')
            {
                ++balance;
            }
            else if (ch == '>')
            {
                --balance;
                if (balance == 0)
                {
                    tagEnd = at;
                    break;
                }
            }
        }
        if (tagEnd == string::npos) break;   // unbalanced tag: keep the remainder untouched

        rebuilt.append(strResult, copiedTo, tagStart - copiedTo);
        rebuilt += separator;
        rebuilt += '>';
        copiedTo = tagEnd + 1;
        changed = true;
    }

    if (!changed) return;

    rebuilt.append(strResult, copiedTo, string::npos);
    strResult.swap(rebuilt);
}
//--------------------------------------------------------------------------------
// FUNCTION: Counts the occurrences of `symbol` that start at an index in the
//           inclusive range [i, j] of strResult.
// IN      : strResult - text to scan; i, j - first and last start index;
//           symbol - text to look for.
// OUT     : number of occurrences (0 when j is -1 or nothing is found).
// CALL    : int n = funCountSymbol(strResult,4,19,"<");   gives n = 4 for
//           <TD><A<B<BR>>BC</B>>DD</A></TD>
//           0123456789012345678901234567890
//                     1         2         3
// NOTE    : Overlapping occurrences are counted, because the search resumes one
//           character after the previous hit.
//--------------------------------------------------------------------------------
int Extract::funCountSymbol(string strResult,int i,int j,string symbol)
{
    int hits = 0;
    for (int at = strResult.find(symbol, i);
         (at != -1) && (at <= j);
         at = strResult.find(symbol, at + 1))
    {
        ++hits;
    }
    return hits;
}
//--------------------------------------------------------------------------------
// FUNCTION: Removes every element <separator ...> ... </separator>, including
//           its content.
// IN      : strResult - text to process; separator - tag name, e.g. "SCRIPT";
//           nPos - index at which the search for the first element starts.
// OUT     : strResult - modified in place.
// NOTE    : Eg: abc<SCRIPT language=JavaScript>...</SCRIPT>abc  becomes  abcabc
//           Nested elements are removed as a whole:
//               abc<SCRIPT><SCRIPT>...</SCRIPT>...</SCRIPT>abc  becomes  abcabc
//           An element ends at the first closing tag at which as many opening
//           as closing tags have been seen since the element start. Only
//           closing tags up to and including the candidate are counted, so
//           directly adjacent closing tags ("</A></A>") are matched correctly.
//           Processing stops at the first element without a matching closing
//           tag; the remainder is left as it is.
// NOTE(review): the opening tag is matched by prefix ("<A" also matches tags
//           such as "<ABBR") - confirm that this is intended.
//--------------------------------------------------------------------------------
void Extract::funDelete2(string& strResult,string separator,int nPos)
{
    const string opener = "<" + separator;
    const string closer = "</" + separator + ">";

    string kept;
    string::size_type copiedTo = 0;   // everything before this index is already in `kept`
    // A negative nPos converts to npos, so nothing is found (same as before).
    string::size_type searchFrom = static_cast<string::size_type>(nPos);
    bool changed = false;

    for (;;)
    {
        const string::size_type start = strResult.find(opener, searchFrom);
        if (start == string::npos) break;

        // Visit the closing tags in order; before judging each one, count the
        // opening tags that start at or before it.
        string::size_type opensSeen = 0;
        string::size_type closesSeen = 0;
        string::size_type nextOpen = start;
        string::size_type end = strResult.find(closer, start);
        while (end != string::npos)
        {
            ++closesSeen;
            while ((nextOpen != string::npos) && (nextOpen <= end))
            {
                ++opensSeen;
                nextOpen = strResult.find(opener, nextOpen + 1);
            }
            if (opensSeen == closesSeen) break;      // balanced: this closer ends the element
            end = strResult.find(closer, end + 1);
        }
        if (end == string::npos) break;              // unbalanced: keep the remainder untouched

        kept.append(strResult, copiedTo, start - copiedTo);
        copiedTo = end + closer.length();
        searchFrom = copiedTo;
        changed = true;
    }

    if (!changed) return;

    kept.append(strResult, copiedTo, string::npos);
    strResult.swap(kept);
}
//--------------------------------------------------------------------------------
// FUNCTION: 保留指定标签之间的字符。
// IN : strResult
// OUT : strResult
// NOTE : Eg:<HTML><TD><FONT>a<A>b</A>c</FONT></TD></HTML> 变为:<TD>a<A>bc
// NOTE : Eg:<TABLE <BR>><TD>a</TD></TABLE> 变为:_a (_表示空格)
//--------------------------------------------------------------------------------
void Extract::funKeepListLabel(string& strResult,int nPosOld)
{
//string strUseful = "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>";
string strTemp1,strTemp2;
string sepa;
int nPos,nPos1,nPos2;
int n1,n2;
nPos1 = strResult.find("<");
nPos2 = strResult.find(">",nPos1);
while ((nPos1 != -1) && (nPos2 != -1)) {
sepa=strResult.substr(nPos1,3);
nPos=strUseful.find(sepa);
n1 = funCountSymbol(strResult,nPos1,nPos2,"<");
n2 = funCountSymbol(strResult,nPos1,nPos2,">");
while (n1 != n2)
{
nPos2 = strResult.find(">",nPos2+1);
n1 = funCountSymbol(strResult,nPos1,nPos2,"<");
n2 = funCountSymbol(strResult,nPos1,nPos2,">");
}
if(nPos2 == -1) //<TITLE>博客园 - webcool</</TITLE>
{
nPos2 = strResult.find(">",nPos1);
}
strTemp1 = strResult.substr(0,nPos1);
strTemp2 = strResult.substr(nPos2+1);
funDelSideSpace(strTemp1);
funDelSideSpace(strTemp2);
if ( nPos != -1 )
{
if (sepa == "<TD") strTemp1 += " " ;
//else if (sepa == "<DI") strTemp1 += " " ; //遇div,回车
else strTemp1 += "<BR" ;
}
strResult = strTemp1+strTemp2;
nPos1=strResult.find("<",strTemp1.length());
nPos2=strResult.find(">",strTemp1.length());
}
}
//--------------------------------------------------------------------------------
// FUNCTION: Builds the output file name: the input name with its extension
//           replaced by ".txt".
// IN      : filename    - name of the web page file (*.htm / *.html).
// OUT     : sResultFile - receives the new name; the buffer must hold at least
//           strlen(filename) + 5 bytes.
// NOTE    : "page.html" and "page.htm" both give "page.txt". The extension is
//           the part after the last '.' of the last path component; a name
//           without an extension just gets ".txt" appended. Nothing is written
//           past the terminating NUL, and short or empty names are handled.
//           A NULL filename yields an empty result; a NULL sResultFile is ignored.
//--------------------------------------------------------------------------------
void Extract::funDestFilename(const char* filename,char* sResultFile)
{
    if (sResultFile == NULL) return;
    if (filename == NULL)
    {
        sResultFile[0] = '\0';
        return;
    }

    const size_t nLen = strlen(filename);

    // Scan backwards for the extension dot, stopping at a path separator so a
    // dot inside a directory name is not mistaken for one.
    size_t nStem = nLen;
    for (size_t i = nLen; i > 0; --i)
    {
        const char ch = filename[i - 1];
        if (ch == '/' || ch == '\\') break;
        if (ch == '.')
        {
            nStem = i - 1;
            break;
        }
    }

    memmove(sResultFile, filename, nStem);      // memmove: tolerates filename == sResultFile
    memcpy(sResultFile + nStem, ".txt", 5);     // 5 = ".txt" plus the terminating NUL
}
//--------------------------------------------------------------------------------
// FUNCTION: Replaces strOld by strNew in strLine until no strOld is left.
// IN      : strLine - text to process; strOld - text to find; strNew - replacement.
// OUT     : strLine - modified in place.
// NOTE    : The text is rescanned after each replacement, so occurrences that
//           only appear as a result of a replacement are replaced too (e.g.
//           replacing two spaces by one collapses a run of any length).
//           Two inputs that would otherwise never finish are handled
//           separately: an empty strOld leaves strLine unchanged, and a strNew
//           that contains strOld is applied in a single left-to-right pass
//           that skips the inserted text.
//--------------------------------------------------------------------------------
void Extract::funReplace(string& strLine,string strOld,string strNew)
{
    const string::size_type nOldLen = strOld.length();
    if (nOldLen == 0) return;

    if (strNew.find(strOld) != string::npos)
    {
        // Rescanning would find strOld inside every inserted strNew again.
        const string::size_type nNewLen = strNew.length();
        for (string::size_type at = strLine.find(strOld);
             at != string::npos;
             at = strLine.find(strOld, at + nNewLen))
        {
            strLine.replace(at, nOldLen, strNew);
        }
        return;
    }

    string::size_type at = strLine.find(strOld);
    while (at != string::npos)
    {
        strLine.replace(at, nOldLen, strNew);
        // No occurrence started before `at`, so a new one can start at most
        // nOldLen - 1 characters earlier; resuming there finds exactly what a
        // search from index 0 would find.
        const string::size_type resume = (at >= nOldLen - 1) ? at - (nOldLen - 1) : 0;
        at = strLine.find(strOld, resume);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Replaces strOld by strNew in strLine until no strOld is left.
//           Overload for wide-character strings.
// IN      : strLine - text to process; strOld - text to find; strNew - replacement.
// OUT     : strLine - modified in place.
// NOTE    : The text is rescanned after each replacement, so occurrences that
//           only appear as a result of a replacement are replaced too.
//           Two inputs that would otherwise never finish are handled
//           separately: an empty strOld leaves strLine unchanged, and a strNew
//           that contains strOld is applied in a single left-to-right pass
//           that skips the inserted text.
//--------------------------------------------------------------------------------
void Extract::funReplace(wstring& strLine,wstring strOld,wstring strNew)
{
    const wstring::size_type nOldLen = strOld.length();
    if (nOldLen == 0) return;

    if (strNew.find(strOld) != wstring::npos)
    {
        // Rescanning would find strOld inside every inserted strNew again.
        const wstring::size_type nNewLen = strNew.length();
        for (wstring::size_type at = strLine.find(strOld);
             at != wstring::npos;
             at = strLine.find(strOld, at + nNewLen))
        {
            strLine.replace(at, nOldLen, strNew);
        }
        return;
    }

    wstring::size_type at = strLine.find(strOld);
    while (at != wstring::npos)
    {
        strLine.replace(at, nOldLen, strNew);
        // No occurrence started before `at`, so a new one can start at most
        // nOldLen - 1 characters earlier; resuming there finds exactly what a
        // search from index 0 would find.
        const wstring::size_type resume = (at >= nOldLen - 1) ? at - (nOldLen - 1) : 0;
        at = strLine.find(strOld, resume);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Trims the space characters from both ends of strLine.
// IN      : strLine - text to trim.
// OUT     : strLine - trimmed in place; empty if it held nothing but spaces.
// NOTE    : Only ' ' is trimmed; other whitespace is left alone.
//--------------------------------------------------------------------------------
void Extract::funDelSideSpace(string& strLine)
{
    const string::size_type firstKept = strLine.find_first_not_of(" ");
    if (firstKept == string::npos)
    {
        // Empty, or spaces only.
        strLine.clear();
        return;
    }

    const string::size_type lastKept = strLine.find_last_not_of(" ");
    strLine = strLine.substr(firstKept, lastKept - firstKept + 1);
}
//--------------------------------------------------------------------------------
// FUNCTION: Decodes the known character entities ("&name;") in strResult.
// IN      : strResult - text to process; nPos - index at which scanning starts.
// OUT     : strResult - modified in place.
// NOTE    : Entity  -> replacement
//             &nbsp;  -> " "      &lt;    -> "<"     &gt;    -> ">"
//             &amp;   -> "&"      &quot;  -> "\""    &copy;  -> "(C)"
//             &reg;   -> "?"      &trade; -> "TM"
//             &#8226; -> "·"      &#149;  -> "·"
//           Anything else between '&' and the next ';' is left untouched.
//           Scanning resumes at the position of the replacement, so the '&'
//           produced by "&amp;" can start another entity ("&amp;lt;" ends up
//           as "<").
//           The loop requires both an '&' and a following ';'. An '&' with no
//           ';' behind it (e.g. text ending in "&nbsp") ends the scan; treating
//           the missing ';' as a position made the text grow without end.
//--------------------------------------------------------------------------------
void Extract::funDelBegAnd(string& strResult,int nPos) // scanning starts at index nPos
{
    static const struct
    {
        const char* name;
        const char* text;
    } kEntities[] =
    {
        { "nbsp",  " "   },
        { "lt",    "<"   },
        { "gt",    ">"   },
        { "amp",   "&"   },
        { "quot",  "\""  },
        { "copy",  "(C)" },
        { "reg",   "?"   },
        { "trade", "TM"  },
        { "#8226", "·"   },
        { "#149",  "·"   }
    };
    const unsigned int kEntityCount = sizeof(kEntities) / sizeof(kEntities[0]);

    // A negative nPos converts to npos, so nothing is found.
    string::size_type amp = strResult.find("&", static_cast<string::size_type>(nPos));

    while (amp != string::npos)
    {
        const string::size_type semi = strResult.find(";", amp);
        if (semi == string::npos) break;   // no terminator left: nothing more to decode

        const string::size_type nameLen = semi - amp - 1;

        // Look the name up without copying it out of the text.
        const char* replacement = NULL;
        for (unsigned int idx = 0; idx < kEntityCount; ++idx)
        {
            if (strResult.compare(amp + 1, nameLen, kEntities[idx].name) == 0)
            {
                replacement = kEntities[idx].text;
                break;
            }
        }

        if (replacement == NULL)
        {
            amp = strResult.find("&", amp + 1);   // unknown: try the next '&'
            continue;
        }

        strResult.replace(amp, nameLen + 2, replacement);   // +2 covers '&' and ';'
        amp = strResult.find("&", amp);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Removes table rows (<TR ...> ... </TR>) that contain nothing but
//           links.
// IN      : strResult - text to process.
// OUT     : strResult - modified in place.
// NOTE    : Eg: a<TR><TD><A>bc</A></TD></TR><TR><TD>d</TD></TR>e  becomes
//               a<TR><TD>d</TD></TR>e
//           A row counts as useless when nothing is left of it after removing
//           its <A> elements, all tags, the "<BR" markers and all spaces.
//           A row runs from "<TR" to the first "</TR>" behind it. The closing
//           tag is always searched from the row start, so a "</TR>" that
//           precedes the first "<TR" is not taken as that row's end.
//--------------------------------------------------------------------------------
void Extract::funDelUselessLink(string& strResult)
{
    const string::size_type kCloseLen = 5;   // length of "</TR>"

    string::size_type rowStart = strResult.find("<TR");
    while (rowStart != string::npos)
    {
        const string::size_type rowEnd = strResult.find("</TR>", rowStart);
        if (rowEnd == string::npos) break;

        const string::size_type rowLen = rowEnd - rowStart + kCloseLen;

        // Reduce a copy of the row to its visible, non-link text.
        string rowText = strResult.substr(rowStart, rowLen);
        funDelete2(rowText,"A",0);          // drop <A>...</A> including the link text
        funKeepListLabel(rowText,0);        // drop the remaining tags
        funReplace(rowText,"<BR","");       // drop the markers funKeepListLabel leaves
        funReplace(rowText," ","");

        if (rowText.empty())
        {
            // Nothing but links: remove the row and look again from the same spot.
            strResult.erase(rowStart, rowLen);
            rowStart = strResult.find("<TR", rowStart);
        }
        else
        {
            rowStart = strResult.find("<TR", rowEnd + kCloseLen);
        }
    }
}
// Destructor. The body is empty: this file releases nothing explicitly here.
Extract::~Extract()
{
}
//--------------------------------------------------------------------------------
// FUNCTION: Flattens the extracted text into one compact line.
// IN      : szReslut - NUL-terminated text.
// OUT     : newly allocated NUL-terminated copy in which
//             - line feed (10) and carriage return (13) are dropped,
//             - spaces (0x20) are dropped,
//             - every byte >= 0xA1 is dropped,
//             - the letters 'B' and 'R' are dropped,
//             - '<' becomes ',' once more than two characters have been
//               written, and is dropped before that (so the output does not
//               start with separators).
//           Returns NULL if szReslut is NULL or the allocation yields NULL.
// NOTE    : The buffer is allocated with new[]; the caller owns it and must
//           release it with delete[].
// NOTE(review): dropping 'B' and 'R' looks like a way to erase the "<BR" markers
//           left by funKeepListLabel, but it also deletes every capital B and R
//           of the real text. Kept as is because the consumers of this output
//           are not visible here - confirm the intended behaviour.
//--------------------------------------------------------------------------------
char * Extract::funReplaceReturnAndEnter(const char * szReslut)
{
    if (szReslut == NULL) return NULL;

    const size_t len = strlen(szReslut);
    char* out = new char[len + 1];
    if (out == NULL)    // only reachable with a non-throwing operator new
    {
        return NULL;
    }

    size_t kept = 0;    // number of characters written so far
    for (size_t i = 0; i < len; ++i)
    {
        const unsigned char ch = static_cast<unsigned char>(szReslut[i]);

        const bool dropped = (ch == 10) || (ch == 13) || (ch >= 0xa1) ||
                             (ch == 0x20) || (ch == 'B') || (ch == 'R');
        if (dropped) continue;

        if (ch == '<')
        {
            // Suppress separators at the very beginning of the output.
            if (kept > 2) out[kept++] = ',';
            continue;
        }

        out[kept++] = static_cast<char>(ch);
    }
    out[kept] = '\0';
    return out;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -