// extract.cpp
// (Residue of the web code viewer this file was copied from: "Font size:")
//--------------------------------------------------------------------------------
// FUNCTION: Strip everything between a tag prefix and its closing '>', i.e.
//           rewrite every "<separator ...>" in strResult as "<separator>".
// IN      : strResult - text to process.
//           separator - tag prefix to look for, e.g. "<TD".
// OUT     : strResult is modified in place (passed by reference).
// AUTHOR  : 2006-05-21 Created by navy .
//           2006-05-25 Modified by navy .
// CALL    : funDelete(strResult,"<TD");
// NOTE    : Eg: <TD style="FONT-SIZE: 24pt" width="100%"> becomes <TD>
//           The closing '>' is the first one at which the number of '<' and
//           '>' counted from the tag start is equal, so a tag that contains
//           another tag (e.g. "<TD <BR>>") is removed as a whole.
//           Processing stops at the first tag that has no balanced closing
//           '>'; the text from there on is left untouched.
//           Occurrences are handled in a loop, editing the string in place,
//           instead of recursing on a copy of the remaining tail for each tag.
//--------------------------------------------------------------------------------
void Extract::funDelete(string& strResult,string separator)
{
    const string strBareTag = separator + ">";
    int nSearchFrom = 0;

    for (;;)
    {
        // find() results are kept in int on purpose: funCountSymbol takes int
        // positions and "not found" is compared against -1 throughout this file.
        const int nPos1 = strResult.find(separator,nSearchFrom);
        int nPos2 = strResult.find(">",nPos1);
        if ((nPos1 == -1) || (nPos2 == -1)) return; // no further tag, or no '>' after it

        // Move nPos2 to the next '>' until '<' and '>' are balanced in [nPos1,nPos2].
        while (funCountSymbol(strResult,nPos1,nPos2,"<") != funCountSymbol(strResult,nPos1,nPos2,">"))
        {
            nPos2 = strResult.find(">",nPos2+1);
            if (nPos2 == -1) return; // never balances: leave the rest as it is
        }

        // Replace "<separator ...>" by "<separator>" and continue right behind it.
        strResult.replace(nPos1,nPos2 - nPos1 + 1,strBareTag);
        nSearchFrom = static_cast<int>(nPos1 + strBareTag.length());
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Count the occurrences of symbol that start at an index in [i,j].
// IN      : strResult - text to search (taken by value).
//           i, j      - first and last start index to consider (inclusive).
//           symbol    - substring to count.
// OUT     : Number of occurrences found; 0 when there are none in the range.
// AUTHOR  : 2006-05-25 Created by navy .
// CALL    : int n = funCountSymbol(strResult,4,19,"<"); gives n=4 for
// NOTE    : Eg: <TD><A<B<BR>>BC</B>>DD</A></TD>
//               0123456789012345678901234567890
//                         1         2         3
//           The search advances by one character after each hit, so
//           overlapping occurrences are counted as well.
//--------------------------------------------------------------------------------
int Extract::funCountSymbol(string strResult,int i,int j,string symbol)
{
    int nHits = 0;
    // find() yields npos when nothing is left; stored in an int it compares equal to -1.
    for (int nAt = strResult.find(symbol,i);
         (nAt != -1) && (nAt <= j);
         nAt = strResult.find(symbol,nAt+1))
    {
        ++nHits;
    }
    return nHits;
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove every "<separator ...> ... </separator>" block, including
//           the tags themselves and everything between them.
// IN      : strResult - text to process.
//           separator - tag name without brackets, e.g. "SCRIPT".
//           nPos      - index at which the search for the first block starts.
// OUT     : strResult is modified in place (passed by reference).
// AUTHOR  : 2006-05-22 Created by navy .
//           2006-05-29 Modified by navy .
// NOTE    : Eg: abc<SCRIPT language=JavaScript>...</SCRIPT>abc becomes abcabc
// NOTE    : Nested blocks are removed as one unit:
//           abc<SCRIPT><SCRIPT>...</SCRIPT>...</SCRIPT>abc becomes abcabc
// NOTE    : Processing stops at the first opening tag that has no balanced
//           closing tag; the text from there on is left untouched.
// NOTE    : Fixes over the earlier version:
//           - closing tags are counted up to the current closing tag only; the
//             old upper bound (nPos2 + length of the closing tag) also counted
//             a closing tag that followed immediately, so
//             "<A><A>x</A></A>" was cut after the first "</A>";
//           - running out of closing tags ends the scan explicitly; before,
//             the wrapped-around bound could restart the scan from index 0
//             and never terminate (e.g. "<A</A></A>" with separator "A");
//           - blocks are erased in place in a loop instead of recursing on a
//             copy of the remaining tail for every removed block.
// NOTE(review): the opening tag is matched by prefix ("<" + separator), so
//           separator "A" also matches tags such as "<ABBR" - confirm that
//           this is acceptable for the callers.
//--------------------------------------------------------------------------------
void Extract::funDelete2(string& strResult,string separator,int nPos)
{
    const string sepa1 = "<" + separator;        // opening tag prefix
    const string sepa2 = "</" + separator + ">"; // closing tag
    int nSearchFrom = nPos;

    for (;;)
    {
        // Positions stay int: funCountSymbol takes int positions and
        // "not found" is compared against -1 throughout this file.
        const int nPos1 = strResult.find(sepa1,nSearchFrom);
        int nPos2 = strResult.find(sepa2,nPos1);
        if ((nPos1 == -1) || (nPos2 == -1)) return; // no opening tag, or no closing tag after it

        // Advance to later closing tags until as many blocks were closed as
        // were opened inside [nPos1,nPos2].
        while (funCountSymbol(strResult,nPos1,nPos2,sepa1) != funCountSymbol(strResult,nPos1,nPos2,sepa2))
        {
            nPos2 = strResult.find(sepa2,nPos2+1);
            if (nPos2 == -1) return; // unbalanced: leave the rest as it is
        }

        // Drop the whole block; what followed it now starts at nPos1.
        strResult.erase(nPos1,nPos2 + sepa2.length() - nPos1);
        nSearchFrom = nPos1;
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove every tag from strResult. A tag whose first three
//           characters occur in strUseful leaves a marker behind: a single
//           space for "<TD", the text "<BR" for every other listed tag.
// IN      : strResult - text to process.
//           nPosOld   - not used by this function; kept for the callers.
// OUT     : strResult is modified in place (passed by reference).
// AUTHOR  : 2006-05-22 Created by navy .
//           2006-05-26 Modified by navy .
// NOTE    : strUseful is defined outside this function; the commented-out line
//           in the body presumably shows its shape - verify against the class.
// NOTE    : Eg: <TABLE <BR>><TD>a</TD></TABLE> becomes _a (_ stands for a
//           space), assuming "<TD" is listed in strUseful and "<TA" is not.
// NOTE    : The text before and after each removed tag is trimmed of leading
//           and trailing spaces (funDelSideSpace).
// NOTE    : The '>' for the next tag is searched from that tag's '<'. It used
//           to be searched from the end of the already processed text, so a
//           stray '>' in plain text ended up in front of the '<' and the text
//           between the two was duplicated.
//--------------------------------------------------------------------------------
void Extract::funKeepListLabel(string& strResult,int nPosOld)
{
    //string strUseful = "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>";
    string strTemp1,strTemp2;
    string sepa;
    int nPos,nPos1,nPos2;

    nPos1 = strResult.find("<");
    nPos2 = strResult.find(">",nPos1);
    while ((nPos1 != -1) && (nPos2 != -1)) {
        sepa = strResult.substr(nPos1,3);   // first three characters of the tag, e.g. "<TD"
        nPos = strUseful.find(sepa);        // -1 when the tag is not listed

        // Move nPos2 to the next '>' until '<' and '>' are balanced in
        // [nPos1,nPos2], so a tag that contains another tag is taken as a whole.
        while (funCountSymbol(strResult,nPos1,nPos2,"<") != funCountSymbol(strResult,nPos1,nPos2,">"))
        {
            nPos2 = strResult.find(">",nPos2+1);
            if (nPos2 == -1) break;
        }
        if (nPos2 == -1) // never balances, e.g. <TITLE>cnblogs - webcool</</TITLE>
        {
            nPos2 = strResult.find(">",nPos1); // fall back to the first '>' after the '<'
        }

        strTemp1 = strResult.substr(0,nPos1);
        strTemp2 = strResult.substr(nPos2+1);
        funDelSideSpace(strTemp1);
        funDelSideSpace(strTemp2);
        if ( nPos != -1 )
        {
            if (sepa == "<TD") strTemp1 += " " ;
            //else if (sepa == "<DI") strTemp1 += " " ; // on div: line break
            else strTemp1 += "<BR" ;
        }
        strResult = strTemp1+strTemp2;

        // Continue behind the processed part, so a "<BR" marker that was just
        // appended is not treated as a tag itself.
        nPos1 = strResult.find("<",strTemp1.length());
        nPos2 = strResult.find(">",nPos1);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Build the output file name by changing the extension to ".txt".
// IN      : filename    - name of the web page file (*.htm / *.html).
// OUT     : sResultFile - same name with the extension replaced by ".txt".
//                         The buffer must hold at least strlen(filename) + 5
//                         bytes (the case where ".txt" is appended).
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : The extension is whatever follows the last '.' of the last path
//           component ('/' and '\' both count as separators). A name without
//           an extension, or one that only starts with a '.', gets ".txt"
//           appended.
// NOTE    : The earlier version decided between ".htm" and ".html" by looking
//           at the last character only, padded the result with strncpy(...,10)
//           and wrote a terminator at index strlen(filename)+4, i.e. beyond
//           the copied name; names shorter than four characters indexed in
//           front of the buffer.
// NOTE    : Nothing is written when either pointer is null.
//--------------------------------------------------------------------------------
void Extract::funDestFilename(const char* filename,char* sResultFile)
{
    if (!filename || !sResultFile) return;

    // Start of the last path component, so a '.' in a directory name is ignored.
    const char* pBase = filename;
    for (const char* p = filename; *p != '\0'; ++p)
    {
        if ((*p == '/') || (*p == '\\')) pBase = p + 1;
    }

    // Length of the part to keep: everything in front of the extension dot.
    const char* pDot = strrchr(pBase,'.');
    const size_t nStemLen = (pDot && (pDot != pBase))
                          ? static_cast<size_t>(pDot - filename)
                          : strlen(filename);

    memmove(sResultFile,filename,nStemLen);
    strcpy(sResultFile + nStemLen,".txt");
}
//--------------------------------------------------------------------------------
// FUNCTION: Replace the string strOld by strNew in strLine.
// IN      : strLine - one line of text; strOld - text to look for;
//           strNew - text to put in its place.
// OUT     : strLine with the replacements applied (passed by reference).
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : After each replacement the line is searched again from its
//           beginning, so an occurrence that only comes into existence through
//           a replacement is replaced too (e.g. "  " -> " " collapses any run
//           of spaces to one).
// NOTE    : Two inputs that used to loop forever are handled explicitly:
//           - an empty strOld leaves strLine unchanged;
//           - when strNew itself contains strOld, each original occurrence is
//             replaced once and the search continues behind the inserted text.
//--------------------------------------------------------------------------------
void Extract::funReplace(string& strLine,string strOld,string strNew)
{
    if (strOld.empty()) return; // find("") matches at index 0 every time

    // Re-scanning the inserted text would find strOld again without end.
    const bool bNewContainsOld = (strNew.find(strOld) != string::npos);

    string::size_type nPos = strLine.find(strOld);
    while (nPos != string::npos) {
        strLine.replace(nPos,strOld.length(),strNew);
        nPos = strLine.find(strOld,bNewContainsOld ? nPos + strNew.length() : 0);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Replace the string strOld by strNew in strLine. Overload for wide
//           character strings.
// IN      : strLine - one line of text; strOld - text to look for;
//           strNew - text to put in its place.
// OUT     : strLine with the replacements applied (passed by reference).
// AUTHOR  : 2006-05-21 Created by navy .
// NOTE    : After each replacement the line is searched again from its
//           beginning, so an occurrence that only comes into existence through
//           a replacement is replaced too.
// NOTE    : Two inputs that used to loop forever are handled explicitly:
//           - an empty strOld leaves strLine unchanged;
//           - when strNew itself contains strOld, each original occurrence is
//             replaced once and the search continues behind the inserted text.
//--------------------------------------------------------------------------------
void Extract::funReplace(wstring& strLine,wstring strOld,wstring strNew)
{
    if (strOld.empty()) return; // find(L"") matches at index 0 every time

    // Re-scanning the inserted text would find strOld again without end.
    const bool bNewContainsOld = (strNew.find(strOld) != wstring::npos);

    wstring::size_type nPos = strLine.find(strOld);
    while (nPos != wstring::npos) {
        strLine.replace(nPos,strOld.length(),strNew);
        nPos = strLine.find(strOld,bNewContainsOld ? nPos + strNew.length() : 0);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove the spaces at both ends of strLine.
// IN      : strLine
// OUT     : strLine without leading and trailing ' ' characters; empty when
//           it consisted of spaces only (passed by reference).
// AUTHOR  : 2006-05-22 Created by navy .
// NOTE    : Only the space character is trimmed; tabs and line breaks stay.
//--------------------------------------------------------------------------------
void Extract::funDelSideSpace(string& strLine)
{
    const string::size_type nFirst = strLine.find_first_not_of(' ');
    if (nFirst == string::npos)
    {
        // Empty already, or nothing but spaces.
        strLine.clear();
        return;
    }

    const string::size_type nLast = strLine.find_last_not_of(' ');
    strLine = strLine.substr(nFirst,nLast - nFirst + 1);
}
//--------------------------------------------------------------------------------
// FUNCTION: Replace the known character references (text that starts with '&'
//           and ends with ';') by plain text.
// IN      : strResult - text to process.
//           nPos      - index at which processing starts.
// OUT     : strResult is modified in place (passed by reference).
// AUTHOR  : 2006-05-24 Created by navy .
//           2006-05-29 Modified by navy .
// NOTE    : 1."&nbsp;"-->" "    2."&lt;"-->"<"      3."&gt;"-->">"
//           4."&amp;"-->"&"     5."&quot;"-->"\""   6."&copy;"-->"(C)"
//           7."&reg;"-->"?"     8."&trade;"-->"TM"  9."&#8226;"-->"·"
//           Any other "&...;" sequence is left as it is.
// NOTE    : Fixes over the earlier version:
//           - the loop runs only while both an '&' and a ';' behind it exist.
//             It used to continue when just one of them was found, so a text
//             ending in e.g. "&amp" (no ';') re-inserted the whole string on
//             every pass and never finished;
//           - the search continues behind the inserted text, so the '&' that
//             comes out of "&amp;" is not read as the start of another
//             reference ("&amp;lt;" becomes "&lt;", not "<").
//--------------------------------------------------------------------------------
void Extract::funDelBegAnd(string& strResult,int nPos) // start at index nPos
{
    // Reference name (between '&' and ';') and the text that replaces it.
    static const char* const aEntity[][2] = {
        { "nbsp",  " "   },
        { "lt",    "<"   },
        { "gt",    ">"   },
        { "amp",   "&"   },
        { "quot",  "\""  },
        { "copy",  "(C)" },
        { "reg",   "?"   },
        { "trade", "TM"  },
        { "#8226", "·"   }
    };
    const int nEntityCount = sizeof(aEntity) / sizeof(aEntity[0]);

    // find() results are kept in int and compared against -1, as in the rest
    // of this file.
    int nPos1 = strResult.find("&",nPos);
    int nPos2 = strResult.find(";",nPos1);
    while ((nPos1 != -1) && (nPos2 != -1))
    {
        const string strName = strResult.substr(nPos1+1,nPos2-nPos1-1);

        const char* pszText = 0;
        for (int k = 0; k < nEntityCount; ++k)
        {
            if (strName == aEntity[k][0])
            {
                pszText = aEntity[k][1];
                break;
            }
        }

        if (pszText == 0)
        {
            // Not a known reference: try the next '&'.
            nPos1 = strResult.find("&",nPos1+1);
        }
        else
        {
            const string strText = pszText;
            strResult.replace(nPos1,nPos2 - nPos1 + 1,strText);
            nPos1 = strResult.find("&",nPos1 + strText.length());
        }
        nPos2 = strResult.find(";",nPos1);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Delete useless links: remove every "<TR ...> ... </TR>" section
//           that contains nothing but links.
// IN      : strResult
// OUT     : strResult is modified in place (passed by reference).
// AUTHOR  : 2006-05-27 Created by navy .
// NOTE    : Eg: a<TR><TD><A>bc</A></TD></TR><TR><TD>d</TD></TR>e becomes
//               a<TR><TD>d</TD></TR>e
// NOTE    : A section counts as empty when nothing is left of a copy of it
//           after funDelete2(..,"A",0), funKeepListLabel and removing all
//           "<BR" markers and spaces. Sections that keep some text stay in
//           strResult unchanged.
// NOTE    : A section ends at the first "</TR>" behind its "<TR". The first
//           "</TR>" used to be searched from the start of the text; a stray
//           "</TR>" in front of the first "<TR" then produced a negative
//           section length and overlapping pieces.
//--------------------------------------------------------------------------------
void Extract::funDelUselessLink(string& strResult)
{
    const int nCloseLen = 5; // length of "</TR>"

    // find() results are kept in int and compared against -1, as in the rest
    // of this file.
    int nPos1 = strResult.find("<TR");
    int nPos2 = strResult.find("</TR>",nPos1);
    while ((nPos1 != -1) && (nPos2 != -1)) {
        const int nRowLen = nPos2 - nPos1 + nCloseLen;

        // Work on a copy of the section to see whether anything but links is in it.
        string strRow = strResult.substr(nPos1,nRowLen);
        funDelete2(strRow,"A",0);       // drop <A ...> ... </A> with its content
        funKeepListLabel(strRow,0);     // drop the remaining tags
        funReplace(strRow,"<BR","");
        funReplace(strRow," ","");

        int nNext;
        if (strRow.empty())
        {
            // Nothing but links: cut the section out; the text behind it now starts at nPos1.
            strResult.erase(nPos1,nRowLen);
            nNext = nPos1;
        }
        else
        {
            nNext = nPos2 + nCloseLen;
        }
        nPos1 = strResult.find("<TR",nNext);
        nPos2 = strResult.find("</TR>",nPos1);
    }
}
// Destructor: the body is empty, no explicit cleanup is performed here.
Extract::~Extract() {}
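// (Residue of the web code viewer this file was copied from - keyboard shortcuts:
//  copy code: Ctrl + C; search code: Ctrl + F; full screen: F11;
//  switch theme: Ctrl + Shift + D; show shortcuts: ?;
//  larger font: Ctrl + =; smaller font: Ctrl + -)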