// extract.cpp
// (Web code-viewer residue, not part of the program: "font size" control label.)
//================================================================================
// CLASS : Extract
// FUNCTION: 提取网页内容,去除html标签。
// AUTHOR : 2006-05-21 Created by navy
// REFER : 输入文件名(*.htm/html)
// NOTE : Extract.cpp , implementation file
//================================================================================
#include "StdAfx.h"
#include "Extract.h"
#include "windows.h"
// Default constructor.
// Seeds strUseful with STR_USEFUL, which the original source annotates as
// "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>".
Extract::Extract()
    : strUseful(STR_USEFUL)
{
}
//--------------------------------------------------------------------------------
// FUNCTION: Process a single file.
// IN : strFileName - name of the page file to convert.
// OUT : .txt file (produced by funProcess).
// AUTHOR : 2006-05-24 Created by navy .
// NOTE : This entry point does not touch strDirSource / strDirDestination;
//        funProcess() and funOutput() prepend whatever those members currently
//        hold to the file names they open.
//--------------------------------------------------------------------------------
void Extract::funProcessFile(string strFileName)
{
    // Single-file entry point: hand the name straight to the shared pipeline.
    this->funProcess(strFileName);
}
//--------------------------------------------------------------------------------
// FUNCTION: Process every file in a directory whose extension is strExteName.
// IN : strDirSour  - source directory (without trailing separator)
//      strDirDest  - destination directory; when empty, the source directory is used
//      strExteName - file extension to match (without the dot)
// OUT : one txt file per processed page (written by funProcess/funOutput).
// AUTHOR : 2006-05-24 Created by navy .
// NOTE : Sets the members strDirSource and strDirDestination (each with a
//        trailing "\\") before any file is processed.
//        Files of 3 MB or more are reported on stdout and skipped.
//        Returns silently when no file matches.
//--------------------------------------------------------------------------------
void Extract::funProcessDirectory(string strDirSour,string strDirDest,string strExteName)
{
    // Files of this many bytes or more are not processed.
    const unsigned long kMaxFileSize = 3UL*1024*1024;

    if (strDirDest.empty())
    {
        strDirDest = strDirSour;
    }
    strDirSource      = strDirSour + "\\";
    strDirDestination = strDirDest + "\\";

    // strDirSource already ends with a separator, so the search pattern contains
    // a doubled backslash; it is kept exactly as in the original code.
    _finddata_t fdata;
    // _findfirst returns an intptr_t search handle; keeping it in a 'long'
    // would truncate it on 64-bit Windows.
    const intptr_t hFind = _findfirst((strDirSource+"\\*."+strExteName).c_str(),&fdata);
    if (hFind == -1)
    {
        //cout << "No match files .";
        return;
    }

    int nCount = 0;   // number of files actually handed to funProcess
    do
    {
        if (fdata.size < kMaxFileSize)
        {
            ++nCount;
            cout << nCount << " : " <<"processing " << fdata.name << " ";
            funProcess(fdata.name);
        }
        else
        {
            cout << "********************************************************" << endl;
            cout << "* size over 3M !" << endl;
            cout << "* file name : " << fdata.name << " did not process . " << endl;
            cout << "********************************************************" << endl;
        }
    } while (_findnext(hFind,&fdata) == 0);

    _findclose(hFind);
}
//--------------------------------------------------------------------------------
// FUNCTION: Open a web page file, strip its tags and extract the text content.
// IN : filename - page file (*.htm/html); it is opened as strDirSource + filename.
// OUT : txt file written by funOutput(); vecResult is refilled by funResult().
// AUTHOR : 2006-05-22 Created by navy .
// NOTE : Progress is reported on stdout: a "." per stage and per 3000 lines,
//        and a running counter followed by "W" every 10000 lines.
//        Prints "Can't open the file !" and returns when the input cannot be opened.
//--------------------------------------------------------------------------------
void Extract::funProcess(string filename)
{
    ifstream fin((strDirSource + filename).c_str());
    string strLine,strResult;
    int i = 1 , j = 1 ;   // i: line counter within the current 10000-line batch, j: batch counter

    // Buffer that funDestFilename() fills with the output file name. It is owned
    // by a string so it is released on every exit path (the original
    // 'new char[100]' was never freed), and it is sized from the input name so a
    // long file name has room (assumes the destination name is not much longer
    // than the source name - funDestFilename is not visible here).
    string strDestName(filename.size() + 100, '\0');
    char *sResultFile = &strDestName[0];
    funDestFilename(filename.c_str(),sResultFile);
    cout << "." ; // progress dot

    if (!fin)
    {
        cout<<"Can't open the file !"<<endl ;
        return ;
    }

    // Read the page line by line and concatenate everything into strResult.
    while ( getline(fin,strLine) )
    {
        if (i%3000 == 0) cout << "." ;
        if (i++ == 10000)
        {
            cout <<" " << j++ << "W";
            i = 1 ;
        }
        funConvCapital(strLine);
        //funPreProcess(strLine);

        // Drop trailing '\r' (carriage return) / '\n' (line feed) characters.
        // The emptiness check matters: blank lines are common, and indexing
        // size()-1 on an empty string reads out of bounds.
        while (!strLine.empty())
        {
            const char chLast = strLine[strLine.size()-1];
            if (chLast != '\r' && chLast != '\n')
            {
                break;
            }
            strLine.erase(strLine.size()-1);
        }
        strResult+=strLine;
    }
    cout << "." ; // progress dot

    funDelete2(strResult,"SCRIPT",0); // remove the content between <SCRIPT></SCRIPT>
    funDelete2(strResult,"STYLE",0);  // remove the content between <STYLE></STYLE>
    funDelete2(strResult,"XML",0);    // remove the content between <XML></XML>
    if ( STR_A ) funDelete2(strResult,"A",0);
    funDelete(strResult,"<A");        // turn <A ...> into <A>
    funDelete(strResult,"<P");        // turn <P ...> into <P>
    funDelete(strResult,"<TD");
    if ( STR_REPLACE_VERTICAL ) funReplace(strResult,"|"," ");
    if ( STR_SPACE ) funReplace(strResult," ","");  // optionally delete all spaces
    funDelUselessLink(strResult);     // delete useless links
    cout << "." ; // progress dot

    funKeepListLabel(strResult,0);    // 0 = start searching at the first character
    cout << "." ; // progress dot

    // The two literals below had been collapsed to a single space, which made
    // both calls replace " " with " " and contradicted their comments.
    funReplace(strResult,"\t"," ");   // replace TAB with a space
    funReplace(strResult,"  "," ");   // keep at most one consecutive space
    cout << "." ; // progress dot

    funResult(strResult);
    cout << "." ; // progress dot

    funOutput(sResultFile);
    // fin is closed by its destructor.
}
//--------------------------------------------------------------------------------
// FUNCTION: Split the processed text into fragments and store them in a vector.
// IN : strResult - processed page text
// OUT : vecResult - cleared, then filled with the non-empty fragments
// AUTHOR : 2006-05-22 Created by navy .
// NOTE : Fragments are delimited by the 3-character marker "<BR". Exactly those
//        three characters are skipped, so whatever follows the marker stays at
//        the start of the next fragment. Every fragment, including the last one,
//        is passed through funDelSideSpace() and dropped when it ends up empty.
//--------------------------------------------------------------------------------
void Extract::funResult(string strResult)
{
    static const char szMarker[] = "<BR";
    const string::size_type nMarkerLen = sizeof(szMarker) - 1;

    funDelBegAnd(strResult,0); // replace strings that start with '&' and end with ';'
    vecResult.clear();

    // Walk the text with a moving offset instead of repeatedly copying the
    // remainder (the original substr() loop was quadratic in the text length).
    string::size_type nBegin = 0;
    for (;;)
    {
        const string::size_type nPos = strResult.find(szMarker, nBegin);
        const bool bLast = (nPos == string::npos);

        string strTmp = bLast ? strResult.substr(nBegin)
                              : strResult.substr(nBegin, nPos - nBegin);
        funDelSideSpace(strTmp); // strip leading/trailing spaces
        if (!strTmp.empty())
        {
            vecResult.push_back(strTmp);
        }

        if (bLast)
        {
            break;
        }
        nBegin = nPos + nMarkerLen;
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: 将vecResult输出到filename.txt中。
// IN : filename
// OUT : 将结果保存在文件中。
// AUTHOR : 2006-05-22 Created by navy .
// NOTE :
//--------------------------------------------------------------------------------
void Extract::funOutput(char* filename)
{
ofstream fout;
int i,n;
string strTmp,strTmpOld;
fout.open((strDirDestination + filename).c_str());
n=vecResult.size();
for ( i=0;i<n;i++ )
{
strTmpOld = strTmp ;
strTmp = vecResult[i] ; //cout << strTmp <<endl;
funDelSideSpace(strTmp); //cout << strTmp <<endl;
if((i > 0) && (strTmp == strTmpOld)) continue;
if ( strTmp == "·" ) continue;
//fout << "" << strTmp << endl;
//fout << " " << strTmp << endl; //加空格,好看一些。
fout << strTmp << endl; //不加空格
}
cout << " Finish !" << endl;
fout.close();
}
//--------------------------------------------------------------------------------
// FUNCTION: Convert selected lower-case tag text in a line to upper case.
// IN : strLine - one line of the page; modified in place.
// OUT : strLine with the enabled replacements applied.
// AUTHOR : 2006-05-27 Created by navy .
// NOTE : Each group is switched on by its STR_*_CAPITAL macro (defined outside
//        this file section). The "script" and "style" replacements are always
//        applied. The calls run in the order written; the STR_SPACE step sits
//        between the tag replacements and the script/style replacements.
//--------------------------------------------------------------------------------
void Extract::funConvCapital(string& strLine)
{
// "<ti" / "</ti" prefixes (presumably aimed at <title> - TODO confirm).
if(STR_TI_CAPITAL)
{
funReplace(strLine,"<ti","<TI");
funReplace(strLine,"</ti","</TI");
}
// "<tr" / "</tr" prefixes.
if(STR_TR_CAPITAL)
{
funReplace(strLine,"<tr","<TR");
funReplace(strLine,"</tr","</TR");
}
// "<td" / "</td" prefixes.
if(STR_TD_CAPITAL)
{
funReplace(strLine,"<td","<TD");
funReplace(strLine,"</td","</TD");
}
// "<p" prefix only; note the pattern also matches any other tag starting with "<p".
if(STR_P_CAPITAL)
{
funReplace(strLine,"<p","<P");
}
// "<br" prefix only.
if(STR_BR_CAPITAL)
{
funReplace(strLine,"<br","<BR");
}
// "<di" / "</di" prefixes (presumably aimed at <div> - TODO confirm).
if(STR_DI_CAPITAL)
{
funReplace(strLine,"<di","<DI");
funReplace(strLine,"</di","</DI");
}
// The pattern has no leading '<', so it is not limited to tag names.
if(STR_XML_CAPITAL)
{
funReplace(strLine,"xml","XML");
}
// "<a" prefix and the complete "</a>" closing tag.
if(STR_A_CAPITAL )
{
funReplace(strLine,"<a","<A");
funReplace(strLine,"</a>","</A>");
}
// Optionally replace " " with nothing, i.e. remove spaces from the line.
if(STR_SPACE )
{
funReplace(strLine," ","");
}
// Unconditional; these patterns have no leading '<' either.
funReplace(strLine,"script","SCRIPT");
funReplace(strLine,"style","STYLE");
}
//--------------------------------------------------------------------------------
// FUNCTION: Pre-process strLine: replace full-width spaces with ordinary spaces.
// IN : strLine - one line, in the system ANSI code page (CP_ACP).
// OUT : strLine is updated in place.
// AUTHOR : 2006-05-21 Created by navy .
// NOTE : The line is converted to UTF-16, the replacement is done on the wide
//        string, and the result is converted back to CP_ACP. Buffers are sized
//        from the conversion API, so lines of any length are handled (the
//        original fixed 1024/2048-element buffers could be overrun by the
//        terminator write and blanked lines that did not fit).
//        If a conversion fails, strLine is left unchanged.
//--------------------------------------------------------------------------------
void Extract::funPreProcess(string& strLine)
{
    if (strLine.empty())
    {
        return; // nothing to convert; the API rejects zero-length input
    }

    // ANSI -> UTF-16: first ask for the required length, then convert.
    const int nSrcLen  = static_cast<int>(strLine.size());
    const int nWideLen = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, NULL, 0);
    if (nWideLen <= 0)
    {
        return;
    }
    wstring wss(nWideLen, L'\0');
    const int nWideDone = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, &wss[0], nWideLen);
    if (nWideDone <= 0)
    {
        return;
    }
    wss.resize(nWideDone);

    // Replace the full-width (ideographic) space U+3000 with an ordinary space.
    // The literal had been collapsed to an ASCII space, making this a no-op.
    funReplace(wss,L"\x3000",L" ");
    if (wss.empty())
    {
        strLine.erase();
        return;
    }

    // UTF-16 -> ANSI, again sizing the buffer from the API.
    const int nWssLen = static_cast<int>(wss.size());
    const int nBytes  = WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWssLen, NULL, 0, NULL, NULL);
    if (nBytes <= 0)
    {
        return;
    }
    string strOut(nBytes, '\0');
    const int nBytesDone = WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWssLen, &strOut[0], nBytes, NULL, NULL);
    if (nBytesDone <= 0)
    {
        return;
    }
    strOut.resize(nBytesDone);
    strLine.swap(strOut);
}
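// ---- Web code-viewer residue (not part of the program): keyboard shortcut help ----
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -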