// html2txt.cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <vector>
#include <string>
#include <assert.h>
#include <time.h>
#ifdef WIN32
#include <io.h>
#include <fcntl.h>
#endif
using namespace std;
/**
 * Searches strBuf backwards for the last occurrence of tag that lies
 * entirely before index pos (i.e. that starts at or before pos - strlen(tag)).
 *
 * @param strBuf buffer to search (not modified).
 * @param pos    exclusive upper bound for the end of the match.
 * @param tag    NUL-terminated text to look for.
 * @return index of the first character of the match, or pos itself when
 *         there is no match. For a non-empty tag a real match always
 *         starts before pos, so the two cases are distinguishable.
 */
int findrev(std::string &strBuf, const int pos, const char* tag)
{
    const int taglen = static_cast<int>(strlen(tag));

    // No room for the tag before pos: report "not found".
    if (pos < taglen)
    {
        return pos;
    }

    // rfind() does the backward scan without a heap scratch buffer (the
    // previous malloc'd buffer was never freed) and never compares bytes
    // beyond the end of strBuf.
    const std::string::size_type hit =
        strBuf.rfind(tag, static_cast<std::string::size_type>(pos - taglen));

    return (hit == std::string::npos) ? pos : static_cast<int>(hit);
}
/**
 * Resolves nesting of <tag ...> elements inside the region
 * [nBgPos, nEndPos) of strBuf.
 *
 * Every opening "<tag" found inside the region is neutralised by
 * overwriting its '<' with '0' (so it is not found again), and for each
 * one nEndPos is moved forward to the next "</tag", so that the region
 * finally ends at the closing tag that matches the outermost opener.
 *
 * @param strBuf  buffer being processed; nested openers are overwritten.
 * @param nBgPos  index just behind the outermost opening tag (not modified).
 * @param nEndPos in: index of the first closing tag (a value outside
 *                [0, size] is treated as "no closing tag");
 *                out: index of the matching closing tag, or strBuf.size()
 *                when the buffer ends before one is found.
 * @param tag     element name without brackets, e.g. "script".
 * @return number of nested opening tags that were neutralised.
 */
int DisNest(std::string &strBuf, int &nBgPos, int &nEndPos, const char *tag)
{
    // std::string instead of fixed 10-byte arrays: no overflow for long tag names.
    const std::string openTag = std::string("<") + tag;
    const std::string closeTag = std::string("</") + tag;
    const int bufLen = static_cast<int>(strBuf.size());

    // A failed find() in the caller arrives here as -1; treat the region
    // as running to the end of the buffer.
    if (nEndPos < 0 || nEndPos > bufLen)
    {
        nEndPos = bufLen;
    }

    int nCount = 0;
    while (true)
    {
        const int openPos = findrev(strBuf, nEndPos, openTag.c_str());

        // findrev() returns its start position when nothing was found;
        // an opener before nBgPos is the outermost one. Either way we are done.
        if (openPos == nEndPos || openPos < nBgPos)
        {
            break;
        }

        nCount++;
        strBuf[openPos] = '0';   // hide this opener from the next backward search

        const std::string::size_type nextClose =
            strBuf.find(closeTag, static_cast<std::string::size_type>(nEndPos) + 1);
        nEndPos = (nextClose == std::string::npos) ? bufLen
                                                   : static_cast<int>(nextClose);
    }
    return nCount;
}
/**
 * Returns the largest value stored in vContenRatio within the inclusive
 * index range [nBgPos, nEndPos].
 *
 * The range is clamped to the vector, so out-of-range bounds never cause
 * an out-of-bounds read.
 *
 * @param vContenRatio per-position scores (not modified).
 * @param nBgPos       first index to inspect.
 * @param nEndPos      last index to inspect (inclusive).
 * @return the maximum score in the range; 0 when the range is empty or
 *         contains no positive value.
 */
int FindMaxRatio(std::vector<int> &vContenRatio, const int nBgPos, const int nEndPos)
{
    const int lastValid = static_cast<int>(vContenRatio.size()) - 1;
    const int first = (nBgPos < 0) ? 0 : nBgPos;
    const int last = (nEndPos > lastValid) ? lastValid : nEndPos;

    int MaxRatio = 0;
    for(int i = first; i <= last; i++)
    {
        if(MaxRatio < vContenRatio[i])
        {
            MaxRatio = vContenRatio[i];
        }
    }
    return MaxRatio;
}
void main(int argc, char *argv[])
{
int timebg, timeend;
timebg = clock();
if(argc < 7)
{
printf("Usage: %s : -i htmlfile -o txtfile -c config", argv[0]);
exit(0);
}
char *HtmlFile, *TxtFile, *ConfigFile;
HtmlFile = TxtFile = ConfigFile = NULL;
FILE *fpHtml, *fpTxt, *fpConfig;
/* process command line*/
for(int i=1; i<argc; i++)
{
if(argv[i][1] == 'i')
{
HtmlFile = argv[++i];
}
else if(argv[i][1] == 'o')
{
TxtFile = argv[++i];
}
else if(argv[i][1] == 'c')
{
ConfigFile = argv[++i];
}
else
{
printf("Usage: %s : -i htmlfile -o txtfile -c config", argv[0]);
exit(0);
}
} //end of process command line
/* check the file if existing*/
if(strcmp(HtmlFile, "-") == 0) //can be used for the command 'cat' in linux mode
{
#ifdef WIN32
if(_setmode(_fileno(stdin), _O_BINARY ) == -1)
{
printf("Error: STDIN can't be set binary mode!\n");
exit(0);
}
#endif
fpHtml = stdin;
}
else if((fpHtml = fopen(HtmlFile, "rb")) == NULL)
{
printf("Error: can't open input file %s!\n", HtmlFile);
exit(0);
}
if((fpTxt = fopen(TxtFile, "wb")) == NULL)
{
printf("Error: can't open output file %s!\n", TxtFile);
exit(0);
}
if((fpConfig = fopen(ConfigFile, "rb")) == NULL)
{
printf("Error: can't open config file %s!\n", ConfigFile);
exit(0);
} //end of open file
/************************************************************************/
/* read config file */
/************************************************************************/
char *ContentBegin, *ContentEnd;
bool Title;
double ContentRatio;
int WindowLen;
ContentBegin = (char*)malloc(sizeof(char)*128);
ContentEnd = (char *)malloc(sizeof(char)*128);
char *line = (char *)malloc(sizeof(char)*256);
strcpy(ContentBegin, ""); //default value
strcpy(ContentEnd, ""); //default value
Title = false; //default value
ContentRatio = 0.7; //default value
WindowLen = 150; //default value
while(!feof(fpConfig))
{
if(!fgets(line, 256, fpConfig))
{
break;
}
if(line[0] == '#') //配置文件中可以用#做为注释
{
continue;
}
int len = strlen(line) -1;
while(len >= 0 && (line[len] == '\r' || line[len] == '\n'))
{
line[len--] = '\0';
}
string temp;
if(!strncmp(line, "ContentBegin", 12))
{
temp = line;
temp.copy(ContentBegin, strlen(line)-13,13);
ContentBegin[strlen(line)-13] = '\0';
temp.erase();
}
else if(!strncmp(line, "ContentEnd", 10))
{
temp = line;
temp.copy(ContentEnd, strlen(line)-11, 11);
ContentEnd[strlen(line)-11] = '\0';
temp.erase();
}
else if(!strncmp(line, "Title", 5))
{
char ti[2];
temp = line;
temp.copy(ti, 1, 6);
ti[1] = '\0';
if(ti[0] == 't' || ti[0] == 'T')
{
Title = true;
}
else if(ti[0] == 'f' || ti[0] == 'F')
{
Title = false;
}
}
else if(!strncmp(line, "ContentRatio", 12))
{
char ra[32] = {0};
temp = line;
temp.copy(ra, strlen(line)-13, 13);
ra[strlen(line)-13] = '\0';
ContentRatio = atof(ra);
}
else if(!strncmp(line, "WindowLen", 9))
{
char wi[32] = {0};
temp = line;
temp.copy(wi, strlen(line)-10, 10);
wi[strlen(line)-10] = '\0';
WindowLen = atoi(wi);
}
else
{
printf("Warning: Unrecognized parameters %s!\n", line);
}
}
/************************************************************************/
/* lower case to <***>, stored the content in string "strBuf" */
/* 并标记某字符是否可能做为内容到容器vector<bool> IsContent */
/************************************************************************/
fseek(fpHtml, 0, SEEK_SET);
int nposbegin = ftell(fpHtml);
fseek(fpHtml, 0, SEEK_END);
int nposend = ftell(fpHtml);
fseek(fpHtml, 0, SEEK_SET);
int nlen = nposend - nposbegin;
string strBuf;
char *tempBuf = (char *)malloc(sizeof(char)*nlen);
fread(tempBuf, nlen, 1, fpHtml);
strBuf = tempBuf;
//处理 和©,放在此处处理,可以提高以后计算content ratio的准确率
int nNbspLoc = 0;
nNbspLoc = strBuf.find(" ", 0);
while(nNbspLoc > 0)
{
strBuf.erase(nNbspLoc,6);
nNbspLoc = strBuf.find(" ", 0);
}
int nCopyLoc = 0, nCopyEnd = 0;
nCopyLoc = strBuf.find("©", 0);
while(nCopyLoc > 0)
{
nCopyEnd = strBuf.find("<", nCopyLoc);
strBuf.erase(nCopyLoc, nCopyEnd-nCopyLoc);
nCopyLoc = strBuf.find("©", 0);
}
int nSize = strBuf.size();
vector<bool> vIsContent;
vIsContent.assign(nSize, true);
bool bFind = false;
char cTemp;
for(i=0; i<strBuf.size(); i++)
{
cTemp = strBuf.at(i);
if(cTemp == '<')
{
bFind = true;
vIsContent[i] = false;
}
if(cTemp == '>')
{
bFind = false;
vIsContent[i] = false;
}
if(bFind == true)
{
if(isupper(cTemp))
{
cTemp = tolower(cTemp);
strBuf[i] = cTemp;
}
vIsContent[i] = false;
}
} //end of tolowercase
/************************************************************************/
/* 处理<script...>***</script>中间的内容,将***删除 */
/* 处理<style....>***</style> 中间的内容,将***删除 */
/************************************************************************/
nSize = strBuf.size();
bool isScript = false;
bool isStyle = false;
for(i=0; i<strBuf.size(); i++)
{
cTemp = strBuf.at(i);
int nBgPos = 0;
int nEndPos = 0;
int nNest = 0; //是否有嵌套存在
if(cTemp == '<' && strBuf.compare(i+1, 6, "script") == 0 )
{
int j = i;
while(1)
{
cTemp = strBuf.at(j++);
if(cTemp == '>')
{
nBgPos = j;
break;
}
}
nEndPos = strBuf.find("</script>", nBgPos);
nNest = DisNest(strBuf, nBgPos, nEndPos, "script"); //解嵌套
strBuf.erase(nBgPos, nEndPos-nBgPos);
vIsContent.erase(vIsContent.begin()+nBgPos, vIsContent.begin()+nEndPos);
nBgPos = nEndPos = 0;
}
else if(cTemp == '<' && strBuf.compare(i+1, 5, "style") == 0)
{
int j = i;
while(1)
{
cTemp = strBuf.at(j++);
if(cTemp == '>')
{
nBgPos = j;
break;
}
}
nEndPos = strBuf.find("</style>", nBgPos);
nNest = DisNest(strBuf, nBgPos, nEndPos, "style"); //解嵌套
strBuf.erase(nBgPos, nEndPos-nBgPos);
vIsContent.erase(vIsContent.begin()+nBgPos, vIsContent.begin()+nEndPos);
nBgPos = nEndPos = 0;
}
} //end of remove middle of <script>***</script>&&<style>***</style>
assert((vIsContent.size()==strBuf.size()));
// FILE *fpTemp; //temp debug begin
// fpTemp = fopen("temp.txt", "wb");
// for(i=0; i<nlen; i++) tempBuf[i] = ' ';
// strBuf.copy(tempBuf, strBuf.size(), 0);
// fwrite(tempBuf, sizeof(char), strBuf.size(), fpTemp);
// fflush(fpTemp); //temp debug end
/************************************************************************/
/*计算content ratio */
/************************************************************************/
nSize = strBuf.size();
vector<int> vContentRatio;
vContentRatio.assign(nSize, 0);
vector<int> vMaxRatioLoc;
int MaxRatio = 0;
int j;
for(i=0; i<nSize; i++)
{
if(vIsContent[i] == true)
{
for(j=0; j<WindowLen/2; j++)
{
if(i+j >= nSize) continue;
if(vIsContent[i+j] == true && strBuf.at(i+j) != 0x20 && strBuf.at(i+j) != '\r'
&& strBuf.at(i+j) != '\n' && strBuf.at(i+j) != 0x09)
{
vContentRatio[i]++;
}
}
for(j=0; j<WindowLen/2; j++)
{
if(i-j <= 0) continue;
if (vIsContent[i-j] && strBuf.at(i-j) != 0x20 && strBuf.at(i-j) != '\r'
&& strBuf.at(i-j) != '\n' && strBuf.at(i-j) != 0x09)
{
vContentRatio[i]++;
}
}
if(MaxRatio < vContentRatio[i])
{
MaxRatio = vContentRatio[i];
vMaxRatioLoc.clear();
vMaxRatioLoc.push_back(i);
}
else if(MaxRatio == vContentRatio[i])
{
vMaxRatioLoc.push_back(i);
}
}
// fprintf(fpTemp, "%c(%d)", strBuf.at(i), vContentRatio[i]);
} //end of compute content ratio
// fflush(fpTemp);fprintf(fpTemp, "\nNOTE");
// for(i=0; i<nSize; i++)
// {
// fprintf(fpTemp, "%d\n", vContentRatio[i]);
// }
// fprintf(fpTemp,"\n");
/************************************************************************/
/* 将确定为内容的字符放到vDesStr中 */
/************************************************************************/
//判断内容的开始&结束位置
int ContentBeginLoc = 0, ContentEndLoc = strBuf.size();
if(strcmp(ContentBegin, ""))
{
ContentBeginLoc = strBuf.find(ContentBegin, 0);
}
if(strcmp(ContentEnd, ""))
{
ContentEndLoc = strBuf.find(ContentEnd, 0);
}
int TempMaxRatio;
int nBgPos, nEndPos;
vector<char> vDesStr;
//处理是否需要Title的情况
if(Title) //保留Title
{
int TitleBgLoc = strBuf.find("<title>", 0) + 7;
int TitleEndLoc = strBuf.find("</title>", 0);
for(int k=TitleBgLoc; k<TitleEndLoc; k++)
{
vDesStr.push_back(strBuf.at(k));
}
vDesStr.push_back('\n');
}
for(i=0; i<nSize; i++)
{
if(vContentRatio[i] > 0)
{
nBgPos = i;
while (1)
{
if(vContentRatio[i++] == 0)
{
nEndPos = i-1;
break;
}
}
TempMaxRatio = FindMaxRatio(vContentRatio, nBgPos, nEndPos);
if((double)TempMaxRatio <= MaxRatio*0.5)
{
continue;
}
else if(nBgPos > ContentEndLoc || nEndPos < ContentBeginLoc)
{
continue;
}
else
{
for(int k=nBgPos; k<nEndPos; k++)
{
vDesStr.push_back(strBuf.at(k));
}
vDesStr.push_back(' ');
}
}
}
for(i=0; i<vDesStr.size(); i++)
{
fprintf(fpTxt, "%c", vDesStr.at(i));
}
/************************************************************************/
/*关闭文件,注意标准输入的情况 */
/************************************************************************/
timeend = clock();
if(fpHtml == stdin)
{
#ifdef WIN32
if(_setmode(_fileno(stdin), _O_TEXT) == -1)
{
printf("Error : 'stdin' cannot set back to TEXT mode.\n");
exit(1);
}
#endif
}
else
{
fclose(fpHtml);
}
fclose(fpTxt);
printf("Html2txt Done!\n");
// fclose(fpTemp);
}
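// Code-viewer keyboard shortcuts (page residue, not part of the program):
//   copy code: Ctrl + C; search code: Ctrl + F; full screen: F11;
//   toggle theme: Ctrl + Shift + D; show shortcuts: ?;
//   increase font size: Ctrl + =; decrease font size: Ctrl + -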