⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 html2txt.cpp

📁 从html文件提取可显示的文本内容。可用于windows和linux环境。
💻 CPP
字号:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <vector>
#include <string>
#include <assert.h>
#include <time.h>
#ifdef WIN32
#include <io.h>
#include <fcntl.h>
#endif

using namespace std;
/*
 * Searches backwards in strBuf for the last occurrence of `tag` that ends
 * at or before index `pos` (i.e. that starts at an index <= pos - strlen(tag)).
 *
 * Returns the start index of that occurrence, or `pos` itself when no such
 * occurrence exists (including when pos - strlen(tag) is negative).
 *
 * Implemented with std::string::rfind so that no scratch buffer has to be
 * allocated (the previous malloc'd buffer was never freed) and so that a
 * `pos` beyond the end of the string is clamped instead of throwing.
 */
int findrev(std::string &strBuf,const int pos, const char* tag)
{
	const int taglen = static_cast<int>(strlen(tag));
	const int start = pos - taglen;
	if(start < 0)
	{
		return pos;   // no room for the tag before pos
	}
	const std::string::size_type hit =
		strBuf.rfind(tag, static_cast<std::string::size_type>(start),
		             static_cast<std::string::size_type>(taglen));
	if(hit == std::string::npos)
	{
		return pos;
	}
	return static_cast<int>(hit);
}

/*
 * Resolves nesting of <tag ...> ... </tag> inside the region that starts at
 * nBgPos (first index after the outer opening tag) and ends at nEndPos
 * (index of the closing tag currently assumed to match it).
 *
 * For every "<tag" found at or after nBgPos and before nEndPos, the opener
 * is neutralised by overwriting its '<' with '0' (so it is not found again)
 * and nEndPos is advanced to the next "</tag" after the current one.
 *
 * strBuf   buffer being scanned; nested openers are modified in place.
 * nBgPos   start of the element body (not modified).
 * nEndPos  in: index of the first closing tag; out: index of the closing tag
 *          that matches the outer opener, or -1 if the buffer ran out of
 *          closing tags.
 * tag      element name without angle brackets, e.g. "script".
 *
 * Returns the number of nested openers that were neutralised.
 *
 * The tag strings are built in std::string so that tag names of any length
 * are safe, and "no opener found" is detected explicitly instead of being
 * confused with a hit at nEndPos.
 */
int DisNest(std::string &strBuf, int &nBgPos, int &nEndPos, const char *tag)
{
	std::string openTag("<");
	openTag += tag;
	std::string closeTag("</");
	closeTag += tag;
	const int openLen = static_cast<int>(openTag.size());

	int nCount = 0;
	// An opener can only lie before nEndPos if there is room for it; this
	// also terminates the loop when nEndPos has become -1 (no closer left).
	while(nEndPos >= openLen)
	{
		const std::string::size_type hit =
			strBuf.rfind(openTag, static_cast<std::string::size_type>(nEndPos - openLen));
		if(hit == std::string::npos || static_cast<int>(hit) < nBgPos)
		{
			break;   // nearest opener is the outer one (or there is none)
		}
		nCount++;
		strBuf[hit] = '0';
		const std::string::size_type nextClose =
			strBuf.find(closeTag, static_cast<std::string::size_type>(nEndPos) + 1);
		nEndPos = (nextClose == std::string::npos) ? -1 : static_cast<int>(nextClose);
	}
	return nCount;
}
/*
 * Returns the largest value stored in vContenRatio over the inclusive index
 * range [nBgPos, nEndPos]. The result is never below 0: an empty range or a
 * range holding only non-positive values yields 0.
 */
int FindMaxRatio(std::vector<int> &vContenRatio, const int nBgPos, const int nEndPos)
{
	int best = 0;
	int idx = nBgPos;
	while(idx <= nEndPos)
	{
		const int candidate = vContenRatio[idx];
		best = (candidate > best) ? candidate : best;
		++idx;
	}
	return best;
}

void main(int argc, char *argv[])
{
	int timebg, timeend;
	timebg = clock();
	if(argc < 7) 
	{
		printf("Usage: %s : -i htmlfile -o txtfile -c config", argv[0]);
		exit(0);
	}
    
	char *HtmlFile, *TxtFile, *ConfigFile;
	HtmlFile = TxtFile = ConfigFile = NULL;
	FILE *fpHtml, *fpTxt, *fpConfig;
	/* process command line*/
	for(int i=1; i<argc; i++)
	{
		if(argv[i][1] == 'i') 
		{
			HtmlFile = argv[++i];
		}
		else if(argv[i][1] == 'o')
		{
			TxtFile = argv[++i];
		}
		else if(argv[i][1] == 'c') 
		{
			ConfigFile = argv[++i];
		}
		else
		{
			printf("Usage: %s : -i htmlfile -o txtfile -c config", argv[0]);
			exit(0);
		}
	}  //end of process command line
	
	/* check the file if existing*/
	if(strcmp(HtmlFile, "-") == 0)  //can be used for the command 'cat' in linux mode
	{
#ifdef WIN32
		if(_setmode(_fileno(stdin), _O_BINARY ) == -1) 
		{
			printf("Error: STDIN can't be set binary mode!\n");
			exit(0);
		}
#endif
		fpHtml = stdin;
	}
	else if((fpHtml = fopen(HtmlFile, "rb")) == NULL) 
	{
		printf("Error: can't open input file %s!\n", HtmlFile);
		exit(0);
	}
	if((fpTxt = fopen(TxtFile, "wb")) == NULL) 
	{
		printf("Error: can't open output file %s!\n", TxtFile);
		exit(0);
	}
	if((fpConfig = fopen(ConfigFile, "rb")) == NULL) 
	{
		printf("Error: can't open config file %s!\n", ConfigFile);
		exit(0);
	} //end of open file

	/************************************************************************/
	/* read config file                                                     */
	/************************************************************************/
	char *ContentBegin, *ContentEnd;
	bool Title;
	double ContentRatio;
	int WindowLen;
	ContentBegin = (char*)malloc(sizeof(char)*128);
	ContentEnd = (char *)malloc(sizeof(char)*128);
	char *line = (char *)malloc(sizeof(char)*256);
	strcpy(ContentBegin, "");          //default value
	strcpy(ContentEnd, "");          //default value
	Title = false;                          //default value
	ContentRatio = 0.7;                     //default value
	WindowLen = 150;                        //default value
	while(!feof(fpConfig)) 
	{

		if(!fgets(line, 256, fpConfig)) 
		{
			break;
		}
		if(line[0] == '#') //配置文件中可以用#做为注释
		{
			continue;
		}
		int len = strlen(line) -1;
		while(len >= 0 && (line[len] == '\r' || line[len] == '\n')) 
		{
			line[len--] = '\0';
		}
		string temp;
		if(!strncmp(line, "ContentBegin", 12))
		{
			temp = line;
			temp.copy(ContentBegin, strlen(line)-13,13);
			ContentBegin[strlen(line)-13] = '\0';
			temp.erase();
		}
		else if(!strncmp(line, "ContentEnd", 10)) 
		{
			temp = line;
			temp.copy(ContentEnd, strlen(line)-11, 11);
			ContentEnd[strlen(line)-11] = '\0';
			temp.erase();
		}
		else if(!strncmp(line, "Title", 5)) 
		{
			char ti[2];
			temp = line;
			temp.copy(ti, 1, 6);
			ti[1] = '\0';
			if(ti[0] == 't' || ti[0] == 'T') 
			{
				Title = true;
			}
			else if(ti[0] == 'f' || ti[0] == 'F') 
			{
				Title = false;
			}
		}
		else if(!strncmp(line, "ContentRatio", 12)) 
		{
			char ra[32] = {0};
			temp = line;
			temp.copy(ra, strlen(line)-13, 13);
			ra[strlen(line)-13] = '\0';
			ContentRatio = atof(ra);
		}
		else if(!strncmp(line, "WindowLen", 9)) 
		{
			char wi[32] = {0};
			temp = line;
			temp.copy(wi, strlen(line)-10, 10);
			wi[strlen(line)-10] = '\0';
			WindowLen = atoi(wi);
		}
		else
		{
			printf("Warning: Unrecognized parameters %s!\n", line);
		}
	}
	/************************************************************************/
	/* lower case to <***>, stored the content in string "strBuf"           */
	/* 并标记某字符是否可能做为内容到容器vector<bool> IsContent	            */
	/************************************************************************/
	fseek(fpHtml, 0, SEEK_SET);
	int nposbegin = ftell(fpHtml);
	fseek(fpHtml, 0, SEEK_END);
	int nposend = ftell(fpHtml);
	fseek(fpHtml, 0, SEEK_SET);
	int nlen = nposend - nposbegin;
	string strBuf;
	char *tempBuf = (char *)malloc(sizeof(char)*nlen);
	fread(tempBuf, nlen, 1, fpHtml);
	strBuf = tempBuf;
	//处理&nbsp;和&copy;,放在此处处理,可以提高以后计算content ratio的准确率
	int nNbspLoc = 0;
	nNbspLoc = strBuf.find("&nbsp", 0);
	while(nNbspLoc > 0) 
	{
		strBuf.erase(nNbspLoc,6);
		nNbspLoc = strBuf.find("&nbsp", 0);
	}
	int nCopyLoc = 0, nCopyEnd = 0;
	nCopyLoc = strBuf.find("&copy", 0);
	while(nCopyLoc > 0) 
	{
		nCopyEnd = strBuf.find("<", nCopyLoc);
		strBuf.erase(nCopyLoc, nCopyEnd-nCopyLoc);
		nCopyLoc = strBuf.find("&copy", 0);
	}
	int nSize = strBuf.size();
	vector<bool> vIsContent;
	vIsContent.assign(nSize, true);
	bool bFind = false;
	char cTemp;
	for(i=0; i<strBuf.size(); i++)
	{
		cTemp = strBuf.at(i);
		if(cTemp == '<') 
		{
			bFind = true;
			vIsContent[i] = false;
		}
		if(cTemp == '>')
		{
			bFind = false;
			vIsContent[i] = false;
		}
		if(bFind == true) 
		{
			if(isupper(cTemp))
			{
				cTemp = tolower(cTemp);
				strBuf[i] = cTemp;
			}
			vIsContent[i] = false;
						
		}
	} //end of tolowercase
	/************************************************************************/
	/* 处理<script...>***</script>中间的内容,将***删除                     */
	/* 处理<style....>***</style> 中间的内容,将***删除                     */
	/************************************************************************/
	nSize = strBuf.size();
	bool isScript = false;
	bool isStyle = false;
	for(i=0; i<strBuf.size(); i++)
	{
		cTemp = strBuf.at(i);
		int nBgPos = 0;
		int nEndPos = 0;
		int nNest = 0;  //是否有嵌套存在
		if(cTemp == '<' && strBuf.compare(i+1, 6, "script") == 0 )
		{
			int j = i;
			while(1) 
			{				
				cTemp = strBuf.at(j++);
				if(cTemp == '>') 
				{
					nBgPos = j;
					break;
				}
			}
			nEndPos = strBuf.find("</script>", nBgPos);
			nNest = DisNest(strBuf, nBgPos, nEndPos, "script"); //解嵌套
			strBuf.erase(nBgPos, nEndPos-nBgPos);
			vIsContent.erase(vIsContent.begin()+nBgPos, vIsContent.begin()+nEndPos);
			nBgPos = nEndPos = 0;
		}
		else if(cTemp == '<' && strBuf.compare(i+1, 5, "style") == 0) 
		{
			int j = i;
			while(1) 
			{
				cTemp = strBuf.at(j++);
				if(cTemp == '>') 
				{
					nBgPos = j;
					break;
				}
			}
			nEndPos = strBuf.find("</style>", nBgPos);
			nNest = DisNest(strBuf, nBgPos, nEndPos, "style"); //解嵌套
			strBuf.erase(nBgPos, nEndPos-nBgPos);
			vIsContent.erase(vIsContent.begin()+nBgPos, vIsContent.begin()+nEndPos);
			nBgPos = nEndPos = 0;
		}
	}  //end of remove middle of <script>***</script>&&<style>***</style>
	assert((vIsContent.size()==strBuf.size()));
//	FILE *fpTemp;                        //temp debug begin
//	fpTemp = fopen("temp.txt", "wb");
//	for(i=0; i<nlen; i++) tempBuf[i] = ' ';
//	strBuf.copy(tempBuf, strBuf.size(), 0);
//	fwrite(tempBuf, sizeof(char), strBuf.size(), fpTemp);       
//    fflush(fpTemp);                      //temp debug end
	/************************************************************************/
	/*计算content ratio                                                     */
	/************************************************************************/
	nSize = strBuf.size();
	vector<int> vContentRatio;
    vContentRatio.assign(nSize, 0);
	vector<int> vMaxRatioLoc;
	int MaxRatio = 0;
	int j;
	for(i=0; i<nSize; i++)
	{
		if(vIsContent[i] == true)
		{
			for(j=0; j<WindowLen/2; j++)
			{
				if(i+j >= nSize) continue;
				if(vIsContent[i+j] == true && strBuf.at(i+j) != 0x20 && strBuf.at(i+j) != '\r'
					&& strBuf.at(i+j) != '\n' && strBuf.at(i+j) != 0x09)
				{
					vContentRatio[i]++;
				}
			}
			for(j=0; j<WindowLen/2; j++)
			{
				if(i-j <= 0) continue;
				if (vIsContent[i-j] && strBuf.at(i-j) != 0x20 && strBuf.at(i-j) != '\r'
					&& strBuf.at(i-j) != '\n' && strBuf.at(i-j) != 0x09) 
				{
					vContentRatio[i]++;
				}
			}
			if(MaxRatio < vContentRatio[i])
			{
				MaxRatio = vContentRatio[i];
				vMaxRatioLoc.clear();
				vMaxRatioLoc.push_back(i);
			}
			else if(MaxRatio == vContentRatio[i]) 
			{
				vMaxRatioLoc.push_back(i);
			}
		}
//		fprintf(fpTemp, "%c(%d)", strBuf.at(i), vContentRatio[i]);
	}  //end of compute content ratio
//    fflush(fpTemp);fprintf(fpTemp, "\nNOTE");
//	for(i=0; i<nSize; i++)
//	{
//		fprintf(fpTemp, "%d\n", vContentRatio[i]);
//	}
//	fprintf(fpTemp,"\n");

	/************************************************************************/
	/* 将确定为内容的字符放到vDesStr中                                      */
	/************************************************************************/
	//判断内容的开始&结束位置
	int ContentBeginLoc = 0, ContentEndLoc = strBuf.size();
	if(strcmp(ContentBegin, "")) 
	{
		ContentBeginLoc = strBuf.find(ContentBegin, 0);
	}
	if(strcmp(ContentEnd, "")) 
	{
		ContentEndLoc = strBuf.find(ContentEnd, 0);
	} 
	int TempMaxRatio;
	int nBgPos, nEndPos;
	vector<char> vDesStr;
	//处理是否需要Title的情况
	if(Title) //保留Title
	{
		int TitleBgLoc = strBuf.find("<title>", 0) + 7; 
		int TitleEndLoc = strBuf.find("</title>", 0);
		for(int k=TitleBgLoc; k<TitleEndLoc; k++)
		{
			vDesStr.push_back(strBuf.at(k));
		}
		vDesStr.push_back('\n');
	}
	for(i=0; i<nSize; i++)
	{
		if(vContentRatio[i] > 0)
		{
			nBgPos  = i;
			while (1) 
			{
				if(vContentRatio[i++] == 0)
				{
					nEndPos = i-1;
					break;
				}
			}
			TempMaxRatio = FindMaxRatio(vContentRatio, nBgPos, nEndPos);
			if((double)TempMaxRatio <= MaxRatio*0.5) 
			{
				continue;
			}
			else if(nBgPos > ContentEndLoc || nEndPos < ContentBeginLoc) 
			{
				continue;
			}
			else
			{
				for(int k=nBgPos; k<nEndPos; k++)
				{
					vDesStr.push_back(strBuf.at(k));
				}
				vDesStr.push_back(' ');
			}
		}
	}
	for(i=0; i<vDesStr.size(); i++)
	{
		fprintf(fpTxt, "%c", vDesStr.at(i));
	}
	/************************************************************************/
	/*关闭文件,注意标准输入的情况                                          */
	/************************************************************************/
	timeend = clock();
	if(fpHtml == stdin) 
	{
#ifdef WIN32
		if(_setmode(_fileno(stdin), _O_TEXT) == -1)
		{
			printf("Error : 'stdin' cannot set back to TEXT mode.\n");
			exit(1);
		}
#endif
	}
	else
	{
		fclose(fpHtml);
	}
	fclose(fpTxt);
	printf("Html2txt Done!\n");
// 	fclose(fpTemp);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -