⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httptse.cpp

📁 小型搜索引擎,用C/C++编写,属于全文搜索引擎
💻 CPP
📖 第 1 页 / 共 2 页
字号:
#include <stdlib.h>#include <stdio.h>#include <string.h>#include <strings.h>#include <errno.h>#include <netdb.h>#include <unistd.h>#include <netinet/in.h>#include <sys/types.h>#include <sys/socket.h>#include <sys/time.h>#include <fcntl.h>#include <iostream>#include "HttpTse.h"#include "Tse.h"#include "Url.h"#include "Page.h"#include "StrFun.h"int _checkBufSize(char **buf, int *bufsize, int more);using namespace std;char *userAgent = NULL;int timeout = DEFAULT_TIMEOUT;int hideUserAgent = 0;map<string,string> mapCacheHostLookup;typedef map<string,string>::value_type valTypeCHL;extern map<unsigned long,unsigned long> mapIpBlock;extern vector<string> vsUnreachHost;	/*         * Actually downloads the page, registering a hit (donation)         *      If the fileBuf passed in is NULL, the url is downloaded and then         *      freed; otherwise the necessary space is allocated for fileBuf.         *      Returns size of download on success, -1 on error is set,	 * 	-2 on 301.         */int HttpFetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock ){	//const char *url_tmp = strUrl.c_str();	char *tmp, *url, *host, *charIndex, *requestBuf, *pageBuf;	int sock, bytesRead = 0, contentLength = -1, bufsize = REQUEST_BUF_SIZE;	int ret = -1, i=-1, tempSize, selectRet;	//if(url_tmp == NULL){	if( strUrl.empty() ){		cout << "strUrl is NULL" << endl;		return -1;	}	/* Copy the url passed in into a buffer we can work with, change, etc. */	//url = (char*)malloc(strlen(url_tmp)+1);	url = (char*)malloc(strUrl.length()+1);	if(url == NULL){		cout << "can not allocate enought memory for url" << endl;		return -1;	}	memset(url, 0,strUrl.length()+1);	//memset(url, strlen(url_tmp)+1, 0);	//strncpy(url, url_tmp, strlen(url_tmp) + 1);	memcpy(url, strUrl.c_str(), strUrl.length() );	charIndex = strstr(url, "://");	if(charIndex != NULL){		/* url contains a protocol field */		charIndex += strlen("://");		host = charIndex;		charIndex = strchr(charIndex, '/');	}else{		host = (char *)url;		charIndex = strchr(url, '/');	}	/* Compose a request string */	requestBuf = (char*)malloc(bufsize);	if(requestBuf == NULL){		free(url);		cout << "can not allocate enought memory for requestBuf" << endl;		return -1;	}	requestBuf[0] = 0;	if(charIndex == NULL){		/* The url has no '/' in it, assume the user is making a root-level                 *      request */		tempSize = strlen("GET /") + strlen(HTTP_VERSION) +1;		if( tempSize > bufsize ){			free(url);			free(requestBuf);			cout << "tempSize larger than bufsize" << endl;			return -1;		}		if(_checkBufSize(&requestBuf, &bufsize, tempSize) ||			snprintf(requestBuf, bufsize, "GET / %s\n", 			HTTP_VERSION) < 0 )		{			free(url);			free(requestBuf);			cout << "1._checkBuffSize(&requestBuf..) error" << endl;			return -1;		}		//requestBuf = "GET / " + (string)HTTP_VERSION + "\n";	}else{		tempSize = strlen("GET ") + strlen(charIndex) + strlen(HTTP_VERSION) + 3;		if(_checkBufSize(&requestBuf, &bufsize, tempSize) ||			snprintf(requestBuf, bufsize, "GET %s %s\n", 			charIndex, HTTP_VERSION) < 0)		{			free(url);			free(requestBuf);			cout << "2._checkBuffSize(&requestBuf..) error" << endl;			return -1;		}		//requestBuf = "GET / " + (string)charIndex + HTTP_VERSION + "\n";				}	/* Null out the end of the hostname if need be */	if(charIndex != NULL){		*charIndex = 0;	}	/* Use Host: even though 1.0 doesn't specify it.  Some servers         *      won't play nice if we don't send Host, and it shouldn't hurt anything */	tempSize = (int)strlen("Host: ") + (int)strlen(host) + 2;/* +2 for "\n\0" */	if(_checkBufSize(&requestBuf, &bufsize, tempSize + 128)){		free(url);		free(requestBuf);		cout << "3._checkBuffSize(&requestBuf..) error" << endl;		return -1;	}	strcat(requestBuf, "Host: ");	strcat(requestBuf, host);	strcat(requestBuf, "\n");	if(!hideUserAgent && userAgent == NULL) {		tempSize = (int)strlen("User-Agent: ") +			(int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 3;		if(_checkBufSize(&requestBuf, &bufsize, tempSize)) {			free(url);			free(requestBuf);			cout << "4._checkBuffSize(&requestBuf..) error" << endl;			return -1;		}		strcat(requestBuf, "User-Agent: ");		strcat(requestBuf, DEFAULT_USER_AGENT);		strcat(requestBuf, "/");		strcat(requestBuf, VERSION);		strcat(requestBuf, "\n");	} else if(!hideUserAgent) {		tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 2;		if(_checkBufSize(&requestBuf, &bufsize, tempSize)) {			free(url);			free(requestBuf);			cout << "5._checkBuffSize(&requestBuf..) error" << endl;			return -1;		}		strcat(requestBuf, "User-Agent: ");		strcat(requestBuf, userAgent);		strcat(requestBuf, "\n");	}	//tempSize = (int)strlen("Connection: Close\n\n");	tempSize = (int)strlen("Connection: Keep-Alive\n\n");	if(_checkBufSize(&requestBuf, &bufsize, tempSize)) {		free(url);		free(requestBuf);		cout << "6._checkBuffSize(&requestBuf..) error" << endl;		return -1;	}	//strcat(requestBuf, "Connection: Close\n\n");	strcat(requestBuf, "Connection: Keep-Alive\n\n");	/* Now free any excess memory allocated to the buffer */	tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1);	if(tmp == NULL){		free(url);		free(requestBuf);		cout << "realloc for tmp error" << endl;		return -1;	}	requestBuf = tmp;	if( *nPSock != -1 ){		sock = *nPSock;		cout << "using privous socket" << *nPSock << endl;	}else{		cout << "1.get a new one" << endl;		sock = MakeSocket(host);		if(sock == -1) { 			free(url); 			free(requestBuf);			cout << "1.not able to MakeSocket" << endl;			return -1;		}		if(sock == -2) { 			free(url); 			free(requestBuf);			cout << "2.not able to MakeSocket" << endl;			return -1;		}	}		//cout << "requestBuf is " << requestBuf << endl;	if(write(sock, requestBuf, strlen(requestBuf)) == -1){		cout << "write error" << endl;		close(sock);		*nPSock  = -1;		cout << "2.close previous socket " << *nPSock << " and get a new one" << endl;		//maybe sock is dead,try again		sock = MakeSocket(host);		if(sock == -1) { 			free(url);			free(requestBuf);			cout << "3.not able to MakeSocket" << endl;			return -1;		}		if(sock == -2) { 			free(url);			free(requestBuf);			cout << "4.not able to MakeSocket" << endl;			return -1;		}		if(write(sock, requestBuf, strlen(requestBuf)) == -1){			close(sock);			*nPSock = -1;			free(url);			free(requestBuf);			cout << "write error" << endl;			return -1;		}	}	free(url);	free(requestBuf);	char headerBuf[HEADER_BUF_SIZE];	/* Grab enough of the response to get the metadata */	memset( headerBuf,0,HEADER_BUF_SIZE );	//cout << "old sock is " << sock << endl;	ret = _http_read_header(sock, headerBuf);	//cout << "ret = " << ret << endl;	if(ret < 0) { 		close(sock); 		*nPSock = -1;		cout << "_http_read() error " << endl;		return -1;	}	//cout << headerBuf << endl;	charIndex = strstr(headerBuf, "HTTP/");	if(charIndex == NULL){		close(sock);		*nPSock = -1;		cout << headerBuf << endl;		cout << "strstr() error " << endl;		return -1;	}	while(*charIndex != ' '){		charIndex++;	}	charIndex++;		ret = sscanf(charIndex, "%i", &i);	if(ret != 1){		close(sock); 		*nPSock = -1;		cout << "sscanf() error" << endl;		return -1;	}	#ifdef DEBUG	// http return code		cout <<"######Http return code: ######" << endl << i << endl;	#endif	// deal with http://net.cs.pku.edu.cn/~cnds	if(i == 301 || i == 302){		char *loc;		loc = (char*)malloc(URL_LEN);		if(loc == NULL){			close(sock);			*nPSock = -1;			cout << "malloc error" << endl;			return -1;		}		charIndex = strstr(headerBuf, "Location:");		if(charIndex != NULL){			//ret = sscanf(charIndex + strlen("Location: "), "%(URL_LEN-1)s",loc);			ret = sscanf(charIndex + strlen("Location: "), "%255s",loc);			if(ret != 1){				close(sock);				*nPSock = -1;				cout << headerBuf << endl;				cout << "sscanf() error" << endl;				return -1;			} else{				*location = loc;				close(sock);				*nPSock = -1;				//cout << "sscanf() else error" << endl;				return -2;			}		}	}	if(i<200 || i>299 ){		close(sock);		*nPSock = -1;		cout << "ret code = " << i << " < 200 or > 299" << endl;		return -1;	}	charIndex = strstr(headerBuf, "Content-Length:");	if(charIndex == NULL){		charIndex = strstr(headerBuf, "Content-length:");	}	if(charIndex == NULL){		/* Allocate enough memory to hold the page */		//if(contentLength == -1){			contentLength = DEFAULT_PAGE_BUF_SIZE;		//}	}else{		ret = sscanf(charIndex + strlen("content-length: "), "%i", 			&contentLength);		if(ret < 1){			close(sock);			*nPSock = -1;			cout << "sscanf() error" << endl;			return -1;		}	}	if(contentLength < 20){		contentLength = DEFAULT_PAGE_BUF_SIZE;	}	if(contentLength > MAX_PAGE_BUF_SIZE){		cout << "the page discarde due to its size " << contentLength 			<< " is larger than " << MAX_PAGE_BUF_SIZE << endl;		//close(sock);		return -1;	}	#ifdef DEBUG	// http content length		cout <<"######Content length: ######" << endl << contentLength << endl;	#endif	pageBuf = (char *)malloc(contentLength);	if(pageBuf == NULL){		close(sock);		*nPSock = -1;		cout << "malloc for pageBuf" << endl;		return -1;	}	        /* Begin reading the body of the file */	fd_set rfds;	struct timeval tv;	int flags;	flags=fcntl(sock,F_GETFL,0);        if(flags<0){		close(sock);		*nPSock = -1;		free(pageBuf);		cout << "1.fcntl() error " << endl;		return -1;	}        flags|=O_NONBLOCK;        if(fcntl(sock,F_SETFL,flags)<0){		close(sock);		*nPSock = -1;		free(pageBuf);		cout << "2.fcntl() error " << endl;		return -1;	}	int pre_ret=0;	while(ret > 0){		FD_ZERO(&rfds);		FD_SET(sock, &rfds);		if( bytesRead == contentLength ){			tv.tv_sec = 1;		}else{			tv.tv_sec = timeout;		}		tv.tv_usec = 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -