⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
		if (up->user == (char *) NULL && up->password == (char *) NULL)			ftp_get_auth(up);		/*      If still no userinfo, set to defaults           */		if (up->user == (char *) NULL)			up->user = xstrdup("anonymous");		if (up->password == (char *) NULL) {			sprintf(buf, "%s@", getmylogin());			up->password = xstrdup(buf);		}		break;	case URL_HTTP:		break;	case URL_GOPHER:		if (strlen(up->pathname) == 1)			up->gophertype = 1;		else			up->gophertype = *(up->pathname + 1) - 0x30;		up->filename = NULL;		break;	case URL_NEWS:	case URL_NOP:	case URL_X:		break;	default:		Debug(20, 1, ("url_open: WARNING: Unsupported URL: %s\n",			up->url));		url_close(up);		return (NULL);		break;	}	Debug(20, 9, ("url_open: returning %#08x\n", up));	return (up);}/* *  url_read() - Reads n bytes in buf from the URL *up.  Returns the *  number of bytes read, or a negative number on error. */int url_read(buf, n, off, up)     char *buf;			/* buffer to place the data */     int n;			/* read at most n bytes */     int off;			/* offset into URL data */     URL *up;			/* URL */{	int x;	if (!up)		return (-1);	if (up->filename == NULL)		if (url_retrieve(up))			return (-1);	if ((up->fp = fopen(up->filename, "r")) == NULL) {		log_errno(up->filename);		return (-1);	}	if (off > 0 && fseek(up->fp, off, SEEK_SET)) {		log_errno(up->filename);		fclose(up->fp);		return (-1);	}	x = fread(buf, 1, n, up->fp);	fclose(up->fp);	up->fp = NULL;	return (x);}/* *  url_retrieve() - Retrieves the URL's data and places it into a *  temporary file.  Returns non-zero if it could not retrieve the data; *  otherwise returns 0; Returns -1 if the url pointed at a redirect. * * Return code indicates severity of error (DW 6/9/95): * *      -1      URL is a redirect - new location in up->url *      1-9     'soft', maybe temporary errors.  Doesn't necessarily *              mean the object doens't exist. *      10+     'hard' errors from remote HTTPD.  
The URL is invalid *              or no longer exists * * Return codes: *      0       Success *      1       DNS errors (from get_sockaddr()) *      2       socket()/bind() errors *      3       connect() errors *      4       network write/read errors *      10      HTTP/FTP/Gopher/etc protocol errors * */int url_retrieve(up)     URL *up;{	int get_code;#ifdef USE_MD5	extern char *get_md5();#endif#ifdef USE_CCACHE	DataReturn *dataRec;#endif#ifdef USE_LOCAL_CACHE	int cache_hit = 0;	/* See if we have the file in the cache already */	if (use_local_cache && up->type != URL_FILE && up->type != URL_NOP &&	    up->type != URL_X && up->filename == NULL) {		char *s = lookup_cache(up->url);		Debug(20, 1, ("url_retrieve: lookup_cache: returned: %s\n", s ? s : "Null"));		if (s != NULL) {			up->filename = s;			up->shsafe_filename = shsafe_path(up->filename);			up->lmt = lmt_cache(up->url);			up->flags |= URL_FLAG_NEED_UNLINK;			cache_hit = 1;		}	}#endif	if (up->type == URL_NOP || up->type == URL_X)		return (1);	/*	 *  If we don't have the file, then grab it via the access protocol	 */	if (up->filename == NULL) {		if (liburl_sleep_time > 0)			sleep(liburl_sleep_time);		switch (up->type) {		case URL_FTP:#ifdef USE_CCACHE			/*			 *  Use the FTP connection cache, rather than ftp_get().			 
*/			if ((dataRec = SockGetData(up, TEMP, NULL)) == NULL)				return (1);			up->filename = xstrdup(dataRec->fileName);			up->shsafe_filename = shsafe_path(up->filename);			free(dataRec);			break;#endif		case URL_GOPHER:		case URL_NEWS:		case URL_HTTP:			up->filename = tempnam(NULL, url_table[up->type].scheme);			up->flags |= URL_FLAG_NEED_UNLINK;			up->shsafe_filename = shsafe_path(up->filename);			get_code = (*url_table[up->type].get_func) (up);			/* >0 - Error on the fetch */			if (get_code > 0) {				Debug(20, 1, ("WARNING: url_retrieve: Cannot access %s\n",					up->url));				(void) unlink(up->filename);				xfree(up->filename);				xfree(up->shsafe_filename);				up->filename = NULL;				up->shsafe_filename = NULL;				return get_code;			}			/* -1 - Redirect */			if (get_code == -1) {			        Debug(20, 1, ("url_retrieve: Redirected to %s\n",					      up->url));				(void) unlink(up->filename);				xfree(up->filename);				xfree(up->shsafe_filename);				up->filename = NULL;				up->flags = 0;				up->shsafe_filename = NULL;				return get_code;			}			break;		case URL_FILE:			errorlog("Internal Error: url_retrieve: %s\n", up->url);			break;		default:			Debug(20, 1, ("WARNING: url_retrieve: Unsupported type: %s\n", up->url));			return 10;		}	}	if (up->lmt == 0)		get_lmt(up);#ifdef USE_LOCAL_CACHE	/* Now that we have the file, add it to the cache */	if (use_local_cache && up->type != URL_FILE && !cache_hit) {		add_cache(up->url, up->filename, up->lmt);		/* if (up->redir_from_url != (char *) NULL)                        add_cache(up->redir_from_url, up->filename, up->lmt);			*/	}#endif /* USE_LOCAL_CACHE */#ifdef USE_MD5	/* Compute an MD5 checksum, if not done already */	if (up->md5 == NULL)		up->md5 = get_md5(up->filename);#endif	return 0;}/* *  url_close() - Closes the URL, and frees memory. 
*/void url_close(up)     URL *up;{	if (!up)		return;	if (up->filename && up->type != URL_FILE	    && (up->flags & URL_FLAG_NEED_UNLINK))		(void) unlink(up->filename);	if (up->shsafe_filename)		xfree(up->shsafe_filename);	if (up->url)		xfree(up->url);	if (up->raw_pathname)		xfree(up->raw_pathname);	if (up->pathname)		xfree(up->pathname);	if (up->host)		xfree(up->host);	if (up->user)		xfree(up->user);	if (up->password)		xfree(up->password);	if (up->filename)		xfree(up->filename);#ifdef USE_MD5	if (up->md5)		xfree(up->md5);#endif	if (up->http_version)		xfree(up->http_version);	if (up->http_mime_hdr)		xfree(up->http_mime_hdr);	if (up->http_reason_line)		xfree(up->http_reason_line);	xfree(up);}/* *  Tolower() - converts an entire string to lowercase. */static void Tolower(q)     char *q;{	char *s = q;	while (*s) {		*s = tolower(*s);		s++;	}}/* *  url_parse() - Parses the URL from the URL *up and sets up's values. *  Returns an allocated URL structure on success; otherwise, returns NULL. */static URL *url_parse(url)     char *url;{	static URL *up = NULL;	char *urlbuf = NULL;	char *buf = NULL;	char *scheme = NULL;	char *scheme_specific = NULL;	char *host_part = NULL;	char *url_path = NULL;	char *raw_url_path = NULL;	char *userinfo = NULL;	char *username = NULL;	char *password = NULL;	char *hostinfo = NULL;	char *hostname = NULL;	char *t = NULL;	int port;	int n;	if (url == (char *) NULL) {		Debug(20, 1, ("url_parse: Somebody gave me a NULL URL!\n"));		return (NULL);	}	urlbuf = xstrdup(url);	/* working copy */	if ((t = strrchr(urlbuf, '\n')) != (char *) NULL)		*t = (char) '\0';	if ((t = strrchr(urlbuf, '\r')) != (char *) NULL)		*t = (char) '\0';	Debug(20, 9, ("url_parse: parsing '%s'\n", url));	if ((t = strchr(urlbuf, ':')) == (char *) NULL) {		Log("url_parse: Invalid URL: %s\n", urlbuf);		xfree(urlbuf);		return NULL;	}	*t = (char) '\0';	scheme = xstrdup(urlbuf);	scheme_specific = xstrdup(t + 1);	*t = (char) ':';	Debug(20, 9, ("url_parse:          scheme = %s\n", scheme));	
Debug(20, 9, ("url_parse: scheme_specific = %s\n", scheme_specific));	up = xmalloc(sizeof(URL));	/* Basic initialization */	Tolower(scheme);	if (!strncmp(scheme, "x-", 2))	/* any x- type */		up->type = URL_X;	else if (!strcmp(scheme, "file"))		up->type = URL_FILE;	else if (!strcmp(scheme, "ftp"))		up->type = URL_FTP;	else if (!strcmp(scheme, "http"))		up->type = URL_HTTP;	else if (!strcmp(scheme, "gopher"))		up->type = URL_GOPHER;	else if (!strcmp(scheme, "news"))		up->type = URL_NEWS;	else if (!strcmp(scheme, "nop"))		up->type = URL_NOP;	else if (!strcmp(scheme, "telnet"))		up->type = URL_TELNET;	else if (!strcmp(scheme, "wais"))		up->type = URL_WAIS;	else if (!strcmp(scheme, "mailto"))		up->type = URL_MAILTO;	else {		Debug(20, 9, ("url_parse: Unknown URL scheme: %s\n", scheme));		if (!strcmp(scheme, "javascript") && !strcmp(scheme, "https"))			Log("url_parse: Unknown URL scheme: %s\n", scheme);		xfree(urlbuf);		xfree(scheme);		xfree(scheme_specific);		xfree(up);		return NULL;	}	/*      Do scheme-specific parsing              */	switch (up->type) {	case URL_FILE:		host_part = xmalloc(strlen(scheme_specific));		url_path = xmalloc(strlen(scheme_specific));		if (strlen(scheme_specific) >= 3 && scheme_specific[2] == '/') {			*host_part = '\0';			n = sscanf(scheme_specific, "//%s", url_path);			if (n < 1) {				strcpy(url_path, "/");				n = 1;			}		} else {			n = sscanf(scheme_specific, "//%[^/]%s", host_part, url_path);		}		if (n < 1 || n > 2) {			Log("url_parse: Invalid URL: %s\n", urlbuf);			xfree(urlbuf);			xfree(scheme);			xfree(scheme_specific);			xfree(host_part);			xfree(url_path);			xfree(up);			return NULL;		}		if (*url_path == '\0')			strcpy(url_path, "/");		break;	case URL_HTTP:	case URL_GOPHER:	case URL_FTP:		host_part = xmalloc(strlen(scheme_specific));		url_path = xmalloc(strlen(scheme_specific));		n = sscanf(scheme_specific, "//%[^/]%s", host_part, url_path);		if (n < 1 || n > 2) {			Log("url_parse: Invalid URL: %s\n", urlbuf);			xfree(urlbuf);			
xfree(scheme);			xfree(scheme_specific);			xfree(host_part);			xfree(url_path);			xfree(up);			return NULL;		}		if (*url_path == '\0')			strcpy(url_path, "/");		break;	case URL_NEWS:	case URL_X:	case URL_NOP:		url_path = scheme_specific;		scheme_specific = (char *) NULL;		break;	default:		Debug(20, 1, ("Harvest does not support %s URLs\n", scheme));		xfree(urlbuf);		xfree(scheme);		xfree(scheme_specific);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -