⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
		xfree(host_part);		xfree(url_path);		xfree(up);		return NULL;		break;	}	xfree(urlbuf);	Debug(20, 9, ("url_parse:       host_part = %s\n", host_part ? host_part : "(none)"));	Debug(20, 9, ("url_parse:        url_path = %s\n", url_path));	if (host_part == (char *) NULL)		goto finish_host_part;	/* ---  HOST PART PARSING  ---	 * **	 * ** In general, the host part may look like:	 * **	 * **   [username[:password]@]hostname[:port]	 */	/* --- First, separate host_part into userinfo and hostinfo --- */	if ((t = strchr(host_part, '@')) != (char *) NULL) {		*t = (char) '\0';		userinfo = xstrdup(host_part);		hostinfo = xstrdup(t + 1);		*t = (char) '@';	} else {		hostinfo = host_part;		host_part = (char *) NULL;	}	/* --- separate userinfo into username and password --- */	if (userinfo) {		if ((t = strchr(userinfo, ':')) != (char *) NULL) {			*t = (char) '\0';			username = xstrdup(userinfo);			password = xstrdup(t + 1);			*t = (char) ':';		} else {			username = userinfo;			userinfo = (char *) NULL;			password = (char *) NULL;		}		if (username)			rfc1738_unescape(username);		if (password)			rfc1738_unescape(password);	}	/* --- separate hostinfo into hostname and port --- */	port = url_table[up->type].port;	if ((t = strchr(hostinfo, ':')) != (char *) NULL) {		*t = (char) '\0';		hostname = xstrdup(hostinfo);		port = atoi(t + 1);		*t = (char) ':';	} else {		hostname = hostinfo;		hostinfo = (char *) NULL;	}	Tolower(hostname);      finish_host_part:	Debug(20, 9, ("url_parse:        username = %s\n", username ? username : "(none)"));	Debug(20, 9, ("url_parse:        password = %s\n", password ? password : "(none)"));	Debug(20, 9, ("url_parse:        hostname = %s\n", hostname ? 
hostname : "(none)"));	Debug(20, 9, ("url_parse:            port = %d\n", port));	/* ---  URL-PATH PART PARSING  --- */	/* Remove HTML Bookmarks */	if (up->type == URL_HTTP) {		if ((t = strchr(url_path, '#')) != (char *) NULL)			*t = '\0';	}	switch (up->type) {	case URL_HTTP:	case URL_FTP:	case URL_GOPHER:	case URL_FILE:		remove_dot(url_path);		remove_dotdot(url_path);		break;	default:		break;	}	raw_url_path = xstrdup(url_path);	rfc1738_unescape(url_path);	/* Conform to RFC 1738 if needed */	if (liburl_conform_rfc1738) {		char *x = raw_url_path;		/* use unescaped pathname for the escape */		rfc1738_unescape(x);		raw_url_path = xstrdup(rfc1738_escape(x));		xfree(x);	}	Debug(20, 9, ("url_parse:        url_path = %s\n", url_path));	Debug(20, 9, ("url_parse:    raw_url_path = %s\n", raw_url_path));	/* Write the URL */	urlbuf = xmalloc(BUFSIZ);	buf = xmalloc(BUFSIZ);	/*      Note: Here we write the username and password into	 * **   the URL string.  So, if the user specifies user:pw	 * **   in a rootnode URL, it gets passed all throughout	 * **   the plumbing and will be visible in the query results,	 * **   etc.  To specifiy ``hidden'' password info, use the	 * **   HTTP-Basic-Auth and FTP-Auth lines in the Gatherer	 * **   config file.    
-DW                                      */	sprintf(buf, "%s:", scheme);	strcat(urlbuf, buf);	if (hostname) {		strcat(urlbuf, "//");		if (username)			strcat(urlbuf, username);		if (password) {			sprintf(buf, ":%s", password);			strcat(urlbuf, buf);		}		if (username || password)			strcat(urlbuf, "@");		strcat(urlbuf, hostname);		if (port != url_table[up->type].port) {			sprintf(buf, ":%d", port);			strcat(urlbuf, buf);		}	}	strcat(urlbuf, raw_url_path);	up->url = xstrdup(urlbuf);	Debug(20, 9, ("url_parse:             url = %s\n", up->url));	up->port = port;	up->flags = 0;	if (hostname)		up->host = xstrdup(hostname);	if (username)		up->user = xstrdup(username);	if (password)		up->password = xstrdup(password);	if (url_path)		up->pathname = xstrdup(url_path);	if (raw_url_path)		up->raw_pathname = xstrdup(raw_url_path);	if (username || password)		URL_FLAG_SET(up->flags, URL_FLAG_PASS_USERINFO);	xfree(urlbuf);	xfree(buf);	xfree(scheme);	xfree(scheme_specific);	xfree(host_part);	xfree(url_path);	xfree(raw_url_path);	xfree(userinfo);	xfree(username);	xfree(password);	xfree(hostinfo);	xfree(hostname);	return (up);}/* *  remove_dot - Removes /./ portions of the string. 
/*
 *  remove_dot - Squeezes every "/./" sequence in pathname down to "/".
 *  The string is edited in place; nothing is allocated.
 */
static void remove_dot(pathname)
     char *pathname;
{
	char *hit;
	char *tail;

	for (hit = strstr(pathname, "/./"); hit != NULL;
	    hit = strstr(pathname, "/./")) {
		/* slide the remainder (and its terminating null) over "./" */
		tail = hit + 3;
#ifdef HAVE_MEMMOVE
		memmove(hit + 1, tail, strlen(tail) + 1);
#else
		bcopy(tail, hit + 1, strlen(tail) + 1);
#endif
	}
}

/*
 *  remove_dotdot - Collapses "<component>/../" sequences in pathname.
 *  The string is edited in place.  Processing stops, leaving the rest
 *  of the string untouched, as soon as a "/../" is found that has no
 *  preceding '/' to collapse against (including a "/../" at the very
 *  start of the string).
 */
static void remove_dotdot(pathname)
     char *pathname;
{
	char *dots;
	char *slash;
	char *rest;

	for (;;) {
		dots = strstr(pathname, "/../");
		if (dots == NULL)
			return;		/* nothing left to normalize */
		if (dots == pathname)
			return;		/* no component in front of it */

		/* walk back to the '/' that opens the previous component */
		slash = dots - 1;
		while (slash != pathname && *slash != '/')
			slash--;
		if (*slash != '/')
			return;		/* ran into the start without a '/' */

		/* drop "<component>/../", keeping the '/' at slash */
		rest = dots + 4;
#ifdef HAVE_MEMMOVE
		memmove(slash + 1, rest, strlen(rest) + 1);
#else
		bcopy(rest, slash + 1, strlen(rest) + 1);
#endif
	}
}

#ifdef USE_CCACHE
/*
 *  url_initCache() - Sets up the ftp connection cache.
 *
 *  Fills a freshly allocated InitConfigRec with the maximum number of
 *  connections and the timeout, then hands it to SockInit().  The
 *  record is not released here.
 *  NOTE(review): presumably SockInit() keeps the record -- confirm.
 */
void url_initCache(maxConnect, timeout)
     int maxConnect;
     long timeout;
{
	InitConfigRec *cfg = (InitConfigRec *) xmalloc(sizeof(InitConfigRec));

	if (cfg != NULL) {
		cfg->maxConnections = maxConnect;
		cfg->timeOut = timeout;
		SockInit(cfg);
	}
}

/*
 *  url_shutdowncache() - Tears the ftp connection cache down by
 *  calling ShutDownCache().
 */
void url_shutdowncache()
{
	ShutDownCache();
}
#endif /* USE_CCACHE */

#define safe_strdup(s)	(s) == NULL ? 
NULL : xstrdup(s)URL *dup_url(up)     URL *up;{	static URL *newup;	newup = xmalloc(sizeof(URL));	newup->url = safe_strdup(up->url);	newup->raw_pathname = safe_strdup(up->raw_pathname);	newup->pathname = safe_strdup(up->pathname);	newup->host = safe_strdup(up->host);	newup->user = safe_strdup(up->user);	newup->password = safe_strdup(up->password);	newup->filename = safe_strdup(up->filename);	newup->flags = up->flags;#ifdef USE_MD5	newup->md5 = safe_strdup(up->md5);#endif	newup->fp = NULL;	/* can't copy */	newup->port = up->port;	newup->type = up->type;	newup->gophertype = up->gophertype;	newup->lmt = up->lmt;	newup->http_status_code = up->http_status_code;	newup->http_version = safe_strdup(up->http_version);	newup->http_reason_line = safe_strdup(up->http_reason_line);	newup->http_mime_hdr = safe_strdup(up->http_mime_hdr);	return (newup);}#undef safe_strdup#ifdef OLD_CODE/* *  compare_fullhost() - compares the two hosts. Returns 0 on match; *  non-zero otherwise. */static int compare_fullhost(host1, host2)     char *host1;     char *host2;{	char *s, *s1, *s2;	int r;	if ((s = getrealhost(host1)) == NULL)		return (1);	s1 = xstrdup(s);	xfree(s);	if ((s = getrealhost(host2)) == NULL) {		xfree(s1);		return (1);	}	s2 = xstrdup(s);	xfree(s);	r = strcmp(s1, s2);	xfree(s1);	xfree(s2);	return (r);}#endifstatic void get_lmt(up)     URL *up;{	struct stat sb;	if (stat(up->filename, &sb) < 0) {		log_errno(up->filename);		return;	}	up->lmt = sb.st_ctime;	if (up->http_mime_hdr != NULL) {		char *tbuf, *p, *q;		tbuf = xstrdup(up->http_mime_hdr);		p = strtok(tbuf, "\n");		while (p != NULL) {			if (!strncasecmp(p, "Last-Modified:",				strlen("Last-Modified:"))) {				q = p + strlen("Last-Modified:") + 1;				up->lmt = parse_rfc850(q);			}			p = strtok(NULL, "\n");		}		xfree(tbuf);	}	Debug(20, 1, ("get_lmt: %ld %s\n", up->lmt, up->url));}void print_url(up)     URL *up;{	Log("\n--------------\n");	Log("URL url      : %s\n", up->url);	Log("URL Type     : %d\n", up->type);	Log("URL RPathname: 
%s\n", up->raw_pathname);	Log("URL Pathname : %s\n", up->pathname);	Log("URL Host     : %s\n", up->host);	Log("URL Port     : %d\n", up->port);	Log("URL User     : %s\n", up->user);	Log("URL Password : %s\n", up->password);	Log("URL G Type   : %d\n", up->gophertype);	Log("URL Filename : %s\n", up->filename);	Log("URL LUpdate  : %d\n", up->lmt);#ifdef USE_MD5	Log("URL MD5      : %s\n", up->md5);#endif	Log("URL HTTP Ver : %s\n", up->http_version);	Log("URL HTTP Code: %d\n", up->http_status_code);	Log("URL HTTP RLin: %s\n", up->http_reason_line);	Log("URL HTTP MIME: %s\n", up->http_mime_hdr);	Log("--------------\n");}/* *  url_confirm() - quickly checks to see if the URL is ok. *  Returns 0 if it is ok; otherwise, returns non-zero. */int url_confirm(up)     URL *up;{	char *tmp = NULL;	switch (up->type) {	case URL_HTTP:	case URL_GOPHER:	case URL_FTP:		if ((tmp = getrealhost(up->host)) == NULL) {			errorlog("%s: Host unknown.\n", up->host);			return (1);		}		break;	default:		break;	}	if (tmp)		xfree(tmp);	return (0);}/* *  shsafe_path - Escapes characters to use text inside "'s for sh. */static char *shsafe_path(s)     char *s;{	static char buf[BIG_BUFSIZ];	char *p, *q;	for (p = s, q = &buf[0]; *p != '\0'; p++, q++) {		if ((*p == '\"') || (*p == '\\') || (*p == '$')) {			*q++ = '\\';	/* escape */			*q = *p;		} else {			*q = *p;		}	}	*q = '\0';	return (xstrdup(buf));}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -