📄 url_relative.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: url_relative.c,v 2.1 1997/03/21 18:10:11 sxw Exp $";/* This code is taken from libwww-3.0                                   *//*                                                                    HTParse.c *    URI MANAGEMENT * *    (c) COPYRIGHT CERN 1994. *    Please first read the full copyright statement in the file COPYRIGH. * *  history: *    May 12 94       TAB added as legal char in HTCleanTelnetString * *//* *  url_relative.c - Build a full URL from a partial URL and its relative URL * *  Duane Wessels, wessels@cs.colorado.edu, June 1995 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  
These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  
The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <string.h>#include <stdlib.h>#include <stdio.h>#include "util.h"#ifndef NULL#define NULL    0#endif#define TOLOWER(c) (isupper(c) ? tolower(c) : (c))static char *HTSimplify();struct struct_parts {	char *access;		/* Now known as "scheme" */	char *host;	char *absolute;	char *relative;/*      char * search;          no - treated as part of path */	char *anchor;};/*      Scan a filename for its consituents * **   ----------------------------------- * ** * ** On entry, * **   name    points to a document name which may be incomplete. * ** On exit, * **      absolute or relative may be nonzero (but not both). * **   host, anchor and access may be nonzero if they were specified. * **   Any which are nonzero point to zero terminated strings. */static void scan(name, parts)     char *name;     struct struct_parts *parts;{	char *after_access;	char *p;	int length = strlen(name);	parts->access = 0;	parts->host = 0;	parts->absolute = 0;	parts->relative = 0;	parts->anchor = 0;	after_access = name;	for (p = name; *p; p++) {		if (*p == ':') {			*p = 0;			parts->access = after_access;	/* Scheme has been specified */			after_access = p + 1;			if (0 == strcasecmp("URL", parts->access)) {				parts->access = NULL;	/* Ignore IETF's URL: pre-prefix */			} else				break;		}		if (*p == '/')			break;	/* Access has not been specified */		if (*p == '#')			break;	}	for (p = name + length - 1; p >= name; p--) {		if (*p == '#') {			parts->anchor = p + 1;			*p = 0;	/* terminate the rest */		}	}	p = after_access;	if (*p == '/') {		if (p[1] == '/') {			parts->host = p + 2;	/* host has been specified      */			*p = 0;	/* Terminate access             */			p = strchr(parts->host, '/');	/* look for end of host name if any */			if (p) {				*p = 0;		/* 
Terminate host */				parts->absolute = p + 1;	/* Root has been found */			}		} else {			parts->absolute = p + 1;	/* Root found but no host */		}	} else {		parts->relative = (*after_access) ? after_access : 0;	/* zero for "" */	}}/*      Parse a Name relative to another name * **   ------------------------------------- * ** * **   This returns those parts of a name which are given (and requested) * **   substituting bits from the related name where necessary. * ** * ** On entry, * **   aName           A filename given * **      relatedName     A name relative to which aName is to be parsed. Give * **                      it an empty string if aName is absolute. * ** On exit, * **   returns         A pointer to a malloc'd string which MUST BE FREED */char *url_parse_relative(aName, relatedName)     char *aName;     char *relatedName;{	char *result = 0;	char *return_value = 0;	int len;	char *name = 0;	char *rel = 0;	char *p;	char *access;	struct struct_parts given, related;	if (!relatedName)	/* HWL 23/8/94: dont dump due to NULL */		relatedName = "";	/* Make working copies of input strings to cut up:	 */	len = strlen(aName) + strlen(relatedName) + 10;	result = (char *) xmalloc(len);		/* Lots of space: more than enough */	name = xstrdup(aName);	rel = xstrdup(relatedName);	scan(name, &given);	scan(rel, &related);	result[0] = 0;		/* Clear string  */	access = given.access ? given.access : related.access;	if (access) {		strcat(result, access);		strcat(result, ":");	}	if (given.access && related.access)	/* If different, inherit nothing. */		if (strcmp(given.access, related.access) != 0) {			related.host = 0;			related.absolute = 0;			related.relative = 0;			related.anchor = 0;		}	if (given.host || related.host) {		strcat(result, "//");		strcat(result, given.host ? given.host : related.host);	}	if (given.host && related.host)		/* If different hosts, inherit no path. 
*/		if (strcmp(given.host, related.host) != 0) {			related.absolute = 0;			related.relative = 0;			related.anchor = 0;		}	if (given.absolute) {	/* All is given */		strcat(result, "/");		strcat(result, given.absolute);	} else if (related.absolute) {	/* Adopt path not name */		strcat(result, "/");		strcat(result, related.absolute);		if (given.relative) {			p = strchr(result, '?');	/* Search part? */			if (!p)				p = result + strlen(result) - 1;			for (; *p != '/'; p--);		/* last / */			p[1] = 0;	/* Remove filename */			strcat(result, given.relative);		/* Add given one */			result = HTSimplify(result);		}	} else if (given.relative) {		strcat(result, given.relative);		/* what we've got */	} else if (related.relative) {		strcat(result, related.relative);	} else {		/* No inheritance */		strcat(result, "/");	}	if (given.anchor || related.anchor) {		strcat(result, "#");		strcat(result, given.anchor ? given.anchor : related.anchor);	}	xfree(rel);	xfree(name);	return_value = xstrdup(result);	xfree(result);	return return_value;	/* exactly the right length */}/*              Simplify a URI * //           -------------- * // A URI is allowed to contain the seqeunce xxx/../ which may be * // replaced by "" , and the seqeunce "/./" which may be replaced by "/". * // Simplification helps us recognize duplicate URIs. * // * //   Thus,   /etc/junk/../fred       becomes /etc/fred * //           /etc/junk/./fred        becomes /etc/junk/fred * // * //      but we should NOT change * //           http://fred.xxx.edu/../.. * // * //   or      ../../albert.html * // * // In the same manner, the following prefixed are preserved: * // * //   ./<etc> * //   //<etc> * // * // In order to avoid empty URLs the following URLs become: * // * //           /fred/..                becomes /fred/.. * //           /fred/././..            becomes /fred/.. * //           /fred/.././junk/.././   becomes /fred/.. 
* // * // If more than one set of `://' is found (several proxies in cascade) then * // only the part after the last `://' is simplified. * // * // Returns: A string which might be the old one or a new one. */static char *HTSimplify(filename)     char *filename;{	char *path;	char *p;	if (!filename) {		return filename;	}	if ((path = strstr(filename, "://")) != NULL) {		/* Find host name */		char *newptr;		path += 3;		while ((newptr = strstr(path, "://")) != NULL)			path = newptr + 3;	} else if ((path = strstr(filename, ":/")) != NULL) {		path += 2;	} else		path = filename;	if (*path == '/' && *(path + 1) == '/') {	/* Some URLs start //<foo> */		path += 1;	} else if (!strncmp(path, "news:", 5)) {		char *ptr = strchr(path + 5, '@');		if (!ptr)			ptr = path + 5;		while (*ptr) {	/* Make group or host lower case */			*ptr = TOLOWER(*ptr);			ptr++;		}		return filename;	/* Doesn't need to do any more */	}	if ((p = path)) {		int segments = 0;		/* Parse string first time to find number of `real' tokens */		while (*p) {			if (*p == '/' || p == path) {				if (!((*(p + 1) == '/' || !*(p + 1)) ||					(*(p + 1) == '.' && (*(p + 2) == '/' || !*(p + 2))) ||					(*(p + 1) == '.' && *(p + 2) == '.' && (*(p + 3) == '/' || !*(p + 3)))))					segments++;			}			p++;		}		/* Parse string second time to simplify */		p = path;		while (*p) {			if (*p == '/') {				if (p > path && *(p + 1) == '.' && (*(p + 2) == '/' || !*(p + 2))) {					char *orig = p, *dest = p + 2;					while ((*orig++ = *dest++));	/* Remove a slash and a dot */					p--;				} else if (segments > 1 && *(p + 1) == '.' && *(p + 2) == '.' &&				    (*(p + 3) == '/' || !*(p + 3))) {					char *q = p;					while (q > path && *--q != '/');	/* prev slash */					if (strncmp(q, "/../", 4) && strncmp(q, "/./", 3) &&					    strncmp(q, "./", 2)) {						char *orig = q, *dest = p + 3;						if (*q != '/')							dest++;						while ((*orig++ = *dest++));	/* Remove /xxx/.. 
*/						segments--;						p = q - 1;	/* Start again with prev slash */					} else						p++;				} else if (*(p + 1) == '/') {					while (*(p + 1) == '/') {						char *orig = p, *dest = p + 1;						while ((*orig++ = *dest++));	/* Remove multiple /'s */					}				}			}			p++;		}		/* end while (*p) */	}	return filename;}#ifdef MAKE_MAINmain(argc, argv)     int argc;     char *argv[];{	char *url;	url = url_parse_relative(argv[1], argv[2]);	printf("%s\n", url);}#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -