⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rfc1738.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: rfc1738.c,v 2.2 1997/06/12 22:26:27 sxw Exp $";/* *  rfc1738.c - code to comply with RFC 1738  * *  DEBUG: none *  AUTHOR: Harvest derived * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. *  *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. *   *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. *   *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. 
*          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. 
*   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "util.h"#define BIG_BUFSIZ (BUFSIZ * 8)/*   *  RFC 1738 defines that these characters should be escaped, as well *  any non-US-ASCII character or anything between 0x00 - 0x1F. */static char rfc1738_unsafe_chars[] ={    (char) 0x3C,		/* < */    (char) 0x3E,		/* > */    (char) 0x22,		/* " */    (char) 0x23,		/* # */    (char) 0x25,		/* % */    (char) 0x7B,		/* { */    (char) 0x7D,		/* } */    (char) 0x7C,		/* | */    (char) 0x5C,		/* \ */    (char) 0x5E,		/* ^ */    (char) 0x7E,		/* ~ */    (char) 0x5B,		/* [ */    (char) 0x5D,		/* ] */    (char) 0x60,		/* ` */    (char) 0x27,		/* ' */    (char) 0x20			/* space */};/* *  rfc1738_escape - Returns a static buffer contains the RFC 1738  *  compliant, escaped version of the given url. 
*/char *rfc1738_escape(url)     char *url;{	static char buf[BIG_BUFSIZ];	char *p, *q;	int i, do_escape;	for (p = url, q = &buf[0]; *p != '\0' && q<&buf[0]+BIG_BUFSIZ-1; 	     p++, q++) {		do_escape = 0;		/* RFC 1738 defines these chars as unsafe */		for (i = 0; i < sizeof(rfc1738_unsafe_chars); i++) {			if (*p == rfc1738_unsafe_chars[i]) {				do_escape = 1;				break;			}		}		/* RFC 1738 says any control chars (0x00-0x1F) are encoded */		if (*p <= (char) 0x1F) {			do_escape = 1;		}		/* RFC 1738 says 0x7f is encoded */		if (*p == (char) 0x7F) {			do_escape = 1;		}		/* RFC 1738 says any non-US-ASCII are encoded */		if ((*p >= (char) 0x80) && (*p <= (char) 0xFF)) {			do_escape = 1;		}		/* Do the triplet encoding, or just copy the char */		if (do_escape == 1) {			(void) sprintf(q, "%%%02x", (unsigned char) *p);			q += sizeof(char) * 2;		} else {			*q = *p;		}	}	*q = '\0';	return (buf);}/* *  rfc1738_unescape() - Converts escaped characters (%xy numbers) in  *  given the string.  %% is a %. %ab is the 8-bit hexadecimal number "ab" *  Adapted to cope with single %s and with invalid %ab's */void rfc1738_unescape(s)     char *s;{	char hexnum[3];	int i, j;		/* i is write, j is read */	unsigned int x;	for (i = j = 0; s[j]; i++, j++) {		s[i] = s[j];		if (s[i] == '%' && s[j+1]!='\0' && s[j+2]!='\0' &&		    isxdigit(s[j+1]) && isxdigit(s[j+2])) {			hexnum[0] = s[++j];			if (hexnum[0] != '%') {				hexnum[1] = s[++j];				hexnum[2] = '\0';				sscanf(hexnum, "%x", &x);				s[i] = (char) (0x0ff & x);			} else {				s[i] = '%';			}		}	}	s[i] = '\0';}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -