⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlurls.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: HTMLurls.c,v 2.4 2000/01/21 17:37:33 sxw Exp $";/* *  HTMLurls - Prints all of the URLs from an HTML file. * *  Uses code from NCSA Mosaic (version 2.2) libhtmlw. * *  Usage: HTMLurls filename * *  DEBUG: none *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. 
Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. 
* *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include "HTML.h"#include "util.h"#include "url.h"/* Local Variables */static int RobotsIndex=1;static int RobotsFollow=1;static Buffer *urls = NULL;static char *base = NULL;#if 0 /* kjl/7mar2002 *//* Global */char *Url = NULL;#endifstatic void usage(){    fprintf(stderr, "Usage: HTMLurls [--base-url url] filename\n");    exit(1);}/* *  strstr_icase - Looks for string b in string a.  Case insenstive cmps. 
*/char *strstr_icase(a, b)     char *a, *b;{    int asz = strlen(a), bsz = strlen(b);    static char *p;    p = a;    while (asz >= bsz) {	if (!strncasecmp(p, b, bsz))	    return (p);	p++;	asz--;    }    return (NULL);}void process_metarobots(s)     char *s;{    char *p, *q, *tmps, *v;    int all;    /* Find the NAME in the META */    if ((tmps = strstr_icase(s, "name")) == NULL)      return;    /* Grab the NAME from the NAME = tag */    if ((p = strchr(tmps, '=')) == NULL)      return;    p++;                    /* skip '=' */    while (isspace(*p) || (*p == '\"'))      p++;                /* skip space '"'s */    q = xstrdup(p);          /* copy name */    if ((p = strchr(q, '\"')) != NULL)      /* terminate string */      *p = '\0';    if ((p = strchr(q, ' ')) != NULL)       /* terminate string */      *p = '\0';    /* printf("NAME: %s\n",q); */    if (strcasecmp("ROBOTS",q)!=0) {        /* check for robots string */      xfree(q);      return;    }    xfree(q);                               /* finished with that bit */    if ((tmps = strstr_icase(s,"content")) == NULL) /* Find content bit */      return;    if ((p = strchr(tmps, '=')) == NULL)    /* Find equals */      return;    p++;                                    /* skip '=' */    while (isspace(*p) || (*p == '\"'))      p++;                                  /* skip space '"'s */    q = xstrdup(p);                         /* copy name */    if ((p = strchr(q, '\"')) != NULL)      /* terminate string */      *p = '\0';    /* printf("CONTENT: %s\n",q); */    /* Okay - q should now contain the robots meta tag configuration string     * This should be comma seperated - but we'll allow them to use spaces     * just to be nice.     
*/    all=0;    v = strtok(q,", ");    while (v!=NULL) {      /* printf("TAG: %s\n",v); */      if (strcasecmp(v,"all")==0) {	all=1;      } else if (strcasecmp(v,"none")==0) {	RobotsFollow=0;	RobotsIndex=0;      } else if (strcasecmp(v,"index")==0) {	RobotsIndex=1;      } else if (strcasecmp(v,"noindex")==0) {	RobotsIndex=0;      } else if (strcasecmp(v,"follow")==0) {	RobotsFollow=1;      } else if (strcasecmp(v,"nofollow")==0) {	RobotsFollow=0;      }      v = strtok(NULL,", ");    }    /* All overrides everything else */    if (all==1) {      RobotsFollow=1;      RobotsIndex=1;    }    /*    printf("INDEX: %s\n",RobotsIndex?"Yes":"No");    printf("FOLLOW: %s\n",RobotsFollow?"Yes":"No");    */}/* * process_metapull - extract the URL from the META tag for client pull * <META HTTP-EQUIV="REFRESH" CONTENT="<secs>; URL=URL">	Netscape 1.1 * <META HTTP-EQUIV="REFRESH" CONTENT="<secs>,URI">		W3C, HTML 4.0 (REC-html40-19980424) * HS, 20-Oct-1997, 03-May-1998 */void process_metapull(s)     char *s;{    char *p, *q, *tmps, *v;    /* Find the HTTP-EQUIV in the META */    if ((tmps = strstr_icase(s, "http-equiv")) == NULL)      return;    /* Skip over the HTTP-EQUIV from the HTTP-EQUIV = tag */    if ((p = strchr(tmps, '=')) == NULL)      return;    p++;                    /* skip '=' */    while (isspace(*p) || (*p == '\"'))      p++;                /* skip space '"'s */    q = xstrdup(p);          /* copy name */    if ((p = strchr(q, '\"')) != NULL)      /* terminate string */      *p = '\0';    if ((p = strchr(q, ' ')) != NULL)       /* terminate string */      *p = '\0';    /* printf("HTTP-EQUIV: %s\n",q); */    if (strcasecmp("REFRESH",q)!=0) {        /* check for 'REFRESH' */      xfree(q);      return;    }    xfree(q);                               /* finished with that bit */    if ((tmps = strstr_icase(s, "content")) == NULL) /* Find content bit */      return;    if ((p = strchr(tmps, '=')) == NULL)    /* Find equals */      return;    if ((p = strpbrk(p, ",;")) 
== NULL)     /* There is no URL in this refresh, not interesting */      return;    p++;				    /* Skip delimiter, ';' or ',' */    while (isspace(*p)) p++;                /* skip spaces */    /* p is now first char of something, it may be 'URL=' or the URL itself */    q = p;	/* save p for later */    /* If there is URL=, skip it */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -