⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lsm2soif.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "lsm2soif.c,v 1.20 1996/01/05 20:28:19 duane Exp";/* *  lsm2soif - Converts Linux Software Maps (lsm) to SOIF. * *  Usage: lsm2soif url local-file * *  Darren Hardy, hardy@cs.colorado.edu, June 1994 *  Updated for new IAFA-like Aug94 LSM format, April 1995 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   
*   */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "util.h"#include "url.h"#include "template.h"#define LSM_DTYPE "Linux-Software"/* Local functions */static void do_lsmtosoif();/* Local variables */static int n_flag = 0;static void usage(){	fprintf(stderr, "Usage: lsm2soif url local-file\n");	exit(1);}static void do_lsmtosoif(url, filename)     char *url;     char *filename;{	char buf[BUFSIZ], attr[BUFSIZ], value[BUFSIZ], pattr[BUFSIZ];	char *sv, *pv, *fv, *s, *p;	int i, mode_aug94 = 1, mode_old = 0;	Template *t;	FILE *fp;	URL *up;	Buffer *val;	AVList *walker;	if ((up = url_open(url)) == NULL) {		errorlog("Cannot open URL: %s\n", url);		return;	}	/* Build the template */	t = create_template(NULL, up->url);	/* Read the file and build a SOIF template from it */	if ((fp = fopen(filename, "r")) == NULL) {		log_errno(filename);		url_close(up);		return;	}	val = create_buffer(BUFSIZ);	pattr[0] = '\0';	while (fgets(buf, BUFSIZ, fp)) {		/* strip trailing newline */		if ((s = strrchr(buf, '\n')) != NULL)			*s = '\0';		/* check for Begin/End tags */		if (!strcmp(buf, "End"))			break;		if (!strcmp(buf, "Begin3")) {			mode_aug94 = 1;			mode_old = 0;			continue;		} else if (!strcmp(buf, "Begin")) {			mode_old = 1;			mode_aug94 = 0;			continue;		} else if (!strcmp(buf, "Begin2")) {			mode_old = 1;			mode_aug94 = 0;			continue;		}		if (mode_old) {	/* very old-style */			if ((s = strchr(buf, '=')) == NULL)				continue;	/* not an old-style LSM line */			for (p = buf, i = 0; p < s && !isspace(*p); p++, i++)				attr[i] = *p;			attr[i] = '\0';			if (i < 1)				continue;	/* null attribute */			if (isdigit(attr[--i]))				attr[i] = '\0';		/* strip attribute number */			while (*s != '\0' && (*s == '=' || isspace(*s)))				s++;			if (!strcmp(attr, "Site") ||			    !strcmp(attr, "Path") ||			    !strcmp(attr, "File")) {				if ((p = strchr(s, ' ')) != NULL)					*p = '\0';				if ((p = strchr(s, '\t')) != NULL)					*p = '\0';			}			if (strlen(s) < 1)	/* empty line */				
continue;			strcpy(value, s);			if (t->list)				append_AVList(t->list, attr, value,				    strlen(value));			else				t->list = create_AVList(attr, value,				    strlen(value));		}		if (!mode_aug94 || buf[0] == '\0')			continue;		/* current aug94 format */		strncat(buf, "\n", 1);	/* replace newline */		/*		 *  This is a simple state machine.  Either the		 *  line contains an attribute, or the line		 *  contains data associated with the previous attr.		 */		if (!isspace(buf[0]) && sscanf(buf, "%[A-Za-z-]:", attr) == 1) {			if (pattr[0] == '\0')				strcpy(pattr, attr);			memset(buf, ' ', strlen(attr) + 1);	/* erase attr */		}		/* See if we've switch attributes, if so purge */		if (strcmp(pattr, attr) != 0) {			if (t->list)				append_AVList(t->list, pattr, val->data,				    val->length);			else				t->list = create_AVList(pattr, val->data,				    val->length);			shrink_buffer(val);		}		for (p = buf; *p && isspace(*p); p++)			/* skip spaces */ ;		add_buffer(val, p, strlen(p));		(void) strcpy(pattr, attr);		memset(buf, '\0', sizeof(buf));	}	fclose(fp);	if (mode_aug94 && pattr[0]) {		if (t->list)			append_AVList(t->list, pattr, val->data, val->length);		else			t->list = create_AVList(pattr, val->data, val->length);	}	free_buffer(val);	if (mode_old) {		AVPair *site_avp, *path_avp, *file_avp;		/* Reset t->url to the file that the LSM points to */		site_avp = extract_AVPair(t->list, "Site");		if (site_avp == NULL)			site_avp = extract_AVPair(t->list, "Maintained-At");		if (site_avp == NULL)			site_avp = extract_AVPair(t->list, "MaintAt");		path_avp = extract_AVPair(t->list, "Path");		if (path_avp == NULL)			path_avp = extract_AVPair(t->list, "PathFile");		file_avp = extract_AVPair(t->list, "File");		if (file_avp == NULL)			file_avp = extract_AVPair(t->list, "Package-Name");		if (file_avp == NULL)			file_avp = extract_AVPair(t->list, "PkgName");		if (file_avp == NULL)			file_avp = extract_AVPair(t->list, "PathFile");		if (site_avp) {			sv = strdup(site_avp->value);			pv = 
strdup(path_avp ? path_avp->value : "/???/");			fv = strdup(file_avp ? file_avp->value : "???");			for (p = sv; *p && !isspace(*p); p++);			*p = '\0';			for (p = pv; *p && !isspace(*p); p++);			*p = '\0';			for (p = fv; *p && !isspace(*p); p++);			*p = '\0';			if (*pv == '/' && *fv == '/')				sprintf(buf, "ftp://%s%s%s", sv, pv, fv);			else if (*pv == '/' && *fv != '/')				sprintf(buf, "ftp://%s%s/%s", sv, pv, fv);			else if (*pv != '/' && *fv == '/')				sprintf(buf, "ftp://%s/%s%s", sv, pv, fv);			else				sprintf(buf, "ftp://%s/%s/%s", sv, pv, fv);			xfree(t->url);			t->url = strdup(buf);			xfree(sv);			xfree(pv);			xfree(fv);		}	} else if (mode_aug94) {		AVPair *pavp, *aavp, *oavp, *avp;		/* Reset t->url to the file that the LSM points to */		pavp = extract_AVPair(t->list, "Primary-Site");		aavp = extract_AVPair(t->list, "Alternate-Site");		oavp = extract_AVPair(t->list, "Original-Site");		avp = pavp ? pavp : (aavp ? aavp : (oavp ? oavp : NULL));		if (avp) {			char stuff[3][BUFSIZ];			if (sscanf(avp->value, "%s %s\n%[^\n]\n",				stuff[0],	/* site */				stuff[1],	/* base directory */				stuff[2])	/* size + filename */			    == 3) {				sv = strdup(stuff[0]);				pv = strdup(stuff[1]);				for (p = sv; *p && !isspace(*p); p++);				*p = '\0';				for (p = pv; *p && !isspace(*p); p++);				*p = '\0';				/* fv is last segment */				for (p = stuff[2] + strlen(stuff[2]); p > stuff[2]; p--)					if (isspace(*p)) {						p++;						break;					}				fv = strdup(p);				if (strchr(fv, '/') != NULL) {	/* fv has full path */					if (*fv == '/')						sprintf(buf, "ftp://%s%s", sv, fv);					else						sprintf(buf, "ftp://%s/%s", sv, fv);				} else if (*pv == '/')					sprintf(buf, "ftp://%s%s/%s", sv, pv, fv);				else					sprintf(buf, "ftp://%s/%s/%s", sv, pv, fv);				xfree(t->url);				t->url = strdup(buf);				if (sv)					xfree(sv);				if (pv)					xfree(pv);				if (fv)					xfree(fv);			}		}	}	/* verify attributes in the template */	for (walker = t->list; walker; walker = walker->next) {		
if ((p = strchr(walker->data->attribute, ':')) != NULL) {			strcpy(buf, ++p);			strcpy(walker->data->attribute, buf);		}		/* Make Desc lines Description lines */		if (!strcmp(walker->data->attribute, "Desc")) {			xfree(walker->data->attribute);			walker->data->attribute = strdup("Description");		}	}	if (t->list)		append_AVList(t->list, "Type", LSM_DTYPE, strlen(LSM_DTYPE));	else		t->list = create_AVList("Type", LSM_DTYPE, strlen(LSM_DTYPE));	/* Print out the template */	(void) init_print_template(stdout);	print_template(t);	finish_print_template();	free_template(t);	url_close(up);}int main(argc, argv)     int argc;     char *argv[];{	char *url, *filename;	if (argc != 3)		usage();	url = strdup(argv[1]);	filename = strdup(argv[2]);	init_log(stderr, stderr);	init_url();	do_lsmtosoif(url, filename);	finish_url();	exit(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -