📄 parser.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: parser.c,v 2.1 1997/03/21 17:20:05 sxw Exp $";/* *  parser.c -- Broker * *  parse input from Gatherer and perform operations using collector utils. *  parser for the Collector<->Gatherer protocol * *  DEBUG: section  72, level 1         Broker SOIF parsing routines *  AUTHOR: Harvest derived (William G. Camargo, Darren R. Hardy) * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. *  *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. *   *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. *   *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. 
Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. 
*   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include "broker.h"#include "log.h"#define SP_SIZE BUFSIZchar space[SP_SIZE];extern int recv_nobjs;extern GathererID *COL_gid;/* parse an input file. 
*/int P_parse_input(tfile, type)     char *tfile;     int type;{	int err = SUCCESS;	int Mode = NO_MODE;	FILE *InFile = NULL;	Mode = NO_MODE;	InFile = NULL;	Debug(72, 1, ("P_parse_input: starting with type %d\n", type));	/* The tfile is really a FILE * to 'gather' if a Gatherer */	if (type < BAFULL_U) {		InFile = (FILE *) tfile;	} else if ((InFile = fopen(tfile, "r")) == NULL) {		errorlog("Parser: Cannot read %s\n", tfile);		return ERROR;	}	while (err == SUCCESS) {		err = P_parse_command(InFile, Mode);	}	if (type < BAFULL_U) {		COL_Close_Read_Pipe(InFile);	} else {		(void) fclose(InFile);		if (unlink(tfile) < 0) {			errorlog("Parser:  Cannot remove %s\n", tfile);			log_errno(tfile);			xfree(tfile);			return ERROR;		}		xfree(tfile);	}	return (err);}/* Do the commands: update, delete or refresh */int P_parse_command(InFile, Mode)     FILE *InFile;     int Mode;{	char *command;	int nextc, n = 0;	Debug(72, 1, ("P_parse_command: starting with Mode %d\n", Mode));	nextc = P_get_next_char(InFile);	if ((nextc == EOF) || (nextc != '@'))		return ERROR;	command = space;	command[0] = '\0';	if (fgets(command, SP_SIZE, InFile) == NULL) {		errorlog("P_parse_command: Cannot read command.\n");		return ERROR;	}	if (strncmp(command, "DELETE", 6) == 0) {		if (strchr(command, '}') != NULL)			return SUCCESS;		/* nop */		Mode = DEL_MODE;	} else if (strncmp(command, "UPDATE", 6) == 0) {		if (strchr(command, '}') != NULL)			return SUCCESS;		/* nop */		Mode = UPD_MODE;	} else if (strncmp(command, "REFRESH", 7) == 0) {		if (strchr(command, '}') != NULL)			return SUCCESS;		/* nop */		Mode = REF_MODE;	} else {		errorlog("Parser: P_parse_command: Cannot determine next command: %s\n", command);		return ERROR;	}	init_parse_template_file(InFile);	while (P_parse_object(Mode) == SUCCESS) {		/* every 100 objects, give status */		if (recv_nobjs > 0 && recv_nobjs % 250 == 0) {			Log("Received %d objects so far...\n", recv_nobjs);		}		/* check for pending connections */		if ((n++ & 0x1F) == 0) {			(void) 
select_loop(0, 0, 0);		}	}	finish_parse_template();	return SUCCESS;}/* update/delete/refresh an object */int P_parse_object(Mode)     int Mode;{	reg_t *new_r;	Template *template;	AVList *walker;	FILE *OutFile = NULL;	extern time_t max_update_time;	/* 	 *  Read the next template from the input.  If the parser returns NULL,	 *  then we check to see if we've reached the end of the file.  If	 *  So we stop the parsing by returning ERROR; otherwise we continue	 *  trying to parse by running SUCCESS.	 */	if ((template = parse_template()) == NULL)		return (is_parse_end_of_input()? ERROR : SUCCESS);	Debug(72, 1, ("P_parse_object: received object: %s\n", template->url));	recv_nobjs++;	/* Set up the new reg_t record; and add the URL to it */	new_r = (reg_t *) xmalloc(sizeof(reg_t));	memset(new_r, '\0', sizeof(reg_t));	/* null entire record */	new_r->url = xstrdup(template->url);	/* Save URL in reg ent */	new_r->urls = strlen(new_r->url);	new_r->GID = -1;	/* Find a file to which to write the template */	if (Mode == UPD_MODE) {		if ((OutFile = COL_UPD_Obj_begin(new_r)) == NULL) {			Log("WARNING: Cannot initialize update: %s (FD %d).\n",			    new_r->url, new_r->FD);			free_template(template);			RG_Free_Entry(new_r);			return ERROR;		}	} else {		OutFile = NULL;	}	/* 	 *  Walk the attribute-value list of the template, and save	 *  away the needed reg_t values into new_r.  Also, normalize	 *  all attribute names.	 */	COL_gid = (GathererID *) xmalloc(sizeof(GathererID));	memset(COL_gid, '\0', sizeof(GathererID));	COL_gid->GID = -1;	for (walker = template->list; walker; walker = walker->next) {		(void) COL_Normalize_Name(walker->data->attribute);		(void) COL_Save_Att(walker->data, new_r);	}	/* pick off max update time for logging in LASTUPDATE */	max_update_time = new_r->update_time > max_update_time ?	    
new_r->update_time : max_update_time;	/* Now write the template to the file in the database, if needed */	if (OutFile != NULL) {		(void) init_print_template(OutFile);		print_template(template);		finish_print_template();		(void) fclose(OutFile);		OutFile = NULL;	}	free_template(template);	/* Don't need anymore */	/* Verify/correct the reg_t record */	if (COL_Fill_Entry(new_r) == ERROR) {		/* backout of the changes */		if (Mode == UPD_MODE)			(void) SM_Destroy_Obj(new_r->FD);		RG_Free_Entry(new_r);		RG_gid_free(COL_gid);		return ERROR;	}	RG_gid_free(COL_gid);	/* Finish the job */	switch (Mode) {	case UPD_MODE:		return (COL_UPD_Obj_end(new_r));	case DEL_MODE:		return (COL_DEL_Obj(new_r));	case REF_MODE:		return (COL_REF_Obj(new_r));	default:		break;	}	errorlog("P_parse_error: Internal error: Illegal Mode: %d\n", Mode);	return (ERROR);}/* get next non-whitespace character on input stream */int P_get_next_char(InFile)     FILE *InFile;{	int tmp;	tmp = getc(InFile);	while (isspace((unsigned char) tmp)) {		tmp = getc(InFile);	}	return (tmp);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -