📄 recognize.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* Revision identification string for this source file (name, revision, date, author, state). */
static char rcsid[] = "recognize.c,v 1.34 1996/01/05 20:28:58 duane Exp";
/*
 *  recognize.c - Type recognition for the Essence system.
 *
 *  DEBUG: section  63, level 1         Gatherer essence type recognition
 *
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *
 *  TERMS OF USE
 *
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *
 *  DERIVATIVE WORKS
 *
 *    Users may make derivative works from the Harvest software, subject
 *    to the following constraints:
 *
 *      - You must include the above copyright notice and these
 *        accompanying paragraphs in all forms of derivative works,
 *        and any documentation and other materials related to such
 *        distribution and use acknowledge that the software was
 *        developed at the above institutions.
 *
 *      - You must notify IRTF-RD regarding your distribution of
 *        the derivative work.
 *
 *      - You must clearly notify users that your are distributing
 *        a modified version and not the original Harvest software.
 *
 *      - Any derivative product is also subject to these copyright
 *        and use restrictions.
 *
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *
 *  HISTORY OF FREE SOFTWARE STATUS
 *
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.
*
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>
#include <sys/types.h>
#include <sys/param.h>
#include <ctype.h>
#include "url.h"
#include "util.h"
#include "essence.h"

/* Local data structures */

/*
 *  One entry of a type table: a type name paired with the regular
 *  expression that selects it.  When GNU or POSIX regex support is
 *  configured, the compiled form of the pattern is cached alongside.
 */
struct type_regex {		/* maps a type name into a regex pattern */
	char *type;		/* type name assigned to an object on a match */
	char *pattern;		/* source text of the regular expression */
#if defined(USE_GNU_REGEX) || defined(USE_POSIX_REGEX)
	regex_t compiled_pattern;	/* can reuse compiled patterns */
#endif
};

/*
 *  The three type tables and the number of entries in use in each.
 *  They are allocated by init_type_recognize() and released by
 *  finish_type_recognize().
 */
static struct type_regex *types_by_name = NULL;		/* matched against object->basename */
static struct type_regex *types_by_content = NULL;	/* matched against the magic description */
static struct type_regex *types_by_url = NULL;		/* matched against object->url->url */
static int ntypes_by_name = 0;
static int ntypes_by_content = 0;
static int ntypes_by_url = 0;

/* Local Functions */
static int init_types();

/* Local macros */

/*
 *  type_match(s, tr) - Evaluates to non-zero when the string s matches
 *  the pattern held by the type_regex entry tr.
 *
 *    GNU:   true only when re_match() returns a value greater than zero;
 *           note that s is evaluated twice (once for strlen).
 *    POSIX: true when regexec() returns 0.
 *    BSD:   re_comp() is called with the pattern text on every use;
 *           true when that succeeds (returns NULL) and re_exec()
 *           returns a value greater than zero.
 *
 *  The macro is left undefined when none of the USE_*_REGEX symbols
 *  is defined.
 */
#if defined(USE_GNU_REGEX)
#define type_match(s, tr) \
	(re_match(&(tr).compiled_pattern, (s), strlen(s), 0, 0) > 0)
#elif defined(USE_POSIX_REGEX)
#define type_match(s, tr) \
	(regexec(&(tr).compiled_pattern, (s), 0, 0, 0) == 0)
#elif defined(USE_BSD_REGEX)
#define type_match(s, tr) \
	((re_comp((tr).pattern) == NULL) ? (re_exec(s) > 0) : 0)
#endif

/*
 *  type_recognize() - Recognizes the type of the given DataObject
 *  and saves the type information in object->type.  Returns non-zero
 *  on error; and 0 on success.
*/int type_recognize(object)     DataObject *object;{	int done = 0;	Debug(63, 1, ("type_recognize(%s)\n", object->url->url));	switch (object->url->type) {	case URL_FILE:		done = !type_recognize_by_stat(object) ||		    !type_recognize_by_url(object) ||		    !type_recognize_by_name(object) ||		    ((object->flags & F_NO_ACCESS) == 0 &&		    !type_recognize_by_content(object));		break;	case URL_HTTP:	case URL_FTP:	case URL_GOPHER:	case URL_NOP:	case URL_X:	case URL_NEWS:		done = !type_recognize_by_url(object) ||		    !type_recognize_by_name(object) ||		    ((object->flags & F_NO_ACCESS) == 0 &&		    !type_recognize_by_content(object));		break;	default:		errorlog("Unsupported URL: recognize: %s.\n", object->url->url);	}	/* Try external typing */	if (!done)		done = !type_recognize_by_external(object);	/* Default type */	if (!done)		object->type = strdup("Unknown");	/* Clean up */	if (object->data != NULL) {		xfree(object->data);		object->data = NULL;		object->dsize = 0;	}	return (0);}/* *  type_recognize_by_name() - Recognizes the type of a given DataObject *  using only file naming conventions.  Returns non-zero on error; and 0 *  on success. */int type_recognize_by_name(object)     DataObject *object;{	int i;	for (i = 0; i < ntypes_by_name; i++) {		if (type_match(object->basename, types_by_name[i])) {			object->type = strdup(types_by_name[i].type);			return (0);		}	}	return (1);}/* *  type_recognize_by_url() - Recognizes the type of a given DataObject *  using only URL naming conventions.  Returns non-zero on error; and 0 *  on success. */int type_recognize_by_url(object)     DataObject *object;{	int i;	for (i = 0; i < ntypes_by_url; i++) {		if (type_match(object->url->url, types_by_url[i])) {			object->type = strdup(types_by_url[i].type);			return (0);		}	}	return (1);}/* *  type_recognize_by_content() - Recognizes the type of a given DataObject *  by recognizing identifying data within the object.  Returns non-zero  *  on error; and 0 on success. 
*/
/*
 *  The object is first retrieved; unless its data is already buffered,
 *  the leading bytes are read into object->data.  A zero-length object
 *  is typed "Empty" right away.  The buffer is then described by the
 *  file(1) routines (softmagic, then ascmagic) and that description is
 *  matched against the by-content patterns in table order.  Returns 1
 *  when retrieval or reading fails, when no description is produced, or
 *  when no pattern matches.
 */
int type_recognize_by_content(object)
     DataObject *object;
{
	extern char *softmagic(), *ascmagic();
	char *magic, *description;
	int idx, nbytes, status;

	if (object_retrieve(object))
		return (1);

	/* First read the first bytes of the file, if not already there */
	if (object->data == NULL) {
		if (object->s->st_size == 0) {
			object->dsize = 0;
			object->data = NULL;
			object->type = strdup("Empty");
			return (0);
		}

		/* Small objects are read whole; anything larger only up to MIN_XFER */
		if (object->s->st_size < (MIN_XFER * 4))
			nbytes = object->s->st_size;
		else
			nbytes = MIN_XFER;

		/* Need some extra room because file() isn't nice */
		object->data = xmalloc(nbytes + 32);
		memset(object->data, '\0', nbytes + 32);
		object->dsize = url_read(object->data, nbytes, 0, object->url);
		Debug(63, 1, ("Reading %d bytes (got %d) from %s\n", nbytes,
			object->dsize, object->url->url));
		if (object->dsize <= 0) {
			xfree(object->data);
			object->data = NULL;
			object->dsize = 0;
			return (1);
		}
	}

	/* Now use the routines from file(1) to identify contents */
	magic = softmagic(object->data);
	if (magic == NULL)
		magic = ascmagic(object->data, object->dsize);
	if (magic == NULL)
		return (1);	/* still unknown */

	/* Work on a private copy of the description returned by the magic routines */
	description = strdup(magic);

	/* Match the output of file(1) with our regular expressions */
	status = 1;		/* still unknown until a pattern matches */
	for (idx = 0; idx < ntypes_by_content; idx++) {
		if (type_match(description, types_by_content[idx])) {
			object->type = strdup(types_by_content[idx].type);
			status = 0;
			break;
		}
	}
	xfree(description);
	return (status);
}

/*
 *  type_recognize_by_stat() - Recognizes the type of a given DataObject
 *  by looking at the stat(2) data for the object.  Returns non-zero
 *  on error; and 0 on success.
*/int type_recognize_by_stat(object)     DataObject *object;{	if (S_ISDIR(object->s->st_mode)) {		object->type = strdup("Directory");		return (0);	}#ifdef S_ISLNK	if (S_ISLNK(object->s->st_mode)) {		object->type = strdup("SymbolicLink");		return (0);	}#endif#ifdef S_ISSOCK	if (S_ISSOCK(object->s->st_mode)) {		object->type = strdup("Socket");		return (0);	}#endif	if (!S_ISREG(object->s->st_mode)) {	/* bizarre file */		object->type = strdup("Unknown");		return (0);	}	return (1);}/* *  type_recognize_by_external() - Recognizes the type of a given DataObject *  using external, user-defined programs.  Returns non-zero on error; and 0 *  on success. */int type_recognize_by_external(object)     DataObject *object;{	return (1);		/* stub */}/* Initialization routines *//* *  init_type_recognize() - Initialize type recognition step. */int init_type_recognize(cfile_by_name, cfile_by_content, cfile_by_url, magic_file)     char *cfile_by_name;     char *cfile_by_content;     char *cfile_by_url;     char *magic_file;{	int apprentice(), init_types();	/* Read in magic file for file(1) routines */	Debug(63, 1, ("Using %s as magic file.\n", magic_file));	if (apprentice(magic_file, 0)) {		log_errno(magic_file);		return (1);	}	/* Read in by-name and by-content regular expressions */	Debug(63, 1, ("Using %s as by-name configuration file.\n", cfile_by_name));	Debug(63, 1, ("Using %s as by-content configuration file.\n", cfile_by_content));	Debug(63, 1, ("Using %s as by-url configuration file.\n", cfile_by_url));	types_by_name = xmalloc(MAX_TYPES * sizeof(struct type_regex));	types_by_content = xmalloc(MAX_TYPES * sizeof(struct type_regex));	types_by_url = xmalloc(MAX_TYPES * sizeof(struct type_regex));	if (init_types(cfile_by_name, types_by_name, &ntypes_by_name) ||	    init_types(cfile_by_url, types_by_url, &ntypes_by_url) ||	    init_types(cfile_by_content, types_by_content, &ntypes_by_content)) {		xfree(types_by_name);		xfree(types_by_url);		xfree(types_by_content);		return (1);	}	if 
(debug_ok(63, 1)) {		int i;		for (i = 0; i < ntypes_by_content; i++)			Log("Type: %s\tRE: %s\n", types_by_content[i].type,			    types_by_content[i].pattern);		for (i = 0; i < ntypes_by_name; i++)			Log("Type: %s\tRE: %s\n", types_by_name[i].type,			    types_by_name[i].pattern);		for (i = 0; i < ntypes_by_url; i++)			Log("Type: %s\tRE: %s\n", types_by_url[i].type,			    types_by_url[i].pattern);	}	return (0);}/* *  finish_type_recognize() - Cleans up after the type recognition step. */void finish_type_recognize(){	int i;	for (i = 0; i < ntypes_by_name; i++) {		if (types_by_name[i].type != NULL)			xfree(types_by_name[i].type);		if (types_by_name[i].pattern != NULL)			xfree(types_by_name[i].pattern);#if defined(USE_POSIX_REGEX)		regfree(&types_by_name[i].compiled_pattern);#endif	}	if (types_by_name != NULL)		xfree(types_by_name);	types_by_name = NULL;	ntypes_by_name = 0;	for (i = 0; i < ntypes_by_content; i++) {		if (types_by_content[i].type != NULL)			xfree(types_by_content[i].type);		if (types_by_content[i].pattern != NULL)			xfree(types_by_content[i].pattern);#if defined(USE_POSIX_REGEX)		regfree(&types_by_content[i].compiled_pattern);#endif	}	if (types_by_content != NULL)		xfree(types_by_content);	types_by_content = NULL;	ntypes_by_content = 0;	for (i = 0; i < ntypes_by_url; i++) {		if (types_by_url[i].type != NULL)			xfree(types_by_url[i].type);		if (types_by_url[i].pattern != NULL)			xfree(types_by_url[i].pattern);#if defined(USE_POSIX_REGEX)		regfree(&types_by_url[i].compiled_pattern);#endif	}	if (types_by_url != NULL)		xfree(types_by_url);	types_by_url = NULL;	ntypes_by_url = 0;}/* *  init_types() - Initializes the given type_regex array with the regular *  expressions from filename.  Returns 0 on success; non-zero otherwise. 
*/
/*
 *  Arguments:
 *    filename - configuration file to read
 *    t        - table of MAX_TYPES entries to fill
 *    nt       - in/out: number of entries in use; new entries are
 *               appended starting at t[*nt]
 *
 *  File format: lines starting with '#' and empty lines are skipped.
 *  Every other line is "<type><whitespace><pattern>"; the pattern is the
 *  rest of the line after the whitespace run.  Lines that lack a type
 *  name or a pattern are reported and ignored.
 *
 *  Returns 1 only when the file cannot be opened.  A pattern that does
 *  not compile is reported through fatal().  Reading stops with a
 *  warning once the table holds MAX_TYPES entries.
 */
static int init_types(filename, t, nt)
     char *filename;
     struct type_regex *t;
     int *nt;
{
	FILE *fp;
	char buf[BUFSIZ], *s;
	int ret;

	if ((fp = fopen(filename, "r")) == NULL) {
		log_errno(filename);
		return (1);
	}

	/* A table that is already full must not be written to at all */
	if (*nt >= MAX_TYPES) {
		errorlog("WARNING!: %s has too many types.\n", filename);
		fclose(fp);
		return (0);
	}

	while (fgets(buf, BUFSIZ, fp) != NULL) {
		if (buf[0] == '#' || buf[0] == '\n')
			continue;
		if ((s = strrchr(buf, '\n')) != NULL)
			*s = '\0';

		/*
		 * The type name runs up to the first whitespace character.
		 * Stop at the terminator as well, so a line without any
		 * whitespace cannot run the scan off the end of the buffer.
		 * The casts keep isspace() defined for bytes above 0x7f.
		 */
		for (s = &buf[0]; *s != '\0' && !isspace((unsigned char) *s); s++);
		if (s == &buf[0] || *s == '\0') {
			errorlog("WARNING!: %s: ignoring malformed line: %s\n",
			    filename, buf);
			continue;
		}
		*s = '\0';

		/* The pattern starts after the whitespace run */
		for (++s; isspace((unsigned char) *s); s++);
		if (*s == '\0') {
			errorlog("WARNING!: %s: ignoring type without pattern: %s\n",
			    filename, buf);
			continue;
		}

		/*
		 * Start from a zeroed entry: the table memory is not known to
		 * be initialized, and the GNU pattern buffer must not contain
		 * garbage when it is handed to re_compile_pattern().
		 */
		memset(&t[*nt], 0, sizeof(t[*nt]));
		t[*nt].type = strdup(buf);
		t[*nt].pattern = strdup(s);
#if defined(USE_GNU_REGEX)
		re_syntax_options = USE_RE_SYNTAX;
		ret = !(re_compile_pattern(t[*nt].pattern,
			(int) strlen(t[*nt].pattern),
			&t[*nt].compiled_pattern) == NULL);
#elif defined(USE_POSIX_REGEX)
		ret = regcomp(&t[*nt].compiled_pattern, t[*nt].pattern,
		    USE_RE_SYNTAX);
#else
		ret = !(re_comp(t[*nt].pattern) == NULL);	/* test it */
#endif
		if (ret != 0) {
			fatal("Couldn't compile %s", t[*nt].pattern);
		}
		if (++(*nt) >= MAX_TYPES) {
			errorlog("WARNING!: %s has too many types.\n", filename);
			break;
		}
	}
	fclose(fp);
	return (0);
}
💿 文件大小 7910 K
👤 上传用户 pc1667pc1667
📂 所属分类网络
🏷️ 相关标签

#harvest #html #页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -