📄 recognize.c
字号:
static char rcsid[] = "recognize.c,v 1.34 1996/01/05 20:28:58 duane Exp";/* * recognize.c - Type recognition for the Essence system. * * DEBUG: section 63, level 1 Gatherer essence type recognition * * Darren Hardy, hardy@cs.colorado.edu, February 1994 * * ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. 
Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <memory.h>#include <sys/types.h>#include <sys/param.h>#include <ctype.h>#include "url.h"#include "util.h"#include "essence.h"/* Local data structures */struct type_regex { /* maps a type name into a regex pattern */ char *type; char *pattern;#if defined(USE_GNU_REGEX) || defined(USE_POSIX_REGEX) regex_t compiled_pattern; /* can reuse compiled patterns */#endif};static struct type_regex *types_by_name = NULL;static struct type_regex *types_by_content = NULL;static struct type_regex *types_by_url = NULL;static int ntypes_by_name = 0;static int ntypes_by_content = 0;static int ntypes_by_url = 0;/* Local Functions */static int init_types();/* Local macros */#if defined(USE_GNU_REGEX)#define type_match(s, tr) \ (re_match(&(tr).compiled_pattern, (s), strlen(s), 0, 0) > 0)#elif defined(USE_POSIX_REGEX)#define type_match(s, tr) \ (regexec(&(tr).compiled_pattern, (s), 0, 0, 0) == 0)#elif defined(USE_BSD_REGEX)#define type_match(s, tr) \ ((re_comp((tr).pattern) == NULL) ? (re_exec(s) > 0) : 0)#endif/* * type_recognize() - Recognizes the type of the given DataObject * and saves the type information in object->type. Returns non-zero * on error; and 0 on success. 
*/int type_recognize(object) DataObject *object;{ int done = 0; Debug(63, 1, ("type_recognize(%s)\n", object->url->url)); switch (object->url->type) { case URL_FILE: done = !type_recognize_by_stat(object) || !type_recognize_by_url(object) || !type_recognize_by_name(object) || ((object->flags & F_NO_ACCESS) == 0 && !type_recognize_by_content(object)); break; case URL_HTTP: case URL_FTP: case URL_GOPHER: case URL_NOP: case URL_X: case URL_NEWS: done = !type_recognize_by_url(object) || !type_recognize_by_name(object) || ((object->flags & F_NO_ACCESS) == 0 && !type_recognize_by_content(object)); break; default: errorlog("Unsupported URL: recognize: %s.\n", object->url->url); } /* Try external typing */ if (!done) done = !type_recognize_by_external(object); /* Default type */ if (!done) object->type = strdup("Unknown"); /* Clean up */ if (object->data != NULL) { xfree(object->data); object->data = NULL; object->dsize = 0; } return (0);}/* * type_recognize_by_name() - Recognizes the type of a given DataObject * using only file naming conventions. Returns non-zero on error; and 0 * on success. */int type_recognize_by_name(object) DataObject *object;{ int i; for (i = 0; i < ntypes_by_name; i++) { if (type_match(object->basename, types_by_name[i])) { object->type = strdup(types_by_name[i].type); return (0); } } return (1);}/* * type_recognize_by_url() - Recognizes the type of a given DataObject * using only URL naming conventions. Returns non-zero on error; and 0 * on success. */int type_recognize_by_url(object) DataObject *object;{ int i; for (i = 0; i < ntypes_by_url; i++) { if (type_match(object->url->url, types_by_url[i])) { object->type = strdup(types_by_url[i].type); return (0); } } return (1);}/* * type_recognize_by_content() - Recognizes the type of a given DataObject * by recognizing identifying data within the object. Returns non-zero * on error; and 0 on success. 
*/
int type_recognize_by_content(object)
     DataObject *object;
{
    extern char *softmagic(), *ascmagic();
    char *magic_desc;           /* description returned by the file(1) routines */
    char *desc_copy;            /* private copy of that description */
    int nbytes;                 /* number of bytes to request from the object */
    int status = 1;             /* 1 == still unknown */
    int idx;

    if (object_retrieve(object) != 0)
        return (1);

    /* First read the first bytes of the file, if not already there */
    if (object->data == NULL) {
        if (object->s->st_size == 0) {
            object->dsize = 0;
            object->data = NULL;
            object->type = strdup("Empty");
            return (0);
        }
        /* Read everything when the object is small, else only a prefix */
        if (object->s->st_size < (MIN_XFER * 4))
            nbytes = object->s->st_size;
        else
            nbytes = MIN_XFER;

        /* Need some extra room because file() isn't nice */
        object->data = xmalloc(nbytes + 32);
        memset(object->data, '\0', nbytes + 32);
        object->dsize = url_read(object->data, nbytes, 0, object->url);
        Debug(63, 1, ("Reading %d bytes (got %d) from %s\n", nbytes, object->dsize, object->url->url));
        if (object->dsize <= 0) {
            /* nothing could be read: discard the buffer and give up */
            xfree(object->data);
            object->data = NULL;
            object->dsize = 0;
            return (1);
        }
    }

    /* Now use the routines from file(1) to identify contents */
    magic_desc = softmagic(object->data);
    if (magic_desc == NULL)
        magic_desc = ascmagic(object->data, object->dsize);
    if (magic_desc == NULL)
        return (1);             /* still unknown */
    desc_copy = strdup(magic_desc);

    /* Match the output of file(1) with our regular expressions */
    for (idx = 0; idx < ntypes_by_content; idx++) {
        if (type_match(desc_copy, types_by_content[idx])) {
            object->type = strdup(types_by_content[idx].type);
            status = 0;
            break;
        }
    }
    xfree(desc_copy);
    return (status);
}

/*
 * type_recognize_by_stat() - Recognizes the type of a given DataObject
 * by looking at the stat(2) data for the object.  Returns 0 when a type
 * was assigned to object->type; non-zero when the object is a regular
 * file, which this step leaves untyped.
*/int type_recognize_by_stat(object) DataObject *object;{ if (S_ISDIR(object->s->st_mode)) { object->type = strdup("Directory"); return (0); }#ifdef S_ISLNK if (S_ISLNK(object->s->st_mode)) { object->type = strdup("SymbolicLink"); return (0); }#endif#ifdef S_ISSOCK if (S_ISSOCK(object->s->st_mode)) { object->type = strdup("Socket"); return (0); }#endif if (!S_ISREG(object->s->st_mode)) { /* bizarre file */ object->type = strdup("Unknown"); return (0); } return (1);}/* * type_recognize_by_external() - Recognizes the type of a given DataObject * using external, user-defined programs. Returns non-zero on error; and 0 * on success. */int type_recognize_by_external(object) DataObject *object;{ return (1); /* stub */}/* Initialization routines *//* * init_type_recognize() - Initialize type recognition step. */int init_type_recognize(cfile_by_name, cfile_by_content, cfile_by_url, magic_file) char *cfile_by_name; char *cfile_by_content; char *cfile_by_url; char *magic_file;{ int apprentice(), init_types(); /* Read in magic file for file(1) routines */ Debug(63, 1, ("Using %s as magic file.\n", magic_file)); if (apprentice(magic_file, 0)) { log_errno(magic_file); return (1); } /* Read in by-name and by-content regular expressions */ Debug(63, 1, ("Using %s as by-name configuration file.\n", cfile_by_name)); Debug(63, 1, ("Using %s as by-content configuration file.\n", cfile_by_content)); Debug(63, 1, ("Using %s as by-url configuration file.\n", cfile_by_url)); types_by_name = xmalloc(MAX_TYPES * sizeof(struct type_regex)); types_by_content = xmalloc(MAX_TYPES * sizeof(struct type_regex)); types_by_url = xmalloc(MAX_TYPES * sizeof(struct type_regex)); if (init_types(cfile_by_name, types_by_name, &ntypes_by_name) || init_types(cfile_by_url, types_by_url, &ntypes_by_url) || init_types(cfile_by_content, types_by_content, &ntypes_by_content)) { xfree(types_by_name); xfree(types_by_url); xfree(types_by_content); return (1); } if (debug_ok(63, 1)) { int i; for (i = 0; i < 
ntypes_by_content; i++) Log("Type: %s\tRE: %s\n", types_by_content[i].type, types_by_content[i].pattern); for (i = 0; i < ntypes_by_name; i++) Log("Type: %s\tRE: %s\n", types_by_name[i].type, types_by_name[i].pattern); for (i = 0; i < ntypes_by_url; i++) Log("Type: %s\tRE: %s\n", types_by_url[i].type, types_by_url[i].pattern); } return (0);}/* * finish_type_recognize() - Cleans up after the type recognition step. */void finish_type_recognize(){ int i; for (i = 0; i < ntypes_by_name; i++) { if (types_by_name[i].type != NULL) xfree(types_by_name[i].type); if (types_by_name[i].pattern != NULL) xfree(types_by_name[i].pattern);#if defined(USE_POSIX_REGEX) regfree(&types_by_name[i].compiled_pattern);#endif } if (types_by_name != NULL) xfree(types_by_name); types_by_name = NULL; ntypes_by_name = 0; for (i = 0; i < ntypes_by_content; i++) { if (types_by_content[i].type != NULL) xfree(types_by_content[i].type); if (types_by_content[i].pattern != NULL) xfree(types_by_content[i].pattern);#if defined(USE_POSIX_REGEX) regfree(&types_by_content[i].compiled_pattern);#endif } if (types_by_content != NULL) xfree(types_by_content); types_by_content = NULL; ntypes_by_content = 0; for (i = 0; i < ntypes_by_url; i++) { if (types_by_url[i].type != NULL) xfree(types_by_url[i].type); if (types_by_url[i].pattern != NULL) xfree(types_by_url[i].pattern);#if defined(USE_POSIX_REGEX) regfree(&types_by_url[i].compiled_pattern);#endif } if (types_by_url != NULL) xfree(types_by_url); types_by_url = NULL; ntypes_by_url = 0;}/* * init_types() - Initializes the given type_regex array with the regular * expressions from filename. Returns 0 on success; non-zero otherwise. 
*/
/*
 * File format: one rule per line, "<type> <whitespace> <regex>".  Empty
 * lines, whitespace-only lines and lines whose first non-blank character
 * is '#' are ignored.  A line that carries a type name but no pattern is
 * reported with errorlog() and skipped.
 *
 * Rules are appended to t starting at index *nt, and *nt is advanced for
 * each rule stored; at most MAX_TYPES rules are kept.  The only failure
 * reported through the return value is an unreadable file; a pattern that
 * does not compile is handed to fatal().
 */
static int init_types(filename, t, nt)
     char *filename;
     struct type_regex *t;
     int *nt;
{
    FILE *fp;
    char buf[BUFSIZ];
    char *name;                 /* start of the type name */
    char *s;                    /* scan position; ends up at the pattern */
    int ret;

    if ((fp = fopen(filename, "r")) == NULL) {
        log_errno(filename);
        return (1);
    }
    while (fgets(buf, BUFSIZ, fp) != NULL) {
        if ((s = strrchr(buf, '\n')) != NULL)
            *s = '\0';

        /*
         * <ctype.h> functions need an unsigned char value; a plain char
         * holding a byte >= 0x80 may be negative.
         */
        for (name = buf; isspace((unsigned char) *name); name++);
        if (*name == '\0' || *name == '#')
            continue;           /* blank line or comment */

        /*
         * Find the end of the type name.  The scan must also stop at the
         * terminator, otherwise a line without any whitespace would send
         * it past the end of buf.
         */
        for (s = name; *s != '\0' && !isspace((unsigned char) *s); s++);
        if (*s != '\0') {
            *s++ = '\0';
            while (isspace((unsigned char) *s))
                s++;
        }
        if (*s == '\0') {
            /* An empty pattern would match every object; refuse it. */
            errorlog("WARNING!: %s: type %s has no pattern; line ignored.\n",
                filename, name);
            continue;
        }

        /* Check for room before storing, so the table is never overrun */
        if (*nt >= MAX_TYPES) {
            errorlog("WARNING!: %s has too many types.\n", filename);
            break;
        }

        /*
         * Start from a zeroed entry: GNU re_compile_pattern() reads fields
         * of the pattern buffer (buffer, allocated, fastmap, translate) on
         * entry, and the caller's table may hold uninitialized memory.
         */
        memset(&t[*nt], '\0', sizeof(t[*nt]));
        t[*nt].type = strdup(name);
        t[*nt].pattern = strdup(s);

#if defined(USE_GNU_REGEX)
        re_syntax_options = USE_RE_SYNTAX;
        ret = !(re_compile_pattern(t[*nt].pattern,
                (int) strlen(t[*nt].pattern),
                &t[*nt].compiled_pattern) == NULL);
#elif defined(USE_POSIX_REGEX)
        ret = regcomp(&t[*nt].compiled_pattern, t[*nt].pattern,
            USE_RE_SYNTAX);
#else
        ret = !(re_comp(t[*nt].pattern) == NULL);       /* test it */
#endif
        if (ret != 0) {
            fatal("Couldn't compile %s", t[*nt].pattern);
        }
        ++(*nt);
    }
    fclose(fp);
    return (0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -