📄 mkindex.c
字号:
/* RCS identification string for this source file. */
static char rcsid[] = "$Id: mkindex.c,v 2.1 1997/03/21 19:21:59 sxw Exp $";
/*
 *  mkindex.c - Builds a (URL, Timestamp) and a (URL, MD5) hash table
 *  from a (URL, Template) hash table.
 *
 *  Usage: mkindex [indb tstmpdb md5db]
 *
 *  For example, mkindex
 *               mkindex PRODUCTION.gdbm INDEX.gdbm MD5.gdbm
 *
 *  DEBUG: none
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/
 *  ---------------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *
 *  TERMS OF USE
 *
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *
 *  DERIVATIVE WORKS
 *
 *    Users may make derivative works from the Harvest software, subject
 *    to the following constraints:
 *
 *      - You must include the above copyright notice and these
 *        accompanying paragraphs in all forms of derivative works,
 *        and any documentation and other materials related to such
 *        distribution and use acknowledge that the software was
 *        developed at the above institutions.
 *
 *      - You must notify IRTF-RD regarding your distribution of
 *        the derivative work.
 *
 *      - You must clearly notify users that your are distributing
 *        a modified version and not the original Harvest software.
 *
 *      - Any derivative product is also subject to these copyright
 *        and use restrictions.
* * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <string.h>#include <time.h>#include <gdbm.h>#include "util.h"#include "template.h"/* Local functions */static void usage();static void usage(){ fprintf(stderr, "Usage: mkindex [indb timedb md5db]\n"); exit(1);}static GDBM_FILE indbf = NULL, tsdbf = NULL, mddbf = NULL;static void die(x) int x;{ if (indbf != NULL) gdbm_close(indbf); if (tsdbf != NULL) gdbm_close(tsdbf); if (mddbf != NULL) gdbm_close(mddbf); exit(x);}int main(argc, argv) int argc; char *argv[];{ char *infile = "PRODUCTION.gdbm"; char *tsfile = "INDEX.gdbm"; char *mdfile = "MD5.gdbm"; datum k; datum nextkey; datum d; datum newd; Template *template = NULL; AVPair *avp = NULL; /* * # of internal cache buckets in gdbm. We can use a large number * here because each bucket holds a d.dptr, but our d.dptrs are * very small in this program (max 33 bytes). * But cannot be too large because GDBM is sloppy in how it does it. 
*/ int cache_size = 300; /* 3 times normal amount */ init_log3("mkindex", stderr, stderr); if (argc == 4) { infile = strdup(argv[1]); tsfile = strdup(argv[2]); mdfile = strdup(argv[3]); } else if (argc != 1) usage(); indbf = gdbm_open(infile, 0, GDBM_READER, 0644, NULL); if (indbf == NULL) { errorlog("gdbm_open: %s: %s\n", infile, gdbm_strerror(gdbm_errno)); die(1); } tsdbf = gdbm_open(tsfile, 0, GDBM_NEWDB, 0644, NULL); if (tsdbf == NULL) { errorlog("gdbm_open: %s: %s\n", tsfile, gdbm_strerror(gdbm_errno)); die(1); } if (gdbm_setopt(tsdbf, GDBM_CACHESIZE, &cache_size, sizeof(int))) Log("WARNING: Cannot reset GDBM cache size to %d.\n", cache_size); mddbf = gdbm_open(mdfile, 0, GDBM_NEWDB, 0644, NULL); if (mddbf == NULL) { errorlog("gdbm_open: %s: %s\n", mdfile, gdbm_strerror(gdbm_errno)); die(1); } if (gdbm_setopt(mddbf, GDBM_CACHESIZE, &cache_size, sizeof(int))) Log("WARNING: Cannot reset GDBM cache size to %d.\n", cache_size); /* * Extract the (URL, Template) from indbf, then extract the * timestamp from Template, then build the (Timestamp, URL) * in the tsdbf. 
*/ k = gdbm_firstkey(indbf); while (k.dptr) { d = gdbm_fetch(indbf, k); if (d.dptr == NULL) { errorlog("gdbm_fetch: %s: %s\n", infile, gdbm_strerror(gdbm_errno)); die(1); } init_parse_template_string(d.dptr, d.dsize); template = parse_template(); finish_parse_template(); if (template == NULL) { Log("WARNING: %s is not parseable.\n", k.dptr); goto next_item; } avp = extract_AVPair(template->list, T_UPDATE); if (avp == NULL) { errorlog("%s not in template %s\n", T_UPDATE, template->url); die(1); } /* Store URL->timestamp mapping in the INDEX */ newd.dptr = xmalloc(avp->vsize + 1); memcpy(newd.dptr, avp->value, avp->vsize); newd.dptr[avp->vsize] = '\0'; newd.dsize = avp->vsize + 1; /* include \0 */ if (gdbm_store(tsdbf, k, newd, GDBM_INSERT)) { errorlog("gdbm_store: %s: %s\n", tsfile, gdbm_strerror(gdbm_errno)); die(1); } xfree(newd.dptr); avp = extract_AVPair(template->list, T_MD5); if (avp != NULL) { /* Store URL->timestamp mapping in the INDEX */ newd.dptr = xmalloc(avp->vsize + 1); memcpy(newd.dptr, avp->value, avp->vsize); newd.dptr[avp->vsize] = '\0'; newd.dsize = avp->vsize + 1; /* include \0 */ if (gdbm_store(mddbf, k, newd, GDBM_INSERT)) { errorlog("gdbm_store: %s: %s\n", mdfile, gdbm_strerror(gdbm_errno)); die(1); } xfree(newd.dptr); } next_item: nextkey = gdbm_nextkey(indbf, k); free(k.dptr); free(d.dptr); free_template(template); k = nextkey; } die(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -