⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dicttest.c

📁 harvest是一个下载html网页得机器人
💻 C
字号:
/* $Id: dicttest.c,v 1.27 2002/08/02 19:26:55 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdlib.h>#include <string.h>#include <stdio.h>#include <ctype.h>#include <dict.h>#include <zebrautl.h>char *prog;static Dict dict;static int look_hits;static int grep_handler (char *name, const char *info, void *client){    look_hits++;    printf ("%s\n", name);    return 0;}static int scan_handler (char *name, const char *info, int pos, void *client){    printf ("%s\n", name);    return 0;}int main (int argc, char **argv){    Res my_resource = 0;    BFiles bfs;    const char *name = NULL;    const char *inputfile = NULL;    const char *config = NULL;    const char *delete_term = NULL;    int scan_the_thing = 0;    int do_delete = 0;    int range = -1;    int srange = 0;    int rw = 0;    int infosize = 4;    int cache = 10;    int ret;    int unique = 0;    char *grep_pattern = NULL;    char *arg;    int no_of_iterations = 0;    int no_of_new = 0, no_of_same = 0, no_of_change = 0;    int no_of_hits = 0, no_of_misses = 0, no_not_found = 0, no_of_deleted = 0;    int max_pos;        prog = argv[0];    if (argc < 2)    {        fprintf (stderr, "usage:\n "                 " %s [-d] [-D t] [-S] [-r n] [-p n] [-u] [-g pat] [-s n] "                 "[-v n] [-i f] [-w] [-c n] config file\n\n",                 prog);        fprintf (stderr, "  -d      delete instead of insert\n");        fprintf (stderr, "  -D t    delete subtree instead of insert\n");        fprintf (stderr, "  -r n    set regular match range\n");        fprintf (stderr, "  -p n    set regular match start range\n");        fprintf (stderr, "  -u      report if keys change during insert\n");        fprintf (stderr, "  -g p    try pattern n (see -r)\n");        fprintf (stderr, "  -s n    set info size to n (instead of 4)\n");        fprintf (stderr, "  -v n    set logging level\n");        fprintf (stderr, "  -i f    read file with words\n");        fprintf (stderr, "  -w      insert/delete instead of lookup\n");        fprintf (stderr, "  -c n    cache size (number of pages)\n");        fprintf (stderr, "  -S      scan the dictionary\n");        exit (1);    }    while ((ret = options ("D:Sdr:p:ug:s:v:i:wc:", argv, argc, &arg)) != -2)    {        if (ret == 0)        {            if (!config)                config = arg;            else if (!name)                name = arg;            else            {                logf (LOG_FATAL, "too many files specified\n");                exit (1);            }        }	else if (ret == 'D')	{	    delete_term = arg;	}        else if (ret == 'd')            do_delete = 1;        else if (ret == 'g')        {            grep_pattern = arg;        }        else if (ret == 'r')        {            range = atoi (arg);        }        else if (ret == 'p')        {            srange = atoi (arg);        }        else if (ret == 'u')        {            unique = 1;        }        else if (ret == 'c')        {            cache = atoi(arg);            if (cache<2)                cache = 2;        }        else if (ret == 'w')            rw = 1;        else if (ret == 'i')            inputfile = arg;	else if (ret == 'S')	    scan_the_thing = 1;        else if (ret == 's')        {            infosize = atoi(arg);        }        else if (ret == 'v')        {            yaz_log_init (yaz_log_mask_str(arg), prog, NULL);        }        else        {            logf (LOG_FATAL, "Unknown option '-%s'", arg);            exit (1);        }    }    if (!config || !name)    {        logf (LOG_FATAL, "no config and/or dictionary specified");        exit (1);    }    my_resource = res_open (config, 0);    if (!my_resource)    {        logf (LOG_FATAL, "cannot open resource `%s'", config);        exit (1);    }    bfs = bfs_create (res_get(my_resource, "register"), 0);    if (!bfs)    {        logf (LOG_FATAL, "bfs_create fail");        exit (1);    }    dict = dict_open (bfs, name, cache, rw, 0);    if (!dict)    {        logf (LOG_FATAL, "dict_open fail of `%s'", name);        exit (1);    }    if (inputfile)    {        FILE *ipf;        char ipf_buf[1024];        int line = 1;        char infobytes[120];        memset (infobytes, 0, 120);        if (!(ipf = fopen(inputfile, "r")))        {            logf (LOG_FATAL|LOG_ERRNO, "cannot open %s", inputfile);            exit (1);        }                while (fgets (ipf_buf, 1023, ipf))        {            char *ipf_ptr = ipf_buf;            sprintf (infobytes, "%d", line);            for (;*ipf_ptr && *ipf_ptr != '\n';ipf_ptr++)            {                if (isalpha(*ipf_ptr) || *ipf_ptr == '_')                {                    int i = 1;                    while (ipf_ptr[i] && (isalnum(ipf_ptr[i]) ||                                          ipf_ptr[i] == '_'))                        i++;                    if (ipf_ptr[i])                        ipf_ptr[i++] = '\0';                    if (rw)                    {			if (do_delete)                            switch (dict_delete (dict, ipf_ptr))                            {                            case 0:                                no_not_found++;                                break;                            case 1:                                no_of_deleted++;                            }                        else                            switch(dict_insert (dict, ipf_ptr,                                                infosize, infobytes))                            {                            case 0:                                no_of_new++;                                break;                            case 1:                                no_of_change++;                                if (unique)                                    logf (LOG_LOG, "%s change\n", ipf_ptr);                                break;                            case 2:                                if (unique)                                    logf (LOG_LOG, "%s duplicate\n", ipf_ptr);                                no_of_same++;                                break;                            }                    }                    else if(range < 0)                    {                        char *cp;                        cp = dict_lookup (dict, ipf_ptr);                        if (cp && *cp)                            no_of_hits++;                        else                            no_of_misses++;                    }                    else                    {                        look_hits = 0;                        dict_lookup_grep (dict, ipf_ptr, range, NULL,                                          &max_pos, srange, grep_handler);                        if (look_hits)                            no_of_hits++;                        else                            no_of_misses++;                    }                    ++no_of_iterations;		    if ((no_of_iterations % 10000) == 0)		    {			printf ("."); fflush(stdout);		    }                    ipf_ptr += (i-1);                }            }            ++line;        }        fclose (ipf);    }    if (rw && delete_term)    {	logf (LOG_LOG, "dict_delete_subtree %s", delete_term);	dict_delete_subtree (dict, delete_term, 0, 0);    }    if (grep_pattern)    {        if (range < 0)            range = 0;        logf (LOG_LOG, "Grepping '%s'", grep_pattern);        dict_lookup_grep (dict, grep_pattern, range, NULL, &max_pos,                          srange, grep_handler);    }    if (rw)    {        logf (LOG_LOG, "Iterations.... %d", no_of_iterations);                    if (do_delete)        {            logf (LOG_LOG, "No of deleted. %d", no_of_deleted);            logf (LOG_LOG, "No not found.. %d", no_not_found);        }        else        {            logf (LOG_LOG, "No of new..... %d", no_of_new);            logf (LOG_LOG, "No of change.. %d", no_of_change);        }    }    else    {        logf (LOG_LOG, "Lookups....... %d", no_of_iterations);        logf (LOG_LOG, "No of hits.... %d", no_of_hits);        logf (LOG_LOG, "No of misses.. %d", no_of_misses);    }    if (scan_the_thing)    {	char term_dict[1024];        	int before = 1000000;	int after = 1000000;	logf (LOG_LOG, "dict_scan");	term_dict[0] = 1;	term_dict[1] = 0;	dict_scan (dict, term_dict, &before, &after, 0, scan_handler);    }    dict_close (dict);    bfs_destroy (bfs);    res_close (my_resource);    return 0;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -