⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zrpn.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
/* $Id: zrpn.c,v 1.134 2003/09/05 10:51:17 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#ifdef WIN32#include <io.h>#else#include <unistd.h>#endif#include <ctype.h>#include "index.h"#include <zebra_xpath.h>#include <charmap.h>#include <rstemp.h>#include <rsnull.h>#include <rsbool.h>#include <rsbetween.h>struct rpn_char_map_info {    ZebraMaps zm;    int reg_type;};typedef struct {    int type;    int major;    int minor;    Z_AttributesPlusTerm *zapt;} AttrType;static const char **rpn_char_map_handler (void *vp, const char **from, int len){    struct rpn_char_map_info *p = (struct rpn_char_map_info *) vp;    const char **out = zebra_maps_input (p->zm, p->reg_type, from, len);#if 0    if (out && *out)    {        const char *outp = *out;        yaz_log (LOG_LOG, "---");        while (*outp)        {            yaz_log (LOG_LOG, "%02X", *outp);            outp++;        }    }#endif    return out;}static void rpn_char_map_prepare (struct zebra_register *reg, int reg_type,				  struct rpn_char_map_info *map_info){    map_info->zm = reg->zebra_maps;    map_info->reg_type = reg_type;    dict_grep_cmap (reg->dict, map_info, rpn_char_map_handler);}static int attr_find_ex (AttrType *src, oid_value *attributeSetP,			 
const char **string_value){    int num_attributes;    num_attributes = src->zapt->attributes->num_attributes;    while (src->major < num_attributes)    {        Z_AttributeElement *element;        element = src->zapt->attributes->attributes[src->major];        if (src->type == *element->attributeType)        {            switch (element->which)             {            case Z_AttributeValue_numeric:                ++(src->major);                if (element->attributeSet && attributeSetP)                {                    oident *attrset;                    attrset = oid_getentbyoid (element->attributeSet);                    *attributeSetP = attrset->value;                }                return *element->value.numeric;                break;            case Z_AttributeValue_complex:                if (src->minor >= element->value.complex->num_list)		    break;                if (element->attributeSet && attributeSetP)                {                    oident *attrset;                                        attrset = oid_getentbyoid (element->attributeSet);                    *attributeSetP = attrset->value;                }                if (element->value.complex->list[src->minor]->which ==                      Z_StringOrNumeric_numeric)		{		    ++(src->minor);		    return			*element->value.complex->list[src->minor-1]->u.numeric;		}		else if (element->value.complex->list[src->minor]->which ==  			 Z_StringOrNumeric_string)		{		    if (!string_value)			break;		    ++(src->minor);		    *string_value = 			element->value.complex->list[src->minor-1]->u.string;		    return -2;		}		else		    break;            default:                assert (0);            }        }        ++(src->major);    }    return -1;}static int attr_find (AttrType *src, oid_value *attributeSetP){    return attr_find_ex (src, attributeSetP, 0);}static void attr_init (AttrType *src, Z_AttributesPlusTerm *zapt,                       int type){    src->zapt = zapt;    src->type = type;    
src->major = 0;    src->minor = 0;}#define TERM_COUNT               struct grep_info {        #ifdef TERM_COUNT            int *term_no;        #endif            ISAMS_P *isam_p_buf;    int isam_p_size;            int isam_p_indx;    ZebraHandle zh;    int reg_type;    ZebraSet termset;};        static void term_untrans  (ZebraHandle zh, int reg_type,			   char *dst, const char *src){    int len = 0;    while (*src)    {        const char *cp = zebra_maps_output (zh->reg->zebra_maps,					    reg_type, &src);	if (!cp && len < IT_MAX_WORD-1)	    dst[len++] = *src++;	else	    while (*cp && len < IT_MAX_WORD-1)		dst[len++] = *cp++;    }    dst[len] = '\0';}static void add_isam_p (const char *name, const char *info,			struct grep_info *p){    if (p->isam_p_indx == p->isam_p_size)    {        ISAMS_P *new_isam_p_buf;#ifdef TERM_COUNT                int *new_term_no;        #endif        p->isam_p_size = 2*p->isam_p_size + 100;        new_isam_p_buf = (ISAMS_P *) xmalloc (sizeof(*new_isam_p_buf) *					     p->isam_p_size);        if (p->isam_p_buf)        {            memcpy (new_isam_p_buf, p->isam_p_buf,                    p->isam_p_indx * sizeof(*p->isam_p_buf));            xfree (p->isam_p_buf);        }        p->isam_p_buf = new_isam_p_buf;#ifdef TERM_COUNT        new_term_no = (int *) xmalloc (sizeof(*new_term_no) *				       p->isam_p_size);        if (p->term_no)        {            memcpy (new_term_no, p->isam_p_buf,                    p->isam_p_indx * sizeof(*p->term_no));            xfree (p->term_no);        }        p->term_no = new_term_no;#endif    }    assert (*info == sizeof(*p->isam_p_buf));    memcpy (p->isam_p_buf + p->isam_p_indx, info+1, sizeof(*p->isam_p_buf));#if 1    if (p->termset)    {	const char *db;	int set, use;	char term_tmp[IT_MAX_WORD];	int su_code = 0;	int len = key_SU_decode (&su_code, name);		term_untrans  (p->zh, p->reg_type, term_tmp, name+len+1);	logf (LOG_LOG, "grep: %d %c %s", su_code, name[len], term_tmp);	zebraExplain_lookup_ord 
(p->zh->reg->zei,				 su_code, &db, &set, &use);	logf (LOG_LOG, "grep:  set=%d use=%d db=%s", set, use, db);		resultSetAddTerm (p->zh, p->termset, name[len], db,			  set, use, term_tmp);    }#endif    (p->isam_p_indx)++;}static int grep_handle (char *name, const char *info, void *p){    add_isam_p (name, info, (struct grep_info *) p);    return 0;}static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src,		     const char *ct1, const char *ct2){    const char *s1, *s0 = *src;    const char **map;    /* skip white space */    while (*s0)    {        if (ct1 && strchr (ct1, *s0))            break;        if (ct2 && strchr (ct2, *s0))            break;        s1 = s0;        map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1));        if (**map != *CHR_SPACE)            break;        s0 = s1;    }    *src = s0;    return *s0;}#define REGEX_CHARS " []()|.*+?!"/* term_100: handle term, where trunc=none (no operators at all) */static int term_100 (ZebraMaps zebra_maps, int reg_type,		     const char **src, char *dst, int space_split,		     char *dst_term){    const char *s0, *s1;    const char **map;    int i = 0;    int j = 0;    const char *space_start = 0;    const char *space_end = 0;    if (!term_pre (zebra_maps, reg_type, src, NULL, NULL))        return 0;    s0 = *src;    while (*s0)    {        s1 = s0;        map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));	if (space_split)	{	    if (**map == *CHR_SPACE)		break;	}	else  /* complete subfield only. */	{	    if (**map == *CHR_SPACE)	    {   /* save space mapping for later  .. 
*/		space_start = s1;		space_end = s0;		continue;	    }	    else if (space_start)	    {   /* reload last space */		while (space_start < space_end)		{                    if (strchr (REGEX_CHARS, *space_start))			dst[i++] = '\\';		    dst_term[j++] = *space_start;		    dst[i++] = *space_start++;		}		/* and reset */		space_start = space_end = 0;	    }	}	/* add non-space char */        while (s1 < s0)        {            if (strchr(REGEX_CHARS, *s1))                dst[i++] = '\\';	    dst_term[j++] = *s1;            dst[i++] = *s1++;        }    }    dst[i] = '\0';    dst_term[j] = '\0';    *src = s0;    return i;}/* term_101: handle term, where trunc=Process # */static int term_101 (ZebraMaps zebra_maps, int reg_type,		     const char **src, char *dst, int space_split,		     char *dst_term){    const char *s0, *s1;    const char **map;    int i = 0;    int j = 0;    if (!term_pre (zebra_maps, reg_type, src, "#", "#"))        return 0;    s0 = *src;    while (*s0)    {        if (*s0 == '#')        {            dst[i++] = '.';            dst[i++] = '*';	    dst_term[j++] = *s0++;        }        else        {            s1 = s0;            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));            if (space_split && **map == *CHR_SPACE)                break;            while (s1 < s0)            {                if (strchr(REGEX_CHARS, *s1))                    dst[i++] = '\\';		dst_term[j++] = *s1;                dst[i++] = *s1++;            }        }    }    dst[i] = '\0';    dst_term[j++] = '\0';    *src = s0;    return i;}/* term_103: handle term, where trunc=re-2 (regular expressions) */static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src,		     char *dst, int *errors, int space_split,		     char *dst_term){    int i = 0;    int j = 0;    const char *s0, *s1;    const char **map;    if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "("))        return 0;    s0 = *src;    if (errors && *s0 == '+' && s0[1] && s0[2] == 
'+' && s0[3] &&        isdigit (s0[1]))    {        *errors = s0[1] - '0';        s0 += 3;        if (*errors > 3)            *errors = 3;    }    while (*s0)    {        if (strchr ("^\\()[].*+?|-", *s0))	{	    dst_term[j++] = *s0;            dst[i++] = *s0++;	}        else        {            s1 = s0;            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));            if (**map == *CHR_SPACE)                break;            while (s1 < s0)            {                if (strchr(REGEX_CHARS, *s1))                    dst[i++] = '\\';		dst_term[j++] = *s1;                dst[i++] = *s1++;            }        }    }    dst[i] = '\0';    dst_term[j] = '\0';    *src = s0;    return i;}/* term_103: handle term, where trunc=re-1 (regular expressions) */static int term_102 (ZebraMaps zebra_maps, int reg_type, const char **src,		     char *dst, int space_split, char *dst_term){    return term_103 (zebra_maps, reg_type, src, dst, NULL, space_split,		     dst_term);}/* term_104: handle term, where trunc=Process # and ! */static int term_104 (ZebraMaps zebra_maps, int reg_type,		     const char **src, char *dst, int space_split,		     char *dst_term){    const char *s0, *s1;    const char **map;    int i = 0;    int j = 0;    if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#"))        return 0;    s0 = *src;    while (*s0)    {        if (*s0 == '?')        {	    dst_term[j++] = *s0++;            if (*s0 >= '0' && *s0 <= '9')            {                int limit = 0;                while (*s0 >= '0' && *s0 <= '9')                {                    limit = limit * 10 + (*s0 - '0');                    dst_term[j++] = *s0++;                }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -