⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recgrs.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* $Id: recgrs.c,v 1.85 2003/10/07 09:18:21 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#include <sys/types.h>#ifndef WIN32#include <unistd.h>#endif#include <yaz/log.h>#include <yaz/oid.h>#include <recctrl.h>#include "grsread.h"#define GRS_MAX_WORD 512struct grs_handler {    RecTypeGrs type;    void *clientData;    int initFlag;    struct grs_handler *next;};struct grs_handlers {    struct grs_handler *handlers;};static int read_grs_type (struct grs_handlers *h,			  struct grs_read_info *p, const char *type,			  data1_node **root){    struct grs_handler *gh = h->handlers;    const char *cp = strchr (type, '.');    if (cp == NULL || cp == type)    {        cp = strlen(type) + type;        *p->type = 0;    }    else        strcpy (p->type, cp+1);    for (gh = h->handlers; gh; gh = gh->next)    {        if (!memcmp (type, gh->type->type, cp-type) && 	    gh->type->type[cp-type] == '\0')	{	    if (!gh->initFlag)	    {		gh->initFlag = 1;		gh->clientData = (*gh->type->init)();	    }	    p->clientData = gh->clientData;            *root = (gh->type->read)(p);	    gh->clientData = p->clientData;	    return 0;	}    }    return 1;}static void grs_add_handler (struct grs_handlers *h, RecTypeGrs t){    struct grs_handler 
*gh = (struct grs_handler *) xmalloc (sizeof(*gh));    gh->next = h->handlers;    h->handlers = gh;    gh->initFlag = 0;    gh->clientData = 0;    gh->type = t;}static void *grs_init(RecType recType){    struct grs_handlers *h = (struct grs_handlers *) xmalloc (sizeof(*h));    h->handlers = 0;    grs_add_handler (h, recTypeGrs_sgml);    grs_add_handler (h, recTypeGrs_regx);#if HAVE_TCL_H    grs_add_handler (h, recTypeGrs_tcl);#endif    grs_add_handler (h, recTypeGrs_marc);    grs_add_handler (h, recTypeGrs_marcxml);#if HAVE_EXPAT_H    grs_add_handler (h, recTypeGrs_xml);#endif#if HAVE_PERL    grs_add_handler (h, recTypeGrs_perl);#endif    return h;}static void grs_destroy(void *clientData){    struct grs_handlers *h = (struct grs_handlers *) clientData;    struct grs_handler *gh = h->handlers, *gh_next;    while (gh)    {	gh_next = gh->next;	if (gh->initFlag)	    (*gh->type->destroy)(gh->clientData);	xfree (gh);	gh = gh_next;    }    xfree (h);}int d1_check_xpath_predicate(data1_node *n, struct xpath_predicate *p){    int res = 1;    char *attname;    data1_xattr *attr;        if (!p) {        return 1;    } else {        if (p->which == XPATH_PREDICATE_RELATION) {            if (p->u.relation.name[0]) {                if (*p->u.relation.name != '@') {                    yaz_log(LOG_WARN,                          "  Only attributes (@) are supported in xelm xpath predicates");                    yaz_log(LOG_WARN, "predicate %s ignored", p->u.relation.name);                    return (1);                }                attname = p->u.relation.name + 1;                res = 0;                /* looking for the attribute with a specified name */                for (attr = n->u.tag.attributes; attr; attr = attr->next) {                    yaz_log(LOG_DEBUG,"  - attribute %s <-> %s", attname, attr->name );                                        if (!strcmp(attr->name, attname)) {                        if (p->u.relation.op[0]) {                            if 
(*p->u.relation.op != '=') {                                yaz_log(LOG_WARN,                                      "Only '=' relation is supported (%s)",p->u.relation.op);                                yaz_log(LOG_WARN, "predicate %s ignored", p->u.relation.name);                                res = 1; break;                            } else {                                yaz_log(LOG_DEBUG,"    - value %s <-> %s",                                      p->u.relation.value, attr->value );                                if (!strcmp(attr->value, p->u.relation.value)) {                                    res = 1; break;                                }                             }                        } else {                            /* attribute exists, no value specified */                            res = 1; break;                        }                    }                }		yaz_log(LOG_DEBUG, "return %d", res);                return res;            } else {                return 1;            }        }         else if (p->which == XPATH_PREDICATE_BOOLEAN) {            if (!strcmp(p->u.boolean.op,"and")) {                return d1_check_xpath_predicate(n, p->u.boolean.left)                     && d1_check_xpath_predicate(n, p->u.boolean.right);             }            else if (!strcmp(p->u.boolean.op,"or")) {                return (d1_check_xpath_predicate(n, p->u.boolean.left)                         || d1_check_xpath_predicate(n, p->u.boolean.right));             } else {                yaz_log(LOG_WARN, "Unknown boolean relation %s, ignored",p->u.boolean.op);                return 1;            }        }    }    return 0;}/* *ostrich*   New function, looking for xpath "element" definitions in abs, bytagpath, using a kind of ugly regxp search.The DFA was built whileparsing abs, so here we just go trough them and try to matchagainst the given tagpath. The first matching entry is returned.pop, 2002-12-13Added support for enhanced xelm. 
Now [] predicates are consideredas well, when selecting indexing rules... (why the hell it's calledtermlist???)pop, 2003-01-17*/data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n){    data1_absyn *abs = n->root->u.root.absyn;    data1_xpelement *xpe = abs->xp_elements;    data1_node *nn;#ifdef ENHANCED_XELM     struct xpath_location_step *xp;#endif    char *pexpr = xmalloc(strlen(tagpath)+2);    int ok = 0;        sprintf (pexpr, "%s\n", tagpath);    yaz_log(LOG_DEBUG,"Checking tagpath %s",tagpath);    while (xpe)     {        struct DFA_state **dfaar = xpe->dfa->states;        struct DFA_state *s=dfaar[0];        struct DFA_tran *t;        const char *p;        int i;        unsigned char c;        int start_line = 1;        c = *pexpr++; t = s->trans; i = s->tran_no;	if ((c >= t->ch[0] && c <= t->ch[1]) || (!t->ch[0])) {            p = pexpr;            do {                if ((s = dfaar[t->to])->rule_no &&                     (start_line || s->rule_nno))  {                    ok = 1;                    break;                }                for (t=s->trans, i=s->tran_no; --i >= 0; t++) {                    if ((unsigned) *p >= t->ch[0] && (unsigned) *p <= t->ch[1])                        break;                }                p++;            } while (i >= 0);	}	if (ok)	    yaz_log(LOG_DEBUG," xpath match %s",xpe->xpath_expr);	else	    yaz_log(LOG_DEBUG," xpath no match %s",xpe->xpath_expr);        pexpr--;        if (ok) {#ifdef ENHANCED_XELM             /* we have to check the perdicates up to the root node */            xp = xpe->xpath;                        /* find the first tag up in the node structure */            nn = n; while (nn && nn->which != DATA1N_tag) {                nn = nn->parent;            }                        /* go from inside out in the node structure, while going               backwards trough xpath location steps ... 
*/            for (i=xpe->xpath_len - 1; i>0; i--) {                                yaz_log(LOG_DEBUG,"Checking step %d: %s on tag %s",		     i,xp[i].part,nn->u.tag.tag);                                if (!d1_check_xpath_predicate(nn, xp[i].predicate)) {                    yaz_log(LOG_DEBUG,"  Predicates didn't match");                    ok = 0;                    break;                }                                if (nn->which == DATA1N_tag) {                    nn = nn->parent;                }            }#endif            if (ok) {                break;            }	}        xpe = xpe->next;    }         xfree(pexpr);        if (ok) {      yaz_log(LOG_DEBUG,"Got it");        return xpe->termlists;    } else {        return NULL;    }}/* use     1   start element (tag)     2   end element     3   start attr (and attr-exact)     4   end attr  1016   cdata  1015   attr data  *ostrich*  Now, if there is a matching xelm described in abs, for the  indexed element or the attribute,  then the data is handled according   to those definitions...  
modified by pop, 2002-12-13*//* add xpath index for an attribute */static void index_xpath_attr (char *tag_path, char *name, char *value,			      char *structure, struct recExtractCtrl *p,			      RecWord *wrd){    wrd->attrSet = VAL_IDXPATH;    wrd->attrUse = 1;    wrd->reg_type = '0';    wrd->string = tag_path;    wrd->length = strlen(tag_path);    (*p->tokenAdd)(wrd);        if (value) {        wrd->attrUse = 1015;        wrd->reg_type = 'w';        wrd->string = value;        wrd->length = strlen(value);        (*p->tokenAdd)(wrd);    }        wrd->attrUse = 2;    wrd->reg_type = '0';    wrd->string = tag_path;    wrd->length = strlen(tag_path);    (*p->tokenAdd)(wrd);}static void index_xpath (data1_node *n, struct recExtractCtrl *p,                         int level, RecWord *wrd, int use){    int i;    char tag_path_full[1024];    size_t flen = 0;    data1_node *nn;    int termlist_only = 1;    yaz_log(LOG_DEBUG, "index_xpath level=%d use=%d", level, use);    if ((!n->root->u.root.absyn) ||	(n->root->u.root.absyn->enable_xpath_indexing)) {      termlist_only = 0;    }    switch (n->which)    {    case DATA1N_data:        wrd->string = n->u.data.data;        wrd->length = n->u.data.len;        if (p->flagShowRecords)        {            printf("%*s XData:\"", (level + 1) * 4, "");            for (i = 0; i<wrd->length && i < 8; i++)                fputc (wrd->string[i], stdout);            printf("\"\n");        }          else  {            data1_termlist *tl;            int xpdone = 0;            flen = 0;                        /* we have to fetch the whole path to the data tag */            for (nn = n; nn; nn = nn->parent) {                if (nn->which == DATA1N_tag) {                    size_t tlen = strlen(nn->u.tag.tag);                    if (tlen + flen > (sizeof(tag_path_full)-2)) return;                    memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);                    flen += tlen;                    tag_path_full[flen++] = '/';             
   }                else if (nn->which == DATA1N_root)  break;            }            tag_path_full[flen] = 0;                        /* If we have a matching termlist... */            if (n->root->u.root.absyn && (tl = xpath_termlist_by_tagpath(tag_path_full, n))) {                for (; tl; tl = tl->next) {                    wrd->reg_type = *tl->structure;                    /* this is the ! case, so structure is for the xpath index */                    if (!tl->att) {                        wrd->attrSet = VAL_IDXPATH;                        wrd->attrUse = use;                        (*p->tokenAdd)(wrd);                        xpdone = 1;                    } else {                        /* this is just the old fashioned attribute based index */                        wrd->attrSet = (int) (tl->att->parent->reference);                        wrd->attrUse = tl->att->locals->local;                        (*p->tokenAdd)(wrd);                    }                }            }            /* xpath indexing is done, if there was no termlist given,                or no ! in the termlist, and default indexing is enabled... */            if ((!xpdone) && (!termlist_only)) {                wrd->attrSet = VAL_IDXPATH;                wrd->attrUse = use;                wrd->reg_type = 'w';                (*p->tokenAdd)(wrd);            }	}        break;    case DATA1N_tag:        flen = 0;        for (nn = n; nn; nn = nn->parent)        {            if (nn->which == DATA1N_tag)            {                size_t tlen = strlen(nn->u.tag.tag);                if (tlen + flen > (sizeof(tag_path_full)-2))                    return;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -