⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recgrs.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
                memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);                flen += tlen;                tag_path_full[flen++] = '/';            }            else if (nn->which == DATA1N_root)                break;        }        wrd->reg_type = '0';        wrd->string = tag_path_full;        wrd->length = flen;        wrd->attrSet = VAL_IDXPATH;        wrd->attrUse = use;        if (p->flagShowRecords)        {            printf("%*s tag=", (level + 1) * 4, "");            for (i = 0; i<wrd->length && i < 40; i++)                fputc (wrd->string[i], stdout);            if (i == 40)                printf (" ..");            printf("\n");        }        else        {            data1_xattr *xp;            data1_termlist *tl;	    int do_xpindex;            	    tag_path_full[flen] = 0;                        /* Add tag start/end xpath index, only when there is a ! in the apropriate xelm               directive, or default xpath indexing is enabled */	    if (!(do_xpindex = 1 - termlist_only)) {                if ((tl = xpath_termlist_by_tagpath(tag_path_full, n))) {                    for (; tl; tl = tl->next) { if (!tl->att) {do_xpindex = 1;} }                }	    }	    if (do_xpindex) {                (*p->tokenAdd)(wrd);   /* index element pag (AKA tag path) */	    }                        if (use == 1) /* only for the starting tag... 
*/            {#define MAX_ATTR_COUNT 50                data1_termlist *tll[MAX_ATTR_COUNT];                                int i = 0;                                /* get termlists for attributes, and find out, if we have to do xpath indexing */                for (xp = n->u.tag.attributes; xp; xp = xp->next) {                    i++;                }                                i = 0;                for (xp = n->u.tag.attributes; xp; xp = xp->next) {                    char comb[512];                    int do_xpindex = 1 - termlist_only;                    data1_termlist *tl;                    char attr_tag_path_full[1024];                     int int_len = flen;                                        /* this could be cached as well */                    sprintf (attr_tag_path_full, "@%s/%.*s",                             xp->name, int_len, tag_path_full);                                        tll[i] = xpath_termlist_by_tagpath(attr_tag_path_full,n);                                        /* if there is a ! 
in the xelm termlist, or default indexing is on,                        proceed with xpath idx */                    if ((tl = tll[i]))                    {                        for (; tl; tl = tl->next)                        {                             if (!tl->att)                                do_xpindex = 1;                        }                    }                                        if (do_xpindex) {                                                /* attribute  (no value) */                        wrd->reg_type = '0';                        wrd->attrUse = 3;                        wrd->string = xp->name;                        wrd->length = strlen(xp->name);                                                wrd->seqno--;                        (*p->tokenAdd)(wrd);                                                if (xp->value &&                            strlen(xp->name) + strlen(xp->value) < sizeof(comb)-2) {                                                        /* attribute value exact */                            strcpy (comb, xp->name);                            strcat (comb, "=");                            strcat (comb, xp->value);                                                        wrd->attrUse = 3;                            wrd->reg_type = '0';                            wrd->string = comb;                            wrd->length = strlen(comb);                            wrd->seqno--;                                                        (*p->tokenAdd)(wrd);                        }                    }                                    i++;                }                                i = 0;                for (xp = n->u.tag.attributes; xp; xp = xp->next) {                    data1_termlist *tl;                    char attr_tag_path_full[1024];                    int int_len = flen;                    int xpdone = 0;                                        sprintf (attr_tag_path_full, "@%s/%.*s",                             
xp->name, int_len, tag_path_full);                                        if ((tl = tll[i]))                    {                        /* If there is a termlist given (=xelm directive) */                        for (; tl; tl = tl->next)                        {                            if (!tl->att) {                                /* add xpath index for the attribute */                                index_xpath_attr (attr_tag_path_full, xp->name,                                                  xp->value, tl->structure,                                                  p, wrd);                                xpdone = 1;                            } else {                                /* add attribute based index for the attribute */                                if (xp->value) {                                    wrd->attrSet = (int)                                         (tl->att->parent->reference);                                    wrd->attrUse = tl->att->locals->local;                                    wrd->reg_type = *tl->structure;                                    wrd->string = xp->value;                                    wrd->length = strlen(xp->value);                                    (*p->tokenAdd)(wrd);                                }                            }                        }                    }                    /* if there was no termlist for the given path,                        or the termlist didn't have a ! 
element, index                        the attribute as "w" */                    if ((!xpdone) && (!termlist_only))                    {                        index_xpath_attr (attr_tag_path_full, xp->name,                                          xp->value,  "w", p, wrd);                    }                    i++;                }            }	}    }}static void index_termlist (data1_node *par, data1_node *n,                            struct recExtractCtrl *p, int level, RecWord *wrd){    data1_termlist *tlist = 0;    data1_datatype dtype = DATA1K_string;    /*     * cycle up towards the root until we find a tag with an att..     * this has the effect of indexing locally defined tags with     * the attribute of their ancestor in the record.     */        while (!par->u.tag.element)        if (!par->parent || !(par=get_parent_tag(p->dh, par->parent)))            break;    if (!par || !(tlist = par->u.tag.element->termlists))        return;    if (par->u.tag.element->tag)        dtype = par->u.tag.element->tag->kind;        for (; tlist; tlist = tlist->next)    {	char xattr[512];	/* consider source */	wrd->string = 0;	if (!strcmp (tlist->source, "data") && n->which == DATA1N_data)	{	    wrd->string = n->u.data.data;	    wrd->length = n->u.data.len;	}	else if (!strcmp (tlist->source, "tag") && n->which == DATA1N_tag)        {	    wrd->string = n->u.tag.tag;	    wrd->length = strlen(n->u.tag.tag);	}	else if (sscanf (tlist->source, "attr(%511[^)])", xattr) == 1 &&	    n->which == DATA1N_tag)	{	    data1_xattr *p = n->u.tag.attributes;	    while (p && strcmp (p->name, xattr))		p = p->next;	    if (p)	    {		wrd->string = p->value;		wrd->length = strlen(p->value);	    }	}	if (wrd->string)	{	    if (p->flagShowRecords)	    {		int i;		printf("%*sIdx: [%s]", (level + 1) * 4, "",		       tlist->structure);		printf("%s:%s [%d] %s",		       tlist->att->parent->name,		       tlist->att->name, tlist->att->value,		       tlist->source);		printf (" XData:\"");		for (i = 0; 
i<wrd->length && i < 40; i++)		    fputc (wrd->string[i], stdout);		fputc ('"', stdout);		if (wrd->length > 40)		    printf (" ...");		fputc ('\n', stdout);	    }	    else	    {		wrd->reg_type = *tlist->structure;		wrd->attrSet = (int) (tlist->att->parent->reference);		wrd->attrUse = tlist->att->locals->local;		(*p->tokenAdd)(wrd);	    }	}    }}static int dumpkeys(data1_node *n, struct recExtractCtrl *p, int level,                    RecWord *wrd){    for (; n; n = n->next)    {	if (p->flagShowRecords) /* display element description to user */	{	    if (n->which == DATA1N_root)	    {		printf("%*s", level * 4, "");                printf("Record type: '%s'\n", n->u.root.type);	    }	    else if (n->which == DATA1N_tag)	    {		data1_element *e;		printf("%*s", level * 4, "");		if (!(e = n->u.tag.element))		    printf("Local tag: '%s'\n", n->u.tag.tag);		else		{		    printf("Elm: '%s' ", e->name);		    if (e->tag)		    {			data1_tag *t = e->tag;			printf("TagNam: '%s' ", t->names->name);			printf("(");			if (t->tagset)			    printf("%s[%d],", t->tagset->name, t->tagset->type);			else			    printf("?,");			if (t->which == DATA1T_numeric)			    printf("%d)", t->value.numeric);			else			    printf("'%s')", t->value.string);		    }		    printf("\n");		}	    }	}	if (n->which == DATA1N_tag)	{            index_termlist (n, n, p, level, wrd);            /* index start tag */	    if (n->root->u.root.absyn)      	        index_xpath (n, p, level, wrd, 1); 	}	if (n->child)	    if (dumpkeys(n->child, p, level + 1, wrd) < 0)		return -1;	if (n->which == DATA1N_data)	{	    data1_node *par = get_parent_tag(p->dh, n);	    if (p->flagShowRecords)	    {		printf("%*s", level * 4, "");		printf("Data: ");		if (n->u.data.len > 256)		    printf("'%.170s ... 
%.70s'\n", n->u.data.data,			   n->u.data.data + n->u.data.len-70);		else if (n->u.data.len > 0)		    printf("'%.*s'\n", n->u.data.len, n->u.data.data);		else		    printf("NULL\n");	    }	    if (par)		index_termlist (par, n, p, level, wrd);	    index_xpath (n, p, level, wrd, 1016); 	}	if (n->which == DATA1N_tag)	{            /* index end tag */	    index_xpath (n, p, level, wrd, 2);	}	if (p->flagShowRecords && n->which == DATA1N_root)	{	    printf("%*s-------------\n\n", level * 4, "");	}    }    return 0;}int grs_extract_tree(struct recExtractCtrl *p, data1_node *n){    oident oe;    int oidtmp[OID_SIZE];    RecWord wrd;    oe.proto = PROTO_Z3950;    oe.oclass = CLASS_SCHEMA;    if (n->u.root.absyn)    {        oe.value = n->u.root.absyn->reference;                if ((oid_ent_to_oid (&oe, oidtmp)))            (*p->schemaAdd)(p, oidtmp);    }    (*p->init)(p, &wrd);    return dumpkeys(n, p, 0, &wrd);}static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p,			   NMEM mem){    data1_node *n;    struct grs_read_info gri;    oident oe;    int oidtmp[OID_SIZE];    RecWord wrd;    gri.readf = p->readf;    gri.seekf = p->seekf;    gri.tellf = p->tellf;    gri.endf = p->endf;    gri.fh = p->fh;    gri.offset = p->offset;    gri.mem = mem;    gri.dh = p->dh;    if (read_grs_type (h, &gri, p->subType, &n))	return RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER;    if (!n)        return RECCTRL_EXTRACT_EOF;    oe.proto = PROTO_Z3950;    oe.oclass = CLASS_SCHEMA;#if 0    if (!n->u.root.absyn)        return RECCTRL_EXTRACT_ERROR;#endif    if (n->u.root.absyn)    {        oe.value = n->u.root.absyn->reference;        if ((oid_ent_to_oid (&oe, oidtmp)))            (*p->schemaAdd)(p, oidtmp);    }    data1_concat_text(p->dh, mem, n);    /* ensure our data1 tree is UTF-8 */    data1_iconv (p->dh, mem, n, "UTF-8", data1_get_encoding(p->dh, n));#if 0    data1_pr_tree (p->dh, n, stdout);#endif    (*p->init)(p, &wrd);    if (dumpkeys(n, p, 0, &wrd) < 0)    {	
data1_free_tree(p->dh, n);	return RECCTRL_EXTRACT_ERROR_GENERIC;    }    data1_free_tree(p->dh, n);    return RECCTRL_EXTRACT_OK;}static int grs_extract(void *clientData, struct recExtractCtrl *p){    int ret;    NMEM mem = nmem_create ();    struct grs_handlers *h = (struct grs_handlers *) clientData;    ret = grs_extract_sub(h, p, mem);    nmem_destroy(mem);    return ret;}/* * Return: -1: Nothing done. 0: Ok. >0: Bib-1 diagnostic. */static int process_comp(data1_handle dh, data1_node *n, Z_RecordComposition *c){    data1_esetname *eset;    Z_Espec1 *espec = 0;    Z_ElementSpec *p;    switch (c->which)    {    case Z_RecordComp_simple:	if (c->u.simple->which != Z_ElementSetNames_generic)	    return 26; /* only generic form supported. Fix this later */	if (!(eset = data1_getesetbyname(dh, n->u.root.absyn,					 c->u.simple->u.generic)))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -