⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 d1_read.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* $Id: d1_read.c,v 1.5 2003/09/08 10:26:51 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*//* * This module reads "loose" SGML and converts it to data1 tree  */#include <assert.h>#include <stdio.h>#include <stdlib.h>#include <yaz/xmalloc.h>#include <yaz/yaz-util.h>#include <yaz/log.h>#include <data1.h>data1_node *data1_get_root_tag (data1_handle dh, data1_node *n){    if (!n)        return 0;    if (data1_is_xmlmode(dh))    {        n = n->child;        while (n && n->which != DATA1N_tag)            n = n->next;    }    return n;}        /* * get the tag which is the immediate parent of this node (this may mean * traversing intermediate things like variants and stuff. 
*/data1_node *get_parent_tag (data1_handle dh, data1_node *n){    if (data1_is_xmlmode(dh))    {        for (; n && n->which != DATA1N_root; n = n->parent)            if (n->which == DATA1N_tag && n->parent &&                n->parent->which != DATA1N_root)                return n;    }    else    {        for (; n && n->which != DATA1N_root; n = n->parent)            if (n->which == DATA1N_tag)                return n;    }    return 0;}data1_node *data1_mk_node (data1_handle dh, NMEM m){    return data1_mk_node2 (dh, m, DATA1N_root, 0);}data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type){    return data1_mk_node2 (dh, m, type, 0);}static void data1_init_node (data1_handle dh, data1_node *r, int type){    r->which = type;    switch(type)    {    case DATA1N_tag:	r->u.tag.tag = 0;	r->u.tag.element = 0;	r->u.tag.no_data_requested = 0;	r->u.tag.node_selected = 0;	r->u.tag.make_variantlist = 0;	r->u.tag.get_bytes = -1;	r->u.tag.attributes = 0;	break;    case DATA1N_root:	r->u.root.type = 0;	r->u.root.absyn = 0;	break;    case DATA1N_data:	r->u.data.data = 0;	r->u.data.len = 0;	r->u.data.what = 0;	r->u.data.formatted_text = 0;        break;    case DATA1N_comment:	r->u.data.data = 0;	r->u.data.len = 0;	r->u.data.what = 0;	r->u.data.formatted_text = 1;        break;    case DATA1N_variant:        r->u.variant.type = 0;        r->u.variant.value = 0;	break;    case DATA1N_preprocess:        r->u.preprocess.target = 0;        r->u.preprocess.attributes = 0;        break;    default:	logf (LOG_WARN, "data_mk_node_type. 
bad type = %d\n", type);    }}data1_node *data1_append_node (data1_handle dh, NMEM m, int type,                               data1_node *parent){    data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));    r->next = r->child = r->last_child = 0;    r->destroy = 0;        if (!parent)        r->root = r;    else    {        r->root = parent->root;        r->parent = parent;        if (!parent->child)            parent->child = parent->last_child = r;        else            parent->last_child->next = r;        parent->last_child = r;    }    data1_init_node(dh, r, type);    return r;}data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type,                            data1_node *parent){    return data1_append_node (dh, m, type, parent);}data1_node *data1_insert_node (data1_handle dh, NMEM m, int type,                               data1_node *parent){    data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));    r->next = r->child = r->last_child = 0;    r->destroy = 0;        if (!parent)        r->root = r;    else    {        r->root = parent->root;        r->parent = parent;        if (!parent->child)            parent->last_child = r;        else            r->next = parent->child;        parent->child = r;    }    data1_init_node(dh, r, type);    return r;}void data1_free_tree (data1_handle dh, data1_node *t){    data1_node *p = t->child, *pn;    while (p)    {	pn = p->next;	data1_free_tree (dh, p);	p = pn;    }    if (t->destroy)	(*t->destroy)(t);}data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name){    data1_absyn *absyn = data1_get_absyn (dh, name);    data1_node *res;    if (!absyn)    {        yaz_log(LOG_WARN, "Unable to acquire abstract syntax " "for '%s'",                name);         /* It's now OK for a record not to have an absyn */    }    res = data1_mk_node2 (dh, nmem, DATA1N_root, 0);    res->u.root.type = data1_insert_string (dh, res, nmem, name);    res->u.root.absyn = absyn;    return res;}void 
data1_set_root(data1_handle dh, data1_node *res,                    NMEM nmem, const char *name){    data1_absyn *absyn = data1_get_absyn (dh, name);    res->u.root.type = data1_insert_string (dh, res, nmem, name);    res->u.root.absyn = absyn;}data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem,                                 const char *target,                                 const char **attr, data1_node *at){    return data1_mk_preprocess_n (dh, nmem, target, strlen(target),                                  attr, at);}data1_node *data1_mk_preprocess_n (data1_handle dh, NMEM nmem,                                   const char *target, size_t len,                                   const char **attr, data1_node *at){    data1_xattr **p;    data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at);    res->u.preprocess.target = data1_insert_string_n (dh, res, nmem,                                                      target, len);        p = &res->u.preprocess.attributes;    while (attr && *attr)    {        *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p));        (*p)->name = nmem_strdup (nmem, *attr++);        (*p)->value = nmem_strdup (nmem, *attr++);        (*p)->what = DATA1I_text;        p = &(*p)->next;    }    *p = 0;    return res;}data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem,                             const char *tag, size_t len, const char **attr,                            data1_node *at){    data1_node *partag = get_parent_tag(dh, at);    data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_tag, at);    data1_xattr **p;    data1_element *e = 0;        res->u.tag.tag = data1_insert_string_n (dh, res, nmem, tag, len);        if (!partag)  /* top tag? 
*/        e  = data1_getelementbytagname (dh, at->root->u.root.absyn,                                        0 /* index as local */,                                        res->u.tag.tag);    else    {        /* only set element for known tags */        e = partag->u.tag.element;        if (e)            e = data1_getelementbytagname (dh, at->root->u.root.absyn,                                           e, res->u.tag.tag);    }    res->u.tag.element = e;    p = &res->u.tag.attributes;    while (attr && *attr)    {        *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p));        (*p)->name = nmem_strdup (nmem, *attr++);        (*p)->value = nmem_strdup (nmem, *attr++);        (*p)->what = DATA1I_text;        p = &(*p)->next;    }    *p = 0;    return res;}void data1_tag_add_attr (data1_handle dh, NMEM nmem,                         data1_node *res, const char **attr){    data1_xattr **p;    if (res->which != DATA1N_tag)        return;    p = &res->u.tag.attributes;    while (*p)        p = &(*p)->next;    while (attr && *attr)    {        *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p));        (*p)->name = nmem_strdup (nmem, *attr++);        (*p)->value = nmem_strdup (nmem, *attr++);        (*p)->what = DATA1I_text;        p = &(*p)->next;    }    *p = 0;}data1_node *data1_mk_tag (data1_handle dh, NMEM nmem,                          const char *tag, const char **attr, data1_node *at) {    return data1_mk_tag_n (dh, nmem, tag, strlen(tag), attr, at);}data1_node *data1_search_tag (data1_handle dh, data1_node *n,                              const char *tag){    if (*tag == '/')    {        n = data1_get_root_tag (dh, n);        if (n)            n = n->child;        tag++;    }    for (; n; n = n->next)	if (n->which == DATA1N_tag && n->u.tag.tag &&	    !yaz_matchstr (n->u.tag.tag, tag))	{	    return n;	}    return 0;}data1_node *data1_mk_tag_uni (data1_handle dh, NMEM nmem,                               const char *tag, data1_node *at){    data1_node *node = 
data1_search_tag (dh, at->child, tag);    if (!node)	node = data1_mk_tag (dh, nmem, tag, 0 /* attr */, at);    else        node->child = node->last_child = 0;    return node;}data1_node *data1_mk_text_n (data1_handle dh, NMEM mem,                             const char *buf, size_t len, data1_node *parent){    data1_node *res = data1_mk_node2 (dh, mem, DATA1N_data, parent);    res->u.data.what = DATA1I_text;    res->u.data.len = len;        res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len);    return res;}data1_node *data1_mk_text_nf (data1_handle dh, NMEM mem,                              const char *buf, size_t len, data1_node *parent){    data1_node *res = data1_mk_text_n (dh, mem, buf, len, parent);    res->u.data.formatted_text = 1;    return res;}data1_node *data1_mk_text (data1_handle dh, NMEM mem,                           const char *buf, data1_node *parent){    return data1_mk_text_n (dh, mem, buf, strlen(buf), parent);}data1_node *data1_mk_comment_n (data1_handle dh, NMEM mem,                                const char *buf, size_t len,                                data1_node *parent){    data1_node *res = data1_mk_node2 (dh, mem, DATA1N_comment, parent);    res->u.data.what = DATA1I_text;    res->u.data.len = len;        res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len);    return res;}data1_node *data1_mk_comment (data1_handle dh, NMEM mem,                              const char *buf, data1_node *parent)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -