⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 d1_read.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
	    /*
	     * NOTE(review): this span starts inside a function whose header lies
	     * outside this view (presumably data1_read_nodex, which
	     * data1_read_node below calls). The comments in this fragment describe
	     * only what the visible statements do.
	     *
	     * Collect the tag name: keep reading until '>', '/', white space or
	     * end of input (c == 0). A character is always taken while amp is set
	     * (presumably the flag ampr() raises for '&'-escaped input - confirm).
	     * Characters that do not fit in tag[] are consumed but dropped.
	     */
	    while (amp || (c && c != '>' && c != '/' && !d1_isspace(c)))
	    {
		if (i < (sizeof(tag)-1))
		    tag[i++] = c;
		c = ampr (get_byte, fh, &amp);
	    }
	    tag[i] = '\0';
	    /* c and amp are passed by address, so the attribute reader can
	     * presumably leave them at the first unconsumed character - confirm */
	    xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &amp);
	    args[0] = '\0';
	    if (amp == 0 && c == '/')
	    {    /* <tag attrs/> or <tag/> */
		null_tag = 1;
		c = ampr (get_byte, fh, &amp);
	    }
	    /* anything but a literal '>' at this point is a syntax error */
	    if (amp || c != '>')
	    {
		yaz_log(LOG_WARN, "d1: %d: Malformed tag", line);
		return 0;
	    }
	    else
		c = ampr (get_byte, fh, &amp);

	    /* End tag? */
	    if (end_tag)
	    {
		if (*tag == '\0')
		    --level;        /* </> */
		else
		{                   /* </tag> */
		    /*
		     * Search the stack from the innermost element outwards for
		     * a root or tag node with this name and unwind to it.
		     */
		    int i = level;
		    while (i > 0)
		    {
			parent = d1_stack[--i];
			if ((parent->which == DATA1N_root &&
			     !strcmp(tag, parent->u.root.type)) ||
			    (parent->which == DATA1N_tag &&
			     !strcmp(tag, parent->u.tag.tag)))
			{
			    level = i;
			    break;
			}
		    }
		    /* i reached 0 without a match: warn and leave the
		     * enclosing loop, which falls through to "return 0" */
		    if (i != level)
		    {
			yaz_log (LOG_WARN, "%d: no begin tag for %s",
				 line, tag);
			break;
		    }
		}
		/*
		 * The tree is complete once the stack has unwound to level 1
		 * or below in XML mode, or to level 0 or below otherwise;
		 * d1_stack[0] is then returned.
		 */
                if (data1_is_xmlmode(dh))
                {
                    if (level <= 1)
                        return d1_stack[0];
                }
                else
                {
                    if (level <= 0)
                        return d1_stack[0];
                }
		continue;
	    }
	    else if (!strcmp(tag, "var"))
	    {
		char tclass[DATA1_MAX_SYMBOL], type[DATA1_MAX_SYMBOL];
		data1_vartype *tp;
		int val_offset;

		/*
		 * NOTE(review): args was set to the empty string a few
		 * statements above and nothing visible here refills it, so
		 * this sscanf does not appear able to yield two fields -
		 * confirm whether the variant branch can still succeed.
		 */
		if (sscanf(args, "%s %s %n", tclass, type, &val_offset) != 2)
		{
		    yaz_log(LOG_WARN, "Malformed variant triple at '%s'", tag);
		    continue;
		}
		/* unknown class/type pair: skip this tag without a message */
		if (!(tp =
		      data1_getvartypebyct(dh,
					   parent->root->u.root.absyn->varset,
					   tclass, type)))
		    continue;
		/*
		 * If we're the first variant in this group, create a parent
		 * variant, and insert it before the current variant.
		 */
		if (parent->which != DATA1N_variant)
		{
		    res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
		}
		else
		{
		    /*
		     * now determine if one of our ancestor triples is of
		     * same type. If so, we break here.
		     */
		    int i;
		    for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i)
			if (d1_stack[i]->u.variant.type == tp)
			{
			    level = i;
			    break;
			}
		    res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
		    res->u.variant.type = tp;
		    res->u.variant.value =
			data1_insert_string (dh, res, m, args + val_offset);
		}
	    }
	    else
            {

                /* tag .. acquire our element in the abstract syntax */
                if (level == 0)
                {
                    /* first tag seen: it names the root node */
                    parent = data1_mk_root (dh, m, tag);
                    res = d1_stack[level] = parent;

                    /* in XML mode the same tag is also added as a tag
                     * node below the root, one level further down */
                    if (data1_is_xmlmode(dh))
                    {
                        level++;
                        res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
                        res->u.tag.attributes = xattr;
                    }
                }
                else
                {
                    res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
                    res->u.tag.attributes = xattr;
                }
            }
	    /* record the new node on the stack and clear the slot above it */
	    d1_stack[level] = res;
	    d1_stack[level+1] = 0;
	    /* a <tag/> element gets no nesting level of its own; 250 is
	     * presumably tied to the size of d1_stack - confirm */
	    if (level < 250 && !null_tag)
		++level;
	}
	else /* != '<'... this is a body of text */
	{
	    int len;

	    /* text before the first tag is skipped one character at a time */
	    if (level == 0)
	    {
		c = ampr (get_byte, fh, &amp);
		continue;
	    }
	    res = data1_mk_node2 (dh, m, DATA1N_data, parent);
	    res->u.data.what = DATA1I_xmltext;
	    res->u.data.formatted_text = 0;
	    d1_stack[level] = res;

	    wrbuf_rewind(wrbuf);

	    /* gather characters up to the next literal '<' or end of input */
	    while (amp || (c && c != '<'))
	    {
		wrbuf_putc (wrbuf, c);
		c = ampr (get_byte, fh, &amp);
	    }
	    len = wrbuf_len(wrbuf);

	    /* use the node's local buffer, or nmem memory when the text is
	     * too large for it (len >= DATA1_LOCALDATA) */
	    if (len >= DATA1_LOCALDATA)
		res->u.data.data = (char*) nmem_malloc (m, len);
	    else
		res->u.data.data = res->lbuf;

	    /* the text is stored without a terminating NUL; an empty text
	     * node ends up with a null data pointer and length 0 */
            if (len)
                memcpy (res->u.data.data, wrbuf_buf(wrbuf), len);
            else
                res->u.data.data = 0;
            res->u.data.len = len;
	}
    }
    return 0;
}

/*
 * Byte source that reads from a NUL-terminated string in memory.
 * fh must point to a "const char *" cursor. The character under the cursor
 * is returned and the cursor advanced; at the terminating NUL the function
 * returns 0 and leaves the cursor where it is.
 */
int getc_mem (void *fh)
{
    const char **p = (const char **) fh;
    if (**p)
	return *(*p)++;
    return 0;
}

/*
 * Parse the NUL-terminated text at *buf into a node tree.
 * A scratch WRBUF is allocated for the call to data1_read_nodex and freed
 * before returning; getc_mem serves as the byte source, with buf as its
 * cursor. Returns whatever data1_read_nodex returns.
 */
data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m)
{
    WRBUF wrbuf = wrbuf_alloc();
    data1_node *node;

    node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf);
    wrbuf_free (wrbuf, 1);
    return node;
}

/*
 * Read a record in the native syntax.
*/data1_node *data1_read_record(data1_handle dh,			      int (*rf)(void *, char *, size_t), void *fh,                              NMEM m){    int *size;    char **buf = data1_get_read_buf (dh, &size);    const char *bp;    int rd = 0, res;        if (!*buf)	*buf = (char *)xmalloc(*size = 4096);        for (;;)    {	if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2)))	    abort();	if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0)	{	    if (!res)	    {		bp = *buf;		(*buf)[rd] = '\0';		return data1_read_node(dh, &bp, m);	    }	    else		return 0;	}	rd += res;    }}data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf){    const char *bp = buf;    return data1_read_node (dh, &bp, m);}static int conv_item (NMEM m, yaz_iconv_t t,                       WRBUF wrbuf, char *inbuf, size_t inlen){    wrbuf_rewind (wrbuf);    if (wrbuf->size < 10)        wrbuf_grow (wrbuf, 10);    for (;;)    {        char *outbuf = wrbuf->buf + wrbuf->pos;        size_t outlen = wrbuf->size - wrbuf->pos;        if (yaz_iconv (t, &inbuf, &inlen, &outbuf, &outlen) ==            (size_t)(-1) && yaz_iconv_error(t) != YAZ_ICONV_E2BIG)        {            /* bad data. 
stop and skip conversion entirely */            return -1;        }        else if (inlen == 0)        {   /* finished converting */            wrbuf->pos = wrbuf->size - outlen;            break;        }        else        {            /* buffer too small: make sure we expand buffer */            wrbuf->pos = wrbuf->size - outlen;            wrbuf_grow(wrbuf, 20);        }    }    return 0;}static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n,                           yaz_iconv_t t, WRBUF wrbuf, const char *tocode){    for (; n; n = n->next)    {        switch (n->which)        {        case DATA1N_data:        case DATA1N_comment:            if (conv_item (m, t, wrbuf, n->u.data.data, n->u.data.len) == 0)            {                n->u.data.data =                    data1_insert_string_n (dh, n, m, wrbuf->buf,                                           wrbuf->pos);                n->u.data.len = wrbuf->pos;            }            break;        case DATA1N_tag:            if (conv_item (m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag))                == 0)            {                n->u.tag.tag =                    data1_insert_string_n (dh, n, m,                                            wrbuf->buf, wrbuf->pos);            }            if (n->u.tag.attributes)            {                data1_xattr *p;                for (p = n->u.tag.attributes; p; p = p->next)                {                    if (p->value &&                        conv_item(m, t, wrbuf, p->value, strlen(p->value))                        == 0)                    {                        wrbuf_puts (wrbuf, "");                        p->value = nmem_strdup (m, wrbuf->buf);                    }                }            }            break;        case DATA1N_preprocess:            if (strcmp(n->u.preprocess.target, "xml") == 0)            {                data1_xattr *p = n->u.preprocess.attributes;                for (; p; p = p->next)                    if (strcmp 
(p->name, "encoding") == 0)                        p->value = nmem_strdup (m, tocode);            }            break;        }        data1_iconv_s (dh, m, n->child, t, wrbuf, tocode);    }}const char *data1_get_encoding (data1_handle dh, data1_node *n){    /* see if we have an xml header that specifies encoding */    if (n && n->child && n->child->which == DATA1N_preprocess &&        strcmp (n->child->u.preprocess.target, "xml") == 0)    {        data1_xattr *xp = n->child->u.preprocess.attributes;        for (; xp; xp = xp->next)            if (!strcmp (xp->name, "encoding") == 0)                return xp->value;    }    /* no encoding in header, so see if "encoding" was specified for abs */    if (n && n->which == DATA1N_root &&        n->u.root.absyn && n->u.root.absyn->encoding)        return n->u.root.absyn->encoding;    /* none of above, return a hard coded default */    return "ISO-8859-1";}int data1_iconv (data1_handle dh, NMEM m, data1_node *n,                  const char *tocode,                   const char *fromcode){    if (yaz_matchstr (tocode, fromcode))    {        WRBUF wrbuf = wrbuf_alloc();        yaz_iconv_t t = yaz_iconv_open (tocode, fromcode);        if (!t)            return -1;        data1_iconv_s (dh, m, n, t, wrbuf, tocode);        yaz_iconv_close (t);        wrbuf_free (wrbuf, 1);    }    return 0;}void data1_concat_text(data1_handle dh, NMEM m, data1_node *n){    for (; n; n = n->next)    {        if (n->which == DATA1N_data && n->next &&             n->next->which == DATA1N_data)        {            int sz = 0;            int off = 0;            char *ndata;            data1_node *np;            for (np = n; np && np->which == DATA1N_data; np=np->next)                sz += np->u.data.len;            ndata = nmem_malloc(m, sz);            for (np = n; np && np->which == DATA1N_data; np=np->next)            {                memcpy(ndata+off, np->u.data.data, np->u.data.len);                off += np->u.data.len;            }            
n->u.data.data = ndata;            n->u.data.len = sz;            n->next = np;	    if (!np && n->parent)		n->parent->last_child = n;		        }        data1_concat_text(dh, m, n->child);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -