⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
    qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare);    extract_get_fname_tmp (out_fname, key_file_no);    if (!(outf = fopen (out_fname, "wb")))    {        logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);        exit (1);    }    logf (LOG_LOG, "writing section %d", key_file_no);    i = ptr_i;    prevcp =  key_buf[ptr_top-i];    while (1)        if (!--i || strcmp (prevcp, key_buf[ptr_top-i]))        {            key_y_len = strlen(prevcp)+1;#if 0            logf (LOG_LOG, "key_y_len: %2d %02x %02x %s",                      key_y_len, prevcp[0], prevcp[1], 2+prevcp);#endif            qsort (key_buf + ptr_top-ptr_i, ptr_i - i,                                   sizeof(char*), key_y_compare);            cp = key_buf[ptr_top-ptr_i];            --key_y_len;            encode_key_init (&encode_info);            encode_key_write (cp, &encode_info, outf);            while (--ptr_i > i)            {                cp = key_buf[ptr_top-ptr_i];                encode_key_write (cp+key_y_len, &encode_info, outf);            }            encode_key_flush ( &encode_info, outf);            if (!i)                break;            prevcp = key_buf[ptr_top-ptr_i];        }#endif    if (fclose (outf))    {        logf (LOG_FATAL|LOG_ERRNO, "fclose %s", out_fname);        exit (1);    }    logf (LOG_LOG, "finished section %d", zh->reg->key_file_no);    zh->reg->ptr_i = 0;    zh->reg->key_buf_used = 0;}void extract_add_index_string (RecWord *p, const char *string,                               int length){    char *dst;    unsigned char attrSet;    unsigned short attrUse;    int lead = 0;    int diff = 0;    int *pseqno = &p->seqno;    ZebraHandle zh = p->extractCtrl->handle;    ZebraExplainInfo zei = zh->reg->zei;    struct recKeys *keys = &zh->reg->keys;        if (keys->buf_used+1024 > keys->buf_max)    {        char *b;        b = (char *) xmalloc (keys->buf_max += 128000);        if (keys->buf_used > 0)            memcpy (b, keys->buf, keys->buf_used);     
   xfree (keys->buf);        keys->buf = b;    }    dst = keys->buf + keys->buf_used;    attrSet = p->attrSet;    if (keys->buf_used > 0 && keys->prevAttrSet == attrSet)        lead |= 1;    else        keys->prevAttrSet = attrSet;    attrUse = p->attrUse;    if (keys->buf_used > 0 && keys->prevAttrUse == attrUse)        lead |= 2;    else        keys->prevAttrUse = attrUse;#if 1    diff = 1 + *pseqno - keys->prevSeqNo;    if (diff >= 1 && diff <= 15)        lead |= (diff << 2);    else        diff = 0;#endif    keys->prevSeqNo = *pseqno;        *dst++ = lead;#if SU_SCHEME    if ((lead & 3) < 3)    {        int ch = zebraExplain_lookupSU (zei, attrSet, attrUse);        if (ch < 0)        {            ch = zebraExplain_addSU (zei, attrSet, attrUse);            yaz_log (LOG_DEBUG, "addSU set=%d use=%d SU=%d",                     attrSet, attrUse, ch);        }	assert (ch > 0);	memcpy (dst, &ch, sizeof(ch));	dst += sizeof(ch);    }#else    if (!(lead & 1))    {        memcpy (dst, &attrSet, sizeof(attrSet));        dst += sizeof(attrSet);    }    if (!(lead & 2))    {        memcpy (dst, &attrUse, sizeof(attrUse));        dst += sizeof(attrUse);    }#endif    *dst++ = p->reg_type;    memcpy (dst, string, length);    dst += length;    *dst++ = '\0';    if (!diff)    {        memcpy (dst, pseqno, sizeof(*pseqno));        dst += sizeof(*pseqno);    }    keys->buf_used = dst - keys->buf;}static void extract_add_sort_string (RecWord *p, const char *string,				     int length){    ZebraHandle zh = p->extractCtrl->handle;    struct sortKeys *sk = &zh->reg->sortKeys;    int off = 0;    while (off < sk->buf_used)    {        int set, use, slen;        off += key_SU_decode(&set, sk->buf + off);        off += key_SU_decode(&use, sk->buf + off);        off += key_SU_decode(&slen, sk->buf + off);        off += slen;        if (p->attrSet == set && p->attrUse == use)            return;    }    assert (off == sk->buf_used);        if (sk->buf_used + IT_MAX_WORD > sk->buf_max)    {   
     char *b;                b = (char *) xmalloc (sk->buf_max += 128000);        if (sk->buf_used > 0)            memcpy (b, sk->buf, sk->buf_used);        xfree (sk->buf);        sk->buf = b;    }    off += key_SU_encode(p->attrSet, sk->buf + off);    off += key_SU_encode(p->attrUse, sk->buf + off);    off += key_SU_encode(length, sk->buf + off);    memcpy (sk->buf + off, string, length);    sk->buf_used = off + length;}void extract_add_string (RecWord *p, const char *string, int length){    assert (length > 0);    if (zebra_maps_is_sort (p->zebra_maps, p->reg_type))	extract_add_sort_string (p, string, length);    else	extract_add_index_string (p, string, length);}static void extract_add_incomplete_field (RecWord *p){    const char *b = p->string;    int remain = p->length;    const char **map = 0;    if (remain > 0)	map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);    while (map)    {	char buf[IT_MAX_WORD+1];	int i, remain;	/* Skip spaces */	while (map && *map && **map == *CHR_SPACE)	{	    remain = p->length - (b - p->string);	    if (remain > 0)		map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);	    else		map = 0;	}	if (!map)	    break;	i = 0;	while (map && *map && **map != *CHR_SPACE)	{	    const char *cp = *map;	    while (i < IT_MAX_WORD && *cp)		buf[i++] = *(cp++);	    remain = p->length - (b - p->string);	    if (remain > 0)		map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);	    else		map = 0;	}	if (!i)	    return;	extract_add_string (p, buf, i);        p->seqno++;    }}static void extract_add_complete_field (RecWord *p){    const char *b = p->string;    char buf[IT_MAX_WORD+1];    const char **map = 0;    int i = 0, remain = p->length;    if (remain > 0)	map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);    while (remain > 0 && i < IT_MAX_WORD)    {	while (map && *map && **map == *CHR_SPACE)	{	    remain = p->length - (b - p->string);	    if (remain > 0)		map = zebra_maps_input(p->zebra_maps, 
p->reg_type, &b, remain);	    else		map = 0;	}	if (!map)	    break;	if (i && i < IT_MAX_WORD)	    buf[i++] = *CHR_SPACE;	while (map && *map && **map != *CHR_SPACE)	{	    const char *cp = *map;	    if (i >= IT_MAX_WORD)		break;	    while (i < IT_MAX_WORD && *cp)		buf[i++] = *(cp++);	    remain = p->length  - (b - p->string);	    if (remain > 0)		map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,					remain);	    else		map = 0;	}    }    if (!i)	return;    extract_add_string (p, buf, i);}void extract_token_add (RecWord *p){    WRBUF wrbuf;#if 0    yaz_log (LOG_LOG, "token_add "	     "reg_type=%c attrSet=%d attrUse=%d seqno=%d s=%.*s",             p->reg_type, p->attrSet, p->attrUse, p->seqno, p->length,             p->string);#endif    if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0,			       p->string, p->length)))    {	p->string = wrbuf_buf(wrbuf);	p->length = wrbuf_len(wrbuf);    }    if (zebra_maps_is_complete (p->zebra_maps, p->reg_type))	extract_add_complete_field (p);    else	extract_add_incomplete_field(p);}void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid){    ZebraHandle zh = (ZebraHandle) (p->handle);    zebraExplain_addSchema (zh->reg->zei, oid);}void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno,                            int cmd, struct sortKeys *sk){    SortIdx sortIdx = zh->reg->sortIdx;    int off = 0;    sortIdx_sysno (sortIdx, sysno);    while (off < sk->buf_used)    {        int set, use, slen;                off += key_SU_decode(&set, sk->buf + off);        off += key_SU_decode(&use, sk->buf + off);        off += key_SU_decode(&slen, sk->buf + off);                sortIdx_type(sortIdx, use);        if (cmd == 1)            sortIdx_add(sortIdx, sk->buf + off, slen);        else            sortIdx_add(sortIdx, "", 1);        off += slen;    }}void encode_key_init (struct encode_info *i){    i->sysno = 0;    i->seqno = 0;    i->cmd = -1;    i->prevsys=0;    i->prevseq=0;    i->prevcmd=-1;    i->keylen=0;}char 
*encode_key_int (int d, char *bp){    if (d <= 63)        *bp++ = d;    else if (d <= 16383)    {        *bp++ = 64 + (d>>8);        *bp++ = d  & 255;    }    else if (d <= 4194303)    {        *bp++ = 128 + (d>>16);        *bp++ = (d>>8) & 255;        *bp++ = d & 255;    }    else    {        *bp++ = 192 + (d>>24);        *bp++ = (d>>16) & 255;        *bp++ = (d>>8) & 255;        *bp++ = d & 255;    }    return bp;}#define OLDENCODE 1#ifdef OLDENCODE/* this is the old encode_key_write  * may be deleted once we are confident that the new works * HL 15-oct-2002 */void encode_key_write (char *k, struct encode_info *i, FILE *outf){    struct it_key key;    char *bp = i->buf;    while ((*bp++ = *k++))        ;    memcpy (&key, k+1, sizeof(struct it_key));    bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);    if (i->sysno != key.sysno)    {        i->sysno = key.sysno;        i->seqno = 0;    }    else if (!i->seqno && !key.seqno && i->cmd == *k)	return;    bp = encode_key_int (key.seqno - i->seqno, bp);    i->seqno = key.seqno;    i->cmd = *k;    if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)    {        logf (LOG_FATAL|LOG_ERRNO, "fwrite");        exit (1);    }}void encode_key_flush (struct encode_info *i, FILE *outf){ /* dummy routine */}#else/* new encode_key_write * The idea is to buffer one more key, and compare them * If we are going to delete and insert the same key,  * we may as well not bother. 
Should make a difference in  * updates with small modifications (appending to a mbox) */void encode_key_write (char *k, struct encode_info *i, FILE *outf){    struct it_key key;    char *bp;     if (*k)  /* first time for new key */    {        bp = i->buf;        while ((*bp++ = *k++))            ;	i->keylen= bp - i->buf -1;    	assert(i->keylen+1+sizeof(struct it_key) < ENCODE_BUFLEN);    }    else    {	bp=i->buf + i->keylen;	*bp++=0;	k++;    }    memcpy (&key, k+1, sizeof(struct it_key));    if (0==i->prevsys) /* no previous filter, fill up */    {        i->prevsys=key.sysno;	i->prevseq=key.seqno;	i->prevcmd=*k;    }    else if ( (i->prevsys==key.sysno) &&              (i->prevseq==key.seqno) &&	      (i->prevcmd!=*k) )    { /* same numbers, diff cmd, they cancel out */        i->prevsys=0;    }    else     { /* different stuff, write previous, move buf */        bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);	if (i->sysno != i->prevsys)	{	    i->sysno = i->prevsys;	    i->seqno = 0;        }        else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)	{	    return; /* ??? Filters some sort of duplicates away */	            /* ??? 
Can this ever happen   -H 15oct02 */	}        bp = encode_key_int (i->prevseq - i->seqno, bp);        i->seqno = i->prevseq;        i->cmd = i->prevcmd;        if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)        {            logf (LOG_FATAL|LOG_ERRNO, "fwrite");            exit (1);        }        i->keylen=0; /* ok, it's written, forget it */	i->prevsys=key.sysno;	i->prevseq=key.seqno;	i->prevcmd=*k;    }}void encode_key_flush (struct encode_info *i, FILE *outf){ /* flush the last key from i */    char *bp =i->buf + i->keylen;    if (0==i->prevsys)    {        return; /* nothing to flush */    }    *bp++=0;    bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);    if (i->sysno != i->prevsys)    {        i->sysno = i->prevsys;        i->seqno = 0;    }    else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)    {        return; /* ??? Filters some sort of duplicates away */                /* ??? Can this ever happen   -H 15oct02 */    }    bp = encode_key_int (i->prevseq - i->seqno, bp);    i->seqno = i->prevseq;    i->cmd = i->prevcmd;    if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)    {        logf (LOG_FATAL|LOG_ERRNO, "fwrite");        exit (1);    }    i->keylen=0; /* ok, it's written, forget it */    i->prevsys=0; /* forget the values too */    i->prevseq=0;}#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -