⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
/* $Id: extract.c,v 1.146 2003/10/07 09:18:43 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#ifdef WIN32#include <io.h>#else#include <unistd.h>#endif#include <fcntl.h>#include "index.h"#include <direntz.h>#include <charmap.h>#if _FILE_OFFSET_BITS == 64#define PRINTF_OFF_T "%Ld"#else#define PRINTF_OFF_T "%ld"#endif#define USE_SHELLSORT 0#if USE_SHELLSORTstatic void shellsort(void *ar, int r, size_t s,                      int (*cmp)(const void *a, const void *b)){    char *a = ar;    char v[100];    int h, i, j, k;    static const int incs[16] = { 1391376, 463792, 198768, 86961, 33936,                                  13776, 4592, 1968, 861, 336,                                   112, 48, 21, 7, 3, 1 };    for ( k = 0; k < 16; k++)        for (h = incs[k], i = h; i < r; i++)        {             memcpy (v, a+s*i, s);            j = i;            while (j > h && (*cmp)(a + s*(j-h), v) > 0)            {                memcpy (a + s*j, a + s*(j-h), s);                j -= h;            }            memcpy (a+s*j, v, s);        } }#endifstatic void logRecord (ZebraHandle zh){    ++zh->records_processed;    if (!(zh->records_processed % 1000))    {        logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d",               
zh->records_processed, zh->records_inserted, zh->records_updated,              zh->records_deleted);    }}static void extract_init (struct recExtractCtrl *p, RecWord *w){    w->zebra_maps = p->zebra_maps;    w->seqno = 1;    w->attrSet = VAL_BIB1;    w->attrUse = 1016;    w->reg_type = 'w';    w->extractCtrl = p;}static const char **searchRecordKey (ZebraHandle zh,                                     struct recKeys *reckeys,				     int attrSetS, int attrUseS){    static const char *ws[32];    int off = 0;    int startSeq = -1;    int i;    int seqno = 0;#if SU_SCHEME    int chS, ch;#else    short attrUse;    char attrSet;#endif    for (i = 0; i<32; i++)        ws[i] = NULL;#if SU_SCHEME    chS = zebraExplain_lookupSU (zh->reg->zei, attrSetS, attrUseS);    if (chS < 0)	return ws;#endif    while (off < reckeys->buf_used)    {        const char *src = reckeys->buf + off;	const char *wstart;        int lead;            lead = *src++;#if SU_SCHEME	if ((lead & 3)<3)	{	    memcpy (&ch, src, sizeof(ch));	    src += sizeof(ch);	}#else        if (!(lead & 1))        {            memcpy (&attrSet, src, sizeof(attrSet));            src += sizeof(attrSet);        }        if (!(lead & 2))        {            memcpy (&attrUse, src, sizeof(attrUse));            src += sizeof(attrUse);        }#endif        wstart = src;        while (*src++)            ;        if (lead & 60)            seqno += ((lead>>2) & 15)-1;        else        {            memcpy (&seqno, src, sizeof(seqno));            src += sizeof(seqno);        }	if (#if SU_SCHEME	    ch == chS#else	    attrUseS == attrUse && attrSetS == attrSet#endif	    )        {            int woff;            if (startSeq == -1)                startSeq = seqno;            woff = seqno - startSeq;            if (woff >= 0 && woff < 31)                ws[woff] = wstart;        }        off = src - reckeys->buf;    }    assert (off == reckeys->buf_used);    return ws;}struct file_read_info {    off_t file_max;	    /* maximum offset 
so far */    off_t file_offset;      /* current offset */    off_t file_moffset;     /* offset of rec/rec boundary */    int file_more;    int fd;    char *sdrbuf;    int sdrmax;};static struct file_read_info *file_read_start (int fd){    struct file_read_info *fi = (struct file_read_info *)	xmalloc (sizeof(*fi));    fi->fd = fd;    fi->file_max = 0;    fi->file_moffset = 0;    fi->sdrbuf = 0;    fi->sdrmax = 0;    return fi;}static void file_read_stop (struct file_read_info *fi){    xfree (fi);}static off_t file_seek (void *handle, off_t offset){    struct file_read_info *p = (struct file_read_info *) handle;    p->file_offset = offset;    if (p->sdrbuf)	return offset;    return lseek (p->fd, offset, SEEK_SET);}static off_t file_tell (void *handle){    struct file_read_info *p = (struct file_read_info *) handle;    return p->file_offset;}static int file_read (void *handle, char *buf, size_t count){    struct file_read_info *p = (struct file_read_info *) handle;    int fd = p->fd;    int r;    if (p->sdrbuf)    {	r = count;	if (r > p->sdrmax - p->file_offset)	    r = p->sdrmax - p->file_offset;	if (r)	    memcpy (buf, p->sdrbuf + p->file_offset, r);    }    else	r = read (fd, buf, count);    if (r > 0)    {        p->file_offset += r;        if (p->file_offset > p->file_max)            p->file_max = p->file_offset;    }    return r;}static void file_begin (void *handle){    struct file_read_info *p = (struct file_read_info *) handle;    p->file_offset = p->file_moffset;    if (!p->sdrbuf && p->file_moffset)        lseek (p->fd, p->file_moffset, SEEK_SET);    p->file_more = 0;}static void file_end (void *handle, off_t offset){    struct file_read_info *p = (struct file_read_info *) handle;    assert (p->file_more == 0);    p->file_more = 1;    p->file_moffset = offset;}static char *fileMatchStr (ZebraHandle zh,                           struct recKeys *reckeys, struct recordGroup *rGroup,                           const char *fname, const char *spec){    static char 
dstBuf[2048];      /* static here ??? */    char *dst = dstBuf;    const char *s = spec;    static const char **w;    while (1)    {        while (*s == ' ' || *s == '\t')            s++;        if (!*s)            break;        if (*s == '(')        {	    char attset_str[64], attname_str[64];	    data1_attset *attset;	    int i;            char matchFlag[32];            int attSet = 1, attUse = 1;            int first = 1;            s++;	    for (i = 0; *s && *s != ',' && *s != ')'; s++)		if (i < 63)		    attset_str[i++] = *s;	    attset_str[i] = '\0';	    if (*s == ',')	    {		s++;		for (i = 0; *s && *s != ')'; s++)		    if (i < 63)			attname_str[i++] = *s;		attname_str[i] = '\0';	    }	    	    if ((attset = data1_get_attset (zh->reg->dh, attset_str)))	    {		data1_att *att;		attSet = attset->reference;		att = data1_getattbyname(zh->reg->dh, attset, attname_str);		if (att)		    attUse = att->value;		else		    attUse = atoi (attname_str);	    }            w = searchRecordKey (zh, reckeys, attSet, attUse);            assert (w);            if (*s == ')')            {                for (i = 0; i<32; i++)                    matchFlag[i] = 1;            }            else            {                logf (LOG_WARN, "Missing ) in match criteria %s in group %s",                      spec, rGroup->groupName ? 
rGroup->groupName : "none");                return NULL;            }            s++;            for (i = 0; i<32; i++)                if (matchFlag[i] && w[i])                {                    if (first)                    {                        *dst++ = ' ';                        first = 0;                    }                    strcpy (dst, w[i]);                    dst += strlen(w[i]);                }            if (first)            {                logf (LOG_WARN, "Record didn't contain match"                      " fields in (%s,%s)", attset_str, attname_str);                return NULL;            }        }        else if (*s == '$')        {            int spec_len;            char special[64];            const char *spec_src = NULL;            const char *s1 = ++s;            while (*s1 && *s1 != ' ' && *s1 != '\t')                s1++;            spec_len = s1 - s;            if (spec_len > 63)                spec_len = 63;            memcpy (special, s, spec_len);            special[spec_len] = '\0';            s = s1;            if (!strcmp (special, "group"))                spec_src = rGroup->groupName;            else if (!strcmp (special, "database"))                spec_src = rGroup->databaseName;            else if (!strcmp (special, "filename")) {                spec_src = fname;	    }            else if (!strcmp (special, "type"))                spec_src = rGroup->recordType;            else                 spec_src = NULL;            if (spec_src)            {                strcpy (dst, spec_src);                dst += strlen (spec_src);            }        }        else if (*s == '\"' || *s == '\'')        {            int stopMarker = *s++;            char tmpString[64];            int i = 0;            while (*s && *s != stopMarker)            {                if (i < 63)                    tmpString[i++] = *s++;            }            if (*s)                s++;            tmpString[i] = '\0';            strcpy (dst, tmpString);  
          dst += strlen (tmpString);        }        else        {            logf (LOG_WARN, "Syntax error in match criteria %s in group %s",                  spec, rGroup->groupName ? rGroup->groupName : "none");            return NULL;        }        *dst++ = 1;    }    if (dst == dstBuf)    {        logf (LOG_WARN, "No match criteria for record %s in group %s",              fname, rGroup->groupName ? rGroup->groupName : "none");        return NULL;    }    *dst = '\0';    return dstBuf;}struct recordLogInfo {    const char *fname;    int recordOffset;    struct recordGroup *rGroup;};     static int recordExtract (ZebraHandle zh,                          SYSNO *sysno, const char *fname,                          struct recordGroup *rGroup, int deleteFlag,                          struct file_read_info *fi,			  RecType recType, char *subType, void *clientData,                          int force_update){    RecordAttr *recordAttr;    int r;    char *matchStr;    SYSNO sysnotmp;    Record rec;    off_t recordOffset = 0;    if (fi->fd != -1)    {	struct recExtractCtrl extractCtrl;        /* we are going to read from a file, so prepare the extraction */	int i;	zh->reg->keys.buf_used = 0;	zh->reg->keys.prevAttrUse = -1;	zh->reg->keys.prevAttrSet = -1;	zh->reg->keys.prevSeqNo = 0;	zh->reg->sortKeys.buf_used = 0;		recordOffset = fi->file_moffset;	extractCtrl.offset = fi->file_moffset;	extractCtrl.readf = file_read;	extractCtrl.seekf = file_seek;	extractCtrl.tellf = file_tell;	extractCtrl.endf = file_end;	extractCtrl.fh = fi;	extractCtrl.subType = subType;	extractCtrl.init = extract_init;	extractCtrl.tokenAdd = extract_token_add;	extractCtrl.schemaAdd = extract_schema_add;	extractCtrl.dh = zh->reg->dh;        extractCtrl.handle = zh;	for (i = 0; i<256; i++)	{	    if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))		extractCtrl.seqno[i] = 1;	    else		extractCtrl.seqno[i] = 0;	}	extractCtrl.zebra_maps = zh->reg->zebra_maps;	extractCtrl.flagShowRecords = !rGroup->flagRw; 
       if (!rGroup->flagRw)            printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset);        if (rGroup->flagRw)        {            char msg[512];            sprintf (msg, "%s:" PRINTF_OFF_T , fname, recordOffset);            yaz_log_init_prefix2 (msg);        }        r = (*recType->extract)(clientData, &extractCtrl);        yaz_log_init_prefix2 (0);	if (r == RECCTRL_EXTRACT_EOF)	    return 0;	else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)	{            /* error occured during extraction ... */            if (rGroup->flagRw &&		zh->records_processed < rGroup->fileVerboseLimit)            {                logf (LOG_WARN, "fail %s %s " PRINTF_OFF_T, rGroup->recordType,                      fname, recordOffset);            }            return 0;        }	else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)	{

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -