⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
        recType = recType_byName (zh->reg->recTypes, recordType, subType,                                  &clientData);    } else {        if (!(rGroup->recordType)) {            logf (LOG_WARN, "No such record type defined");            return 0;        }        logf (LOG_DEBUG, "Get record type from rgroup: %s",rGroup->recordType);        recType = recType_byName (zh->reg->recTypes, rGroup->recordType, subType,                                  &clientData);        recordType = rGroup->recordType;    }        if (!recType) {        logf (LOG_WARN, "No such record type: %s", rGroup->recordType);        return 0;    }        extractCtrl.subType = subType;    extractCtrl.init = extract_init;    extractCtrl.tokenAdd = extract_token_add;    extractCtrl.schemaAdd = extract_schema_add;    extractCtrl.dh = zh->reg->dh;    extractCtrl.handle = zh;    extractCtrl.zebra_maps = zh->reg->zebra_maps;    extractCtrl.flagShowRecords = 0;    for (i = 0; i<256; i++)    {	if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))	    extractCtrl.seqno[i] = 1;	else	    extractCtrl.seqno[i] = 0;    }    r = (*recType->extract)(clientData, &extractCtrl);    if (r == RECCTRL_EXTRACT_EOF)	return 0;    else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)    {	/* error occured during extraction ... */	yaz_log (LOG_WARN, "extract error: generic");	return 0;    }    else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)    {	/* error occured during extraction ... */	yaz_log (LOG_WARN, "extract error: no such filter");	return 0;    }    if (zh->reg->keys.buf_used == 0)    {	/* the extraction process returned no information - the record	   is probably empty - unless flagShowRecords is in use */	if (test_mode)	    return 1;	logf (LOG_WARN, "No keys generated for record");	logf (LOG_WARN, " The file is probably empty");	return 1;    }    /* match criteria */    matchStr = NULL;    if (! 
*sysno && match_criteria) {        char *rinfo;        if (*match_criteria) {            matchStr = (char *)match_criteria;        } else {            if (rGroup->recordId && *rGroup->recordId) {                matchStr = fileMatchStr (zh, &zh->reg->keys, rGroup, fname,                                          rGroup->recordId);            }        }        if (matchStr) {            rinfo = dict_lookup (zh->reg->matchDict, matchStr);            if (rinfo)                memcpy (sysno, rinfo+1, sizeof(*sysno));        } else {            logf (LOG_WARN, "Bad match criteria (recordID)");            return 0;        }    }    if (! *sysno)    {        /* new record */        if (delete_flag)        {	    logf (LOG_LOG, "delete %s %s %ld", recordType,		  fname, (long) recordOffset);            logf (LOG_WARN, "cannot delete record above (seems new)");            return 1;        }	logf (LOG_LOG, "add %s %s %ld", recordType, fname,	      (long) recordOffset);        rec = rec_new (zh->reg->records);        *sysno = rec->sysno;	recordAttr = rec_init_attr (zh->reg->zei, rec);        if (matchStr)        {            dict_insert (zh->reg->matchDict, matchStr,                         sizeof(*sysno), sysno);        }	extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);        extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);        zh->records_inserted++;    }     else    {        /* record already exists */        struct recKeys delkeys;        struct sortKeys sortKeys;	if (!allow_update) {	      logf (LOG_LOG, "skipped %s %s %ld", 		    recordType, fname, (long) recordOffset);	      logRecord(zh);	      return -1;	}        rec = rec_get (zh->reg->records, *sysno);        assert (rec);		recordAttr = rec_init_attr (zh->reg->zei, rec);	if (!force_update) {	  if (recordAttr->runNumber ==	      zebraExplain_runNumberIncrement (zh->reg->zei, 0))	    {	      logf (LOG_LOG, "skipped %s %s %ld", recordType,		    fname, (long) recordOffset);	      extract_flushSortKeys 
(zh, *sysno, -1, &zh->reg->sortKeys);	      rec_rm (&rec);	      logRecord(zh);	      return 1;	    }	}        delkeys.buf_used = rec->size[recInfo_delKeys];	delkeys.buf = rec->info[recInfo_delKeys];        sortKeys.buf_used = rec->size[recInfo_sortKeys];        sortKeys.buf = rec->info[recInfo_sortKeys];	extract_flushSortKeys (zh, *sysno, 0, &sortKeys);        extract_flushRecordKeys (zh, *sysno, 0, &delkeys);        if (delete_flag)        {            /* record going to be deleted */            if (!delkeys.buf_used)            {                logf (LOG_LOG, "delete %s %s %ld", recordType,                      fname, (long) recordOffset);                logf (LOG_WARN, "cannot delete file above, storeKeys false");            }            else            {		logf (LOG_LOG, "delete %s %s %ld", recordType,		      fname, (long) recordOffset);                zh->records_deleted++;                if (matchStr)                    dict_delete (zh->reg->matchDict, matchStr);                rec_del (zh->reg->records, &rec);            }	    rec_rm (&rec);            logRecord(zh);            return 1;        }        else        {            /* record going to be updated */            if (!delkeys.buf_used)            {                logf (LOG_LOG, "update %s %s %ld", recordType,                      fname, (long) recordOffset);                logf (LOG_WARN, "cannot update file above, storeKeys false");            }            else            {		logf (LOG_LOG, "update %s %s %ld", recordType,		      fname, (long) recordOffset);                extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);                extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);                zh->records_updated++;            }        }    }    /* update file type */    xfree (rec->info[recInfo_fileType]);    rec->info[recInfo_fileType] =        rec_strdup (recordType, &rec->size[recInfo_fileType]);    /* update filename */    xfree (rec->info[recInfo_filename]);    
rec->info[recInfo_filename] =        rec_strdup (fname, &rec->size[recInfo_filename]);    /* update delete keys */    xfree (rec->info[recInfo_delKeys]);    if (zh->reg->keys.buf_used > 0 && rGroup->flagStoreKeys == 1)    {        rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;        rec->info[recInfo_delKeys] = zh->reg->keys.buf;        zh->reg->keys.buf = NULL;        zh->reg->keys.buf_max = 0;    }    else    {        rec->info[recInfo_delKeys] = NULL;        rec->size[recInfo_delKeys] = 0;    }    /* update sort keys */    xfree (rec->info[recInfo_sortKeys]);    rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;    rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;    zh->reg->sortKeys.buf = NULL;    zh->reg->sortKeys.buf_max = 0;    /* save file size of original record */    zebraExplain_recordBytesIncrement (zh->reg->zei,				       - recordAttr->recordSize);#if 0    recordAttr->recordSize = fi->file_moffset - recordOffset;    if (!recordAttr->recordSize)	recordAttr->recordSize = fi->file_max - recordOffset;#else    recordAttr->recordSize = buf_size;#endif    zebraExplain_recordBytesIncrement (zh->reg->zei,				       recordAttr->recordSize);    /* set run-number for this record */    recordAttr->runNumber =	zebraExplain_runNumberIncrement (zh->reg->zei, 0);    /* update store data */    xfree (rec->info[recInfo_storeData]);    if (rGroup->flagStoreData == 1)    {        rec->size[recInfo_storeData] = recordAttr->recordSize;        rec->info[recInfo_storeData] = (char *)	    xmalloc (recordAttr->recordSize);#if 1        memcpy (rec->info[recInfo_storeData], buf, recordAttr->recordSize);#else        if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)        {            logf (LOG_ERRNO|LOG_FATAL, "seek to %ld in %s",                  (long) recordOffset, fname);            exit (1);        }        if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)	    < recordAttr->recordSize)        {            logf (LOG_ERRNO|LOG_FATAL, "read 
%d bytes of %s",                  recordAttr->recordSize, fname);            exit (1);        }#endif    }    else    {        rec->info[recInfo_storeData] = NULL;        rec->size[recInfo_storeData] = 0;    }    /* update database name */    xfree (rec->info[recInfo_databaseName]);    rec->info[recInfo_databaseName] =        rec_strdup (rGroup->databaseName, &rec->size[recInfo_databaseName]);     /* update offset */    recordAttr->recordOffset = recordOffset;        /* commit this record */    rec_put (zh->reg->records, &rec);    logRecord(zh);    return 0;}int explain_extract (void *handle, Record rec, data1_node *n){    ZebraHandle zh = (ZebraHandle) handle;    struct recExtractCtrl extractCtrl;    int i;    if (zebraExplain_curDatabase (zh->reg->zei,				  rec->info[recInfo_databaseName]))    {	abort();        if (zebraExplain_newDatabase (zh->reg->zei,				      rec->info[recInfo_databaseName], 0))            abort ();    }    zh->reg->keys.buf_used = 0;    zh->reg->keys.prevAttrUse = -1;    zh->reg->keys.prevAttrSet = -1;    zh->reg->keys.prevSeqNo = 0;    zh->reg->sortKeys.buf_used = 0;        extractCtrl.init = extract_init;    extractCtrl.tokenAdd = extract_token_add;    extractCtrl.schemaAdd = extract_schema_add;    extractCtrl.dh = zh->reg->dh;    for (i = 0; i<256; i++)	extractCtrl.seqno[i] = 0;    extractCtrl.zebra_maps = zh->reg->zebra_maps;    extractCtrl.flagShowRecords = 0;    extractCtrl.handle = handle;    if (n)	grs_extract_tree(&extractCtrl, n);    if (rec->size[recInfo_delKeys])    {	struct recKeys delkeys;	struct sortKeys sortkeys;	delkeys.buf_used = rec->size[recInfo_delKeys];	delkeys.buf = rec->info[recInfo_delKeys];	sortkeys.buf_used = rec->size[recInfo_sortKeys];	sortkeys.buf = rec->info[recInfo_sortKeys];	extract_flushSortKeys (zh, rec->sysno, 0, &sortkeys);	extract_flushRecordKeys (zh, rec->sysno, 0, &delkeys);    }    extract_flushRecordKeys (zh, rec->sysno, 1, &zh->reg->keys);    extract_flushSortKeys (zh, rec->sysno, 1, 
&zh->reg->sortKeys);    xfree (rec->info[recInfo_delKeys]);    rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;    rec->info[recInfo_delKeys] = zh->reg->keys.buf;    zh->reg->keys.buf = NULL;    zh->reg->keys.buf_max = 0;    xfree (rec->info[recInfo_sortKeys]);    rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;    rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;    zh->reg->sortKeys.buf = NULL;    zh->reg->sortKeys.buf_max = 0;    return 0;}void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,                              int cmd, struct recKeys *reckeys){#if SU_SCHEME#else    unsigned char attrSet = (unsigned char) -1;    unsigned short attrUse = (unsigned short) -1;#endif    int seqno = 0;    int off = 0;    int ch = 0;    ZebraExplainInfo zei = zh->reg->zei;    if (!zh->reg->key_buf)    {	int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8"));	if (mem <= 0)	{	    logf(LOG_WARN, "Invalid memory setting, using default 8 MB");	    mem= 1024*1024*8;	}	/* FIXME: That "8" should be in a default settings include */	/* not hard-coded here! -H */	zh->reg->key_buf = (char**) xmalloc (mem);	zh->reg->ptr_top = mem/sizeof(char*);	zh->reg->ptr_i = 0;	zh->reg->key_buf_used = 0;	zh->reg->key_file_no = 0;    }    zebraExplain_recordCountIncrement (zei, cmd ? 
1 : -1);    while (off < reckeys->buf_used)    {        const char *src = reckeys->buf + off;        struct it_key key;        int lead;            lead = *src++;#if SU_SCHEME	if ((lead & 3) < 3)	{	    memcpy (&ch, src, sizeof(ch));	    src += sizeof(ch);	}#else        if (!(lead & 1))        {            memcpy (&attrSet, src, sizeof(attrSet));            src += sizeof(attrSet);        }        if (!(lead & 2))        {            memcpy (&attrUse, src, sizeof(attrUse));            src += sizeof(attrUse);        }#endif        if (zh->reg->key_buf_used + 1024 >             (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))            extract_flushWriteKeys (zh);        ++(zh->reg->ptr_i);        (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =	    (char*)zh->reg->key_buf + zh->reg->key_buf_used;#if SU_SCHEME#else        ch = zebraExplain_lookupSU (zei, attrSet, attrUse);        if (ch < 0)            ch = zebraExplain_addSU (zei, attrSet, attrUse);#endif        assert (ch > 0);	zh->reg->key_buf_used +=	    key_SU_encode (ch,((char*)zh->reg->key_buf) +                           zh->reg->key_buf_used);        while (*src)            ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;        src++;        ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';        ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;        if (lead & 60)            seqno += ((lead>>2) & 15)-1;        else        {            memcpy (&seqno, src, sizeof(seqno));            src += sizeof(seqno);        }        key.seqno = seqno;        key.sysno = sysno;        memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key, sizeof(key));        (zh->reg->key_buf_used) += sizeof(key);        off = src - reckeys->buf;    }    assert (off == reckeys->buf_used);}void extract_flushWriteKeys (ZebraHandle zh){    FILE *outf;    char out_fname[200];    char *prevcp, *cp;    struct encode_info encode_info;    int ptr_i = zh->reg->ptr_i;#if SORT_EXTRA    int 
i;#endif    if (!zh->reg->key_buf || ptr_i <= 0)        return;    (zh->reg->key_file_no)++;    logf (LOG_LOG, "sorting section %d", (zh->reg->key_file_no));#if !SORT_EXTRA    qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i,               sizeof(char*), key_qsort_compare);    extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no);    if (!(outf = fopen (out_fname, "wb")))    {        logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);        exit (1);    }    logf (LOG_LOG, "writing section %d", zh->reg->key_file_no);    prevcp = cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];        encode_key_init (&encode_info);    encode_key_write (cp, &encode_info, outf);        while (--ptr_i > 0)    {        cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];        if (strcmp (cp, prevcp))        {            encode_key_flush ( &encode_info, outf);            encode_key_init (&encode_info);            encode_key_write (cp, &encode_info, outf);            prevcp = cp;        }        else            encode_key_write (cp + strlen(cp), &encode_info, outf);    }    encode_key_flush ( &encode_info, outf);#else

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -