⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
            /* error occured during extraction ... */            if (rGroup->flagRw &&		zh->records_processed < rGroup->fileVerboseLimit)            {                logf (LOG_WARN, "no filter for %s %s "                       PRINTF_OFF_T, rGroup->recordType,                      fname, recordOffset);            }            return 0;        }        if (zh->reg->keys.buf_used == 0)        {            /* the extraction process returned no information - the record               is probably empty - unless flagShowRecords is in use */            if (!rGroup->flagRw)                return 1;	    	    logf (LOG_WARN, "empty %s %s " PRINTF_OFF_T, rGroup->recordType,		  fname, recordOffset);            return 1;        }    }    /* perform match if sysno not known and if match criteria is specified */           matchStr = NULL;    if (!sysno)     {        sysnotmp = 0;        sysno = &sysnotmp;        if (rGroup->recordId && *rGroup->recordId)        {            char *rinfo;                    matchStr = fileMatchStr (zh, &zh->reg->keys, rGroup, fname,                                      rGroup->recordId);            if (matchStr)            {                rinfo = dict_lookup (zh->reg->matchDict, matchStr);                if (rinfo)                    memcpy (sysno, rinfo+1, sizeof(*sysno));            }            else            {                logf (LOG_WARN, "Bad match criteria");                return 0;            }        }    }    if (! 
*sysno)    {        /* new record */        if (deleteFlag)        {	    logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T, rGroup->recordType,		  fname, recordOffset);            logf (LOG_WARN, "cannot delete record above (seems new)");            return 1;        }        if (zh->records_processed < rGroup->fileVerboseLimit)            logf (LOG_LOG, "add %s %s " PRINTF_OFF_T, rGroup->recordType,                  fname, recordOffset);        rec = rec_new (zh->reg->records);        *sysno = rec->sysno;	recordAttr = rec_init_attr (zh->reg->zei, rec);        if (matchStr)        {            dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno);        }	extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);        extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);        zh->records_inserted++;    }    else    {        /* record already exists */        struct recKeys delkeys;        struct sortKeys sortKeys;        rec = rec_get (zh->reg->records, *sysno);        assert (rec);		recordAttr = rec_init_attr (zh->reg->zei, rec);	if (!force_update && recordAttr->runNumber ==            zebraExplain_runNumberIncrement (zh->reg->zei, 0))	{            yaz_log (LOG_LOG, "run number = %d", recordAttr->runNumber);	    yaz_log (LOG_LOG, "skipped %s %s " PRINTF_OFF_T,                     rGroup->recordType, fname, recordOffset);	    extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys);	    rec_rm (&rec);	    logRecord (zh);	    return 1;	}        delkeys.buf_used = rec->size[recInfo_delKeys];	delkeys.buf = rec->info[recInfo_delKeys];        sortKeys.buf_used = rec->size[recInfo_sortKeys];        sortKeys.buf = rec->info[recInfo_sortKeys];	extract_flushSortKeys (zh, *sysno, 0, &sortKeys);        extract_flushRecordKeys (zh, *sysno, 0, &delkeys);        if (deleteFlag)        {            /* record going to be deleted */            if (!delkeys.buf_used)            {                logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,                      
rGroup->recordType, fname, recordOffset);                logf (LOG_WARN, "cannot delete file above, storeKeys false");            }            else            {                if (zh->records_processed < rGroup->fileVerboseLimit)                    logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,                         rGroup->recordType, fname, recordOffset);                zh->records_deleted++;                if (matchStr)                    dict_delete (zh->reg->matchDict, matchStr);                rec_del (zh->reg->records, &rec);            }	    rec_rm (&rec);            logRecord (zh);            return 1;        }        else        {            /* record going to be updated */            if (!delkeys.buf_used)            {                logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,                      rGroup->recordType, fname, recordOffset);                logf (LOG_WARN, "cannot update file above, storeKeys false");            }            else            {                if (zh->records_processed < rGroup->fileVerboseLimit)                    logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,                        rGroup->recordType, fname, recordOffset);                extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);                extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);                zh->records_updated++;            }        }    }    /* update file type */    xfree (rec->info[recInfo_fileType]);    rec->info[recInfo_fileType] =        rec_strdup (rGroup->recordType, &rec->size[recInfo_fileType]);    /* update filename */    xfree (rec->info[recInfo_filename]);    rec->info[recInfo_filename] =        rec_strdup (fname, &rec->size[recInfo_filename]);    /* update delete keys */    xfree (rec->info[recInfo_delKeys]);    if (zh->reg->keys.buf_used > 0 && rGroup->flagStoreKeys == 1)    {        rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;        rec->info[recInfo_delKeys] = zh->reg->keys.buf;        zh->reg->keys.buf = NULL; 
       zh->reg->keys.buf_max = 0;    }    else    {        rec->info[recInfo_delKeys] = NULL;        rec->size[recInfo_delKeys] = 0;    }    /* update sort keys */    xfree (rec->info[recInfo_sortKeys]);    rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;    rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;    zh->reg->sortKeys.buf = NULL;    zh->reg->sortKeys.buf_max = 0;    /* save file size of original record */    zebraExplain_recordBytesIncrement (zh->reg->zei,                                       - recordAttr->recordSize);    recordAttr->recordSize = fi->file_moffset - recordOffset;    if (!recordAttr->recordSize)	recordAttr->recordSize = fi->file_max - recordOffset;    zebraExplain_recordBytesIncrement (zh->reg->zei,                                       recordAttr->recordSize);    /* set run-number for this record */    recordAttr->runNumber = zebraExplain_runNumberIncrement (zh->reg->zei,                                                             0);    /* update store data */    xfree (rec->info[recInfo_storeData]);    if (rGroup->flagStoreData == 1)    {        rec->size[recInfo_storeData] = recordAttr->recordSize;        rec->info[recInfo_storeData] = (char *)	    xmalloc (recordAttr->recordSize);        if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)        {            logf (LOG_ERRNO|LOG_FATAL, "seek to " PRINTF_OFF_T " in %s",                  recordOffset, fname);            exit (1);        }        if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)	    < recordAttr->recordSize)        {            logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s",                  recordAttr->recordSize, fname);            exit (1);        }    }    else    {        rec->info[recInfo_storeData] = NULL;        rec->size[recInfo_storeData] = 0;    }    /* update database name */    xfree (rec->info[recInfo_databaseName]);    rec->info[recInfo_databaseName] =        rec_strdup (rGroup->databaseName, 
&rec->size[recInfo_databaseName]);     /* update offset */    recordAttr->recordOffset = recordOffset;        /* commit this record */    rec_put (zh->reg->records, &rec);    logRecord (zh);    return 1;}int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname,                  const struct recordGroup *rGroupP, int deleteFlag){    int r, i, fd;    char gprefix[128];    char ext[128];    char ext_res[128];    char subType[128];    RecType recType;    struct recordGroup rGroupM;    struct recordGroup *rGroup = &rGroupM;    struct file_read_info *fi;    void *clientData;    memcpy (rGroup, rGroupP, sizeof(*rGroupP));       if (!rGroup->groupName || !*rGroup->groupName)        *gprefix = '\0';    else        sprintf (gprefix, "%s.", rGroup->groupName);    logf (LOG_DEBUG, "fileExtract %s", fname);    /* determine file extension */    *ext = '\0';    for (i = strlen(fname); --i >= 0; )        if (fname[i] == '/')            break;        else if (fname[i] == '.')        {            strcpy (ext, fname+i+1);            break;        }    /* determine file type - depending on extension */    if (!rGroup->recordType)    {        sprintf (ext_res, "%srecordType.%s", gprefix, ext);        if (!(rGroup->recordType = res_get (zh->res, ext_res)))        {            sprintf (ext_res, "%srecordType", gprefix);            rGroup->recordType = res_get (zh->res, ext_res);        }    }    if (!rGroup->recordType)    {        if (zh->records_processed < rGroup->fileVerboseLimit)            logf (LOG_LOG, "? 
%s", fname);        return 0;    }    if (!*rGroup->recordType)	return 0;    if (!(recType =	  recType_byName (zh->reg->recTypes, rGroup->recordType, subType,			  &clientData)))    {        logf (LOG_WARN, "No such record type: %s", rGroup->recordType);        return 0;    }    /* determine match criteria */    if (!rGroup->recordId)    {        sprintf (ext_res, "%srecordId.%s", gprefix, ext);        rGroup->recordId = res_get (zh->res, ext_res);    }    /* determine database name */    if (!rGroup->databaseName)    {        sprintf (ext_res, "%sdatabase.%s", gprefix, ext);        if (!(rGroup->databaseName = res_get (zh->res, ext_res)))        {            sprintf (ext_res, "%sdatabase", gprefix);            rGroup->databaseName = res_get (zh->res, ext_res);        }    }    if (!rGroup->databaseName)        rGroup->databaseName = "Default";    /* determine if explain database */        sprintf (ext_res, "%sexplainDatabase", gprefix);    rGroup->explainDatabase =	atoi (res_get_def (zh->res, ext_res, "0"));    /* announce database */    if (zebraExplain_curDatabase (zh->reg->zei, rGroup->databaseName))    {        if (zebraExplain_newDatabase (zh->reg->zei, rGroup->databaseName,				      rGroup->explainDatabase))	    return 0;    }    if (rGroup->flagStoreData == -1)    {        const char *sval;        sprintf (ext_res, "%sstoreData.%s", gprefix, ext);        if (!(sval = res_get (zh->res, ext_res)))        {            sprintf (ext_res, "%sstoreData", gprefix);            sval = res_get (zh->res, ext_res);        }        if (sval)            rGroup->flagStoreData = atoi (sval);    }    if (rGroup->flagStoreData == -1)        rGroup->flagStoreData = 0;    if (rGroup->flagStoreKeys == -1)    {        const char *sval;        sprintf (ext_res, "%sstoreKeys.%s", gprefix, ext);        sval = res_get (zh->res, ext_res);	if (!sval)        {            sprintf (ext_res, "%sstoreKeys", gprefix);            sval = res_get (zh->res, ext_res);        }	if (!sval)	    sval 
= res_get (zh->res, "storeKeys");        if (sval)            rGroup->flagStoreKeys = atoi (sval);    }    if (rGroup->flagStoreKeys == -1)        rGroup->flagStoreKeys = 0;    if (sysno && deleteFlag)        fd = -1;    else    {        char full_rep[1024];        if (zh->path_reg && !yaz_is_abspath (fname))        {            strcpy (full_rep, zh->path_reg);            strcat (full_rep, "/");            strcat (full_rep, fname);        }        else            strcpy (full_rep, fname);                if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1)        {            logf (LOG_WARN|LOG_ERRNO, "open %s", full_rep);            return 0;        }    }    fi = file_read_start (fd);    do    {        file_begin (fi);        r = recordExtract (zh, sysno, fname, rGroup, deleteFlag, fi,                           recType, subType, clientData, 1);    } while (r && !sysno && fi->file_more);    file_read_stop (fi);    if (fd != -1)        close (fd);    return r;}int extract_rec_in_mem (ZebraHandle zh, const char *recordType,                        const char *buf, size_t buf_size,                        const char *databaseName, int delete_flag,                        int test_mode, int *sysno,                        int store_keys, int store_data,                        const char *match_criteria){    struct recordGroup rGroup;    rGroup.groupName = NULL;    rGroup.databaseName = (char *)databaseName;    rGroup.path = NULL;    rGroup.recordId = NULL;    rGroup.recordType = (char *)recordType;    rGroup.flagStoreData = store_data;    rGroup.flagStoreKeys = store_keys;    rGroup.flagRw = 1;    rGroup.databaseNamePath = 0;    rGroup.explainDatabase = 0;    rGroup.fileVerboseLimit = 100000;    rGroup.followLinks = -1;    return (bufferExtractRecord (zh,				 buf, buf_size,				 &rGroup,				 delete_flag,				 test_mode,				 recordType,				 sysno,				 match_criteria,				 "<no file>",				 0,1));}/*  If sysno is provided, then it's used to identify the reocord.  
If not, and match_criteria is provided, then sysno is guessed  If not, and a record is provided, then sysno is got from there   */int bufferExtractRecord (ZebraHandle zh, 			 const char *buf, size_t buf_size,			 struct recordGroup *rGroup, 			 int delete_flag,			 int test_mode, 			 const char *recordType,			 int *sysno,			 const char *match_criteria,			 const char *fname,			 int force_update,			 int allow_update){    RecordAttr *recordAttr;    struct recExtractCtrl extractCtrl;    int i, r;    char *matchStr = 0;    RecType recType = NULL;    char subType[1024];    void *clientData;    Record rec;    long recordOffset = 0;    struct zebra_fetch_control fc;    fc.fd = -1;    fc.record_int_buf = buf;    fc.record_int_len = buf_size;    fc.record_int_pos = 0;    fc.offset_end = 0;    fc.record_offset = 0;    extractCtrl.offset = 0;    extractCtrl.readf = zebra_record_int_read;    extractCtrl.seekf = zebra_record_int_seek;    extractCtrl.tellf = zebra_record_int_tell;    extractCtrl.endf = zebra_record_int_end;    extractCtrl.fh = &fc;    zh->reg->keys.buf_used = 0;    zh->reg->keys.prevAttrUse = -1;    zh->reg->keys.prevAttrSet = -1;    zh->reg->keys.prevSeqNo = 0;    zh->reg->sortKeys.buf_used = 0;    /* announce database */    if (!(rGroup->databaseName)) {        logf (LOG_WARN, "Invalid record group, no database name given");	return 0;    }        if (zebraExplain_curDatabase (zh->reg->zei, rGroup->databaseName))    {        if (zebraExplain_newDatabase (zh->reg->zei, rGroup->databaseName, 0))            return 0;    }        if (*recordType) {        logf (LOG_DEBUG, "Record type explicitly specified: %s", recordType);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -