⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mozcache.c

📁 网络爬虫程序
💻 C
字号:
/***************************************************************************//*    This code is part of WWW grabber called pavuk                        *//*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          *//*    Distributed under GPL 2 or later                                     *//***************************************************************************/#include "config.h"#include <sys/types.h>#include <sys/stat.h>#include <netinet/in.h>#include <unistd.h>#include <time.h>#include <limits.h>#include <fcntl.h>#include <string.h>#include <stdlib.h>#include <stdio.h>#include "dlhash.h"#include "dlhash_tools.h"#include "mozcache.h"#include "tools.h"typedef int int32__t;typedef unsigned int uint32__t;#ifdef HAVE_BDB_18x#ifdef HAVE_DB1_H#include <db1/db.h>#elif defined(HAVE_DB185_H)#include <db_185.h>#elif defined(HAVE_DB2_DB185_H)#include <db2/db_185.h>#elif defined(HAVE_DB3_DB185_H)#include <db3/db_185.h>#elif defined(HAVE_DB4_DB185_H)#include <db4/db_185.h>#else#include <db.h>#endif#ifdef WORDS_BIGENDIAN#define COPY_INT32(i1,i2) \  ((char *)(i1))[0] = ((char *)(i2))[3]; \  ((char *)(i1))[1] = ((char *)(i2))[2]; \  ((char *)(i1))[2] = ((char *)(i2))[1]; \  ((char *)(i1))[3] = ((char *)(i2))[0];#else#define COPY_INT32(i1,i2)  *((int32__t *) i1) = *((int32__t *) i2);#endif#endif /* HAVE_BDB_18x */static char *moz_cache_filename = NULL;static time_t moz_cache_db_modtime = 0L;static dlhash *moz_hash = NULL;typedef struct{  char *urlstr;  char *filename;} moz_pair_t;static moz_pair_t *moz_pair_new(char *f, char *u){  moz_pair_t *rv = NULL;  if(f && u)  {    rv = _malloc(sizeof(moz_pair_t));    rv->filename = tl_strdup(f);    rv->urlstr = tl_strdup(u);  }  return rv;}static void moz_pair_free(dllist_t dl){  moz_pair_t * mp = (moz_pair_t *)dl;  _free(mp->filename);  _free(mp->urlstr);  _free(mp);}static dllist_t moz_pair_get_urlstr(dllist_t mp){  return (dllist_t) ((moz_pair_t *)mp)->urlstr;}#ifdef HAVE_BDB_18xstatic char *moz_old_cache_get_filename(DBT * data){  char *tmp = data->data;  uint32__t size;  COPY_INT32(&size, tmp);  if(size != data->size || data->size < 20)    return NULL;  tmp += sizeof(uint32__t)      /* size */    + sizeof(int32__t);         /* recordid */  COPY_INT32(&size, tmp);  tmp += sizeof(uint32__t) + size;      /* key_len + key */  COPY_INT32(&size, tmp);  tmp += sizeof(uint32__t) + size;      /* meta_len + meta */  tmp += sizeof(uint32__t);     /* filename_len */  return tmp;}static char *moz_old_cache_get_urlstr(DBT * data){  char *tmp = data->data;  uint32__t size;  COPY_INT32(&size, tmp);  if(size != data->size || data->size < 20)    return NULL;  tmp += sizeof(uint32__t)      /* size */    + sizeof(int32__t);         /* recordid */  COPY_INT32(&size, tmp);  tmp += sizeof(uint32__t);     /* key_len */  return tmp;}static void moz_old_cache_read_db(char *dirname){  bool_t reopen = FALSE;  HASHINFO hash_info = {    4 * 1024,                   /* bucket size */    0,                          /* fill factor */    0,                          /* number of elements */    96 * 1024,                  /* bytes to cache */    0,                          /* hash function */    0  };                            /* byte order */  char pom[PATH_MAX];  if(dirname)  {    struct stat estat;    snprintf(pom, sizeof(pom), "%s/cache.db", dirname);    if(!moz_cache_filename || strcmp(pom, moz_cache_filename))      reopen = TRUE;    if(!reopen && !stat(pom, &estat) &&      estat.st_mtime != moz_cache_db_modtime)      reopen = TRUE;  }  if(reopen || !dirname)  {    _free(moz_cache_filename);    moz_cache_db_modtime = 0L;    dlhash_empty(moz_hash);  }  if(reopen)  {    struct stat estat;    DB *moz_cache_db;    DBT key, data;    moz_cache_db = dbopen(pom, O_RDONLY, 0600, DB_HASH, &hash_info);    if(!moz_cache_db)    {      xprintf(0, gettext("Unable to open Mozilla cache index - %s\n"), pom);    }    else    {      if(!stat(pom, &estat))        moz_cache_db_modtime = estat.st_mtime;      _free(moz_cache_filename);      moz_cache_filename = tl_strdup(pom);      while(!moz_cache_db->seq(moz_cache_db, &key, &data, R_NEXT))      {        moz_pair_t *mp;        mp = moz_pair_new(moz_old_cache_get_filename(&data),          moz_old_cache_get_urlstr(&data));        if(mp)          dlhash_insert(moz_hash, mp);      }      moz_cache_db->close(moz_cache_db);    }  }}#endif /* HAVE_BDB_18x */#define MOZ2_REC_PER_BUCKET 256#define MOZ2_REC_SIZE   16#define MOZ2_BITMAP_SIZE  4096typedef struct{  uint32__t version;  int32__t size;  int32__t nentries;  uint32__t dirty;} moz2_cache_map_head;typedef struct{  uint32__t hashnr;  uint32__t erank;  uint32__t data_loc;  uint32__t meta_loc;} moz2_cache_map_record;typedef struct{  uint32__t version;  uint32__t meta_loc;  uint32__t fetch_count;  uint32__t last_fetched;  uint32__t last_modified;  uint32__t expiration;  uint32__t data_size;  uint32__t key_size;  uint32__t meta_size;} moz2_cache_record_header;static char *moz2_db_dir = NULL;static struct{  enum  {    MOZ2_MAP_FILE,    MOZ2_B1_FILE,    MOZ2_B2_FILE,    MOZ2_B3_FILE,    MOZ2_LAST  } id;  int fd;  int block_size;  char *name;} moz2_files[] ={  {MOZ2_MAP_FILE, -1, 0, "_CACHE_MAP_"},  {MOZ2_B1_FILE, -1, 256, "_CACHE_001_"},  {MOZ2_B2_FILE, -1, 1024, "_CACHE_002_"},  {MOZ2_B3_FILE, -1, 4096, "_CACHE_003_"}};#define MOZ2_LOC_INITED(v)     ((v) & 0x80000000)#define MOZ2_LOC_FILE(v)      (((v) & 0x30000000) >> 28)#define MOZ2_LOC_BLOCKS(v)    (((v) & 0x03000000) >> 24)#define MOZ2_LOC_BLOCKNR(v)    ((v) & 0x00FFFFFF)#define MOZ2_LOC_GENERATION(v) ((v) & 0xFF)static int moz2_open_db(char *name){  char *p;  char pom[PATH_MAX];  int i, len, rv = 0;  if(!name)    return -1;  strncpy(pom, name, PATH_MAX);  pom[PATH_MAX - 1] = '\0';  p = pom + strlen(pom);  len = PATH_MAX - strlen(pom);  for(i = 0; i < MOZ2_LAST; i++)  {    strncpy(p, moz2_files[i].name, len);    pom[PATH_MAX - 1] = '\0';    moz2_files[i].fd = open(pom, O_RDWR);    if(moz2_files[i].fd < 0)      moz2_files[i].fd = open(pom, O_RDONLY);    if(moz2_files[i].fd < 0)    {      xprintf(1, gettext("Error opening cache index file %s\n"));      xperror(pom);      rv = -1;      break;    }  }  if(!rv)  {    char bucket[MOZ2_REC_SIZE * MOZ2_REC_PER_BUCKET];    moz2_cache_map_head *cmh;    cmh = (moz2_cache_map_head *) bucket;    if((read(moz2_files[MOZ2_MAP_FILE].fd, bucket, sizeof(bucket))        != sizeof(bucket)) || (ntohl(cmh->version) != 0x00010003))    {      xprintf(1, gettext("Mozilla new cache map file format not recognized"));      rv = -1;    }  }  if(!rv)  {    *p = '\0';    moz2_db_dir = tl_strdup(pom);  }  return rv;}static void moz2_close_db(void){  int i;  for(i = 0; i < MOZ2_LAST; i++)  {    if(moz2_files[i].fd >= 0)    {      close(moz2_files[i].fd);      moz2_files[i].fd = -1;    }  }}static char *moz2_get_filename(int type, uint32__t hashnr,  uint32__t generation){  char pom[PATH_MAX];  snprintf(pom, sizeof(pom), "%s%08X%c%02X", moz2_db_dir, hashnr, (char) type, generation);  return tl_strdup(pom);}static char *load_blocks(int id, uint32__t blocknr, uint32__t blocks){  int ofs, sz;  char *rv;  ofs = MOZ2_BITMAP_SIZE + moz2_files[id].block_size * blocknr;  if(lseek(moz2_files[id].fd, ofs, SEEK_SET) != ofs)  {    xprintf(1, gettext("Corrupted Mozilla cache blockfile!"));    return NULL;  }  sz = moz2_files[id].block_size * (blocks + 1);  rv = _malloc(sz);  if(read(moz2_files[id].fd, rv, sz) != sz)  {    xprintf(1, gettext("Corrupted Mozilla cache blockfile!"));    _free(rv);    return NULL;  }  return rv;}static moz2_cache_record_header *moz2_load_record_data(moz2_cache_map_record *  rec){  char *p;  if(!MOZ2_LOC_FILE(rec->meta_loc))  {    /* load from file */    char *fn;    fn = moz2_get_filename('m', rec->hashnr,      MOZ2_LOC_GENERATION(rec->meta_loc));    p = tl_load_text_file(fn);    _free(fn);  }  else  {    p = load_blocks(moz2_files[MOZ2_LOC_FILE(rec->meta_loc)].id,      MOZ2_LOC_BLOCKNR(rec->meta_loc), MOZ2_LOC_BLOCKS(rec->meta_loc));  }  return (moz2_cache_record_header *) p;}static moz_pair_t *moz2_compose_record(moz2_cache_map_record * rec,  moz2_cache_record_header * meta){  moz_pair_t *ret = NULL;  char *u, *p;  p = moz2_get_filename('d', rec->hashnr, MOZ2_LOC_GENERATION(rec->data_loc));  u = (char *) meta;  u += sizeof(moz2_cache_record_header);  u = strchr(u, ':') + 1;  ret = moz_pair_new(p, u);  return ret;}static moz_pair_t *moz2_get_record_info(moz2_cache_map_record * rec){  moz2_cache_record_header *meta;  moz_pair_t *rv = NULL;  if(!rec->hashnr)    return NULL;  if(!MOZ2_LOC_INITED(rec->data_loc) || !MOZ2_LOC_INITED(rec->meta_loc))    return NULL;  /* don't know how to handle data file in block files */  if(MOZ2_LOC_FILE(rec->data_loc))    return NULL;  meta = moz2_load_record_data(rec);  if(meta && (ntohl(meta->version) == 0x00010003))    rv = moz2_compose_record(rec, meta);  _free(meta);  return rv;}static void moz2_read_db(char *dbname){  int l, i;  char bucket[MOZ2_REC_SIZE * MOZ2_REC_PER_BUCKET];  char *p;  if(moz2_open_db(dbname))    return;  lseek(moz2_files[MOZ2_MAP_FILE].fd, sizeof(bucket), SEEK_SET);  for(;;)  {    l = read(moz2_files[MOZ2_MAP_FILE].fd, bucket, sizeof(bucket));    if(!l)      break;    if(l != sizeof(bucket))    {      xprintf(1, gettext("Corrupted Mozilla cache map file!"));      break;    }    p = bucket;    for(i = 0; i < MOZ2_REC_PER_BUCKET; i++, p += MOZ2_REC_SIZE)    {      moz2_cache_map_record *cmr = (moz2_cache_map_record *) p;      moz_pair_t *mp;      mp = moz2_get_record_info(cmr);      if(mp)        dlhash_insert(moz_hash, (dllist_t) mp);    }  }  moz2_close_db();}static void moz_new_cache_read_db(char *dirname){  bool_t reopen = FALSE;  char pom[PATH_MAX];  if(dirname)  {    struct stat estat;    if(!moz_cache_filename || strcmp(dirname, moz_cache_filename))      reopen = TRUE;    snprintf(pom, sizeof(pom), "%s/_CACHE_MAP_", dirname);    if(!reopen && !stat(pom, &estat) &&      estat.st_mtime != moz_cache_db_modtime)    {      reopen = TRUE;      moz_cache_db_modtime = estat.st_mtime;    }  }  if(reopen || !dirname)  {    _free(moz_cache_filename);    moz_cache_db_modtime = 0L;    dlhash_empty(moz_hash);  }  if(reopen)  {    moz2_read_db(dirname);  }}static void moz_cache_read_db(char *dirname){  char pom[PATH_MAX];  if(!moz_hash)  {    moz_hash = dlhash_new(256, moz_pair_get_urlstr,    str_hash_func, str_comp_func);    dlhash_set_free_func(moz_hash, moz_pair_free, NULL);  }  snprintf(pom, sizeof(pom), "%s/_CACHE_MAP_", dirname);#ifdef HAVE_BDB_18x  if(access(pom, F_OK))    moz_old_cache_read_db(dirname);  else#endif /* HAVE_BDB_18x */    moz_new_cache_read_db(dirname);}char *moz_cache_find_localname(char *urlstr){  char *retv = NULL;  LOCK_NSCACHE;  moz_cache_read_db(priv_cfg.moz_cache_dir);  if(moz_hash)  {    moz_pair_t *mp;    mp = (moz_pair_t *) dlhash_find_by_key(moz_hash, (dllist_t) urlstr);    if(mp)      retv = tl_strdup(mp->filename);  }  UNLOCK_NSCACHE;  return retv;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -