⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trav.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* $Id: trav.c,v 1.43 2003/10/20 19:26:05 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#include <sys/types.h>#ifdef WIN32#include <io.h>#define S_ISREG(x) (x & _S_IFREG)#define S_ISDIR(x) (x & _S_IFDIR)#else#include <unistd.h>#endif#include <direntz.h>#include <fcntl.h>#include <time.h>#include "index.h"static int repComp (const char *a, const char *b, size_t len){    if (!len)        return 0;    return memcmp (a, b, len);}static void repositoryExtractR (ZebraHandle zh, int deleteFlag, char *rep,                                struct recordGroup *rGroup,				int level){    struct dir_entry *e;    int i;    size_t rep_len = strlen (rep);    e = dir_open (rep, zh->path_reg, rGroup->followLinks);    if (!e)        return;    yaz_log (LOG_LOG, "dir %s", rep);    if (rep[rep_len-1] != '/')        rep[rep_len] = '/';    else        --rep_len;        for (i=0; e[i].name; i++)    {	char *ecp;        strcpy (rep +rep_len+1, e[i].name);	if ((ecp = strrchr (e[i].name, '/')))	    *ecp = '\0';	if (level == 0 && rGroup->databaseNamePath)	    rGroup->databaseName = e[i].name;        switch (e[i].kind)        {        case dirs_file:            fileExtract (zh, NULL, rep, rGroup, deleteFlag);            break;        case 
dirs_dir:            repositoryExtractR (zh, deleteFlag, rep, rGroup, level+1);            break;        }    }    dir_free (&e);}static void fileDeleteR (ZebraHandle zh,                         struct dirs_info *di, struct dirs_entry *dst,                         const char *base, char *src,                         struct recordGroup *rGroup){    char tmppath[1024];    size_t src_len = strlen (src);    while (dst && !repComp (dst->path, src, src_len+1))    {        switch (dst->kind)        {        case dirs_file:            sprintf (tmppath, "%s%s", base, dst->path);            fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);                         strcpy (tmppath, dst->path);            dst = dirs_read (di);             dirs_del (di, tmppath);            break;        case dirs_dir:            strcpy (tmppath, dst->path);            dst = dirs_read (di);            dirs_rmdir (di, tmppath);            break;        default:            dst = dirs_read (di);        }    }}static void fileUpdateR (ZebraHandle zh,                         struct dirs_info *di, struct dirs_entry *dst,			 const char *base, char *src, 			 struct recordGroup *rGroup,			 int level){    struct dir_entry *e_src;    int i_src = 0;    static char tmppath[1024];    size_t src_len = strlen (src);    sprintf (tmppath, "%s%s", base, src);    e_src = dir_open (tmppath, zh->path_reg, rGroup->followLinks);    yaz_log (LOG_LOG, "dir %s", tmppath);#if 0    if (!dst || repComp (dst->path, src, src_len))#else    if (!dst || strcmp (dst->path, src))#endif    {        if (!e_src)            return;        if (src_len && src[src_len-1] != '/')        {            src[src_len] = '/';            src[++src_len] = '\0';        }        dirs_mkdir (di, src, 0);        if (dst && repComp (dst->path, src, src_len))            dst = NULL;    }    else if (!e_src)    {        strcpy (src, dst->path);        fileDeleteR (zh, di, dst, base, src, rGroup);        return;    }    else    {        if (src_len && 
src[src_len-1] != '/')        {            src[src_len] = '/';            src[++src_len] = '\0';        }        dst = dirs_read (di);     }    dir_sort (e_src);    while (1)    {        int sd;        if (dst && !repComp (dst->path, src, src_len))        {            if (e_src[i_src].name)            {                yaz_log (LOG_DEBUG, "dst=%s src=%s", dst->path + src_len,		      e_src[i_src].name);                sd = strcmp (dst->path + src_len, e_src[i_src].name);            }            else                sd = -1;        }        else if (e_src[i_src].name)            sd = 1;        else            break;        yaz_log (LOG_DEBUG, "trav sd=%d", sd);	if (level == 0 && rGroup->databaseNamePath)	    rGroup->databaseName = e_src[i_src].name;        if (sd == 0)        {            strcpy (src + src_len, e_src[i_src].name);            sprintf (tmppath, "%s%s", base, src);                        switch (e_src[i_src].kind)            {            case dirs_file:                if (e_src[i_src].mtime > dst->mtime)                {                    if (fileExtract (zh, &dst->sysno, tmppath, rGroup, 0))                    {                        dirs_add (di, src, dst->sysno, e_src[i_src].mtime);                    }		    yaz_log (LOG_DEBUG, "old: %s", ctime (&dst->mtime));                    yaz_log (LOG_DEBUG, "new: %s", ctime (&e_src[i_src].mtime));                }                dst = dirs_read (di);                break;            case dirs_dir:                fileUpdateR (zh, di, dst, base, src, rGroup, level+1);                dst = dirs_last (di);                yaz_log (LOG_DEBUG, "last is %s", dst ? 
dst->path : "null");                break;            default:                dst = dirs_read (di);             }            i_src++;        }        else if (sd > 0)        {            SYSNO sysno = 0;            strcpy (src + src_len, e_src[i_src].name);            sprintf (tmppath, "%s%s", base, src);            switch (e_src[i_src].kind)            {            case dirs_file:                if (fileExtract (zh, &sysno, tmppath, rGroup, 0))                    dirs_add (di, src, sysno, e_src[i_src].mtime);                            break;            case dirs_dir:                fileUpdateR (zh, di, dst, base, src, rGroup, level+1);                if (dst)                    dst = dirs_last (di);                break;            }            i_src++;        }        else  /* sd < 0 */        {            strcpy (src, dst->path);            sprintf (tmppath, "%s%s", base, dst->path);            switch (dst->kind)            {            case dirs_file:                fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);                dirs_del (di, dst->path);                dst = dirs_read (di);                break;            case dirs_dir:                fileDeleteR (zh, di, dst, base, src, rGroup);                dst = dirs_last (di);            }        }    }    dir_free (&e_src);}static void groupRes (ZebraHandle zh, struct recordGroup *rGroup){    char resStr[256];    char gPrefix[256];    if (!rGroup->groupName || !*rGroup->groupName)        *gPrefix = '\0';    else        sprintf (gPrefix, "%s.", rGroup->groupName);    sprintf (resStr, "%srecordId", gPrefix);    rGroup->recordId = res_get (zh->res, resStr);    sprintf (resStr, "%sdatabasePath", gPrefix);    rGroup->databaseNamePath =	atoi (res_get_def (zh->res, resStr, "0"));    rGroup->databaseNamePath =	atoi (res_get_def (zh->res, resStr, "0"));    if (rGroup->followLinks == -1)    {        sprintf (resStr, "%sfollowLinks", gPrefix);        rGroup->followLinks =             atoi (res_get_def (zh->res, 
resStr, "1"));    }}void repositoryShow (ZebraHandle zh)                     {    struct recordGroup *rGroup = &zh->rGroup;    char src[1024];    int src_len;    struct dirs_entry *dst;    Dict dict;    struct dirs_info *di;        if (!(dict = dict_open (zh->reg->bfs, FMATCH_DICT, 50, 0, 0)))    {        yaz_log (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);	return;    }        assert (rGroup->path);        strcpy (src, rGroup->path);    src_len = strlen (src);        if (src_len && src[src_len-1] != '/')    {        src[src_len] = '/';        src[++src_len] = '\0';    }        di = dirs_open (dict, src, rGroup->flagRw);        while ( (dst = dirs_read (di)) )        yaz_log (LOG_LOG, "%s", dst->path);    dirs_free (&di);    dict_close (dict);}static void fileUpdate (ZebraHandle zh,                        Dict dict, struct recordGroup *rGroup,                        const char *path){    struct dirs_info *di;    struct stat sbuf;    char src[1024];    char dst[1024];    int src_len, ret;    assert (path);    if (zh->path_reg && !yaz_is_abspath(path))    {        strcpy (src, zh->path_reg);        strcat (src, "/");    }    else        *src = '\0';    strcat (src, path);    ret = zebra_file_stat (src, &sbuf, rGroup->followLinks);    strcpy (src, path);    src_len = strlen (src);    if (ret == -1)    {        yaz_log (LOG_WARN|LOG_ERRNO, "Cannot access path %s", src);    }     else if (S_ISREG(sbuf.st_mode))    {        struct dirs_entry *e_dst;        di = dirs_fopen (dict, src);        e_dst = dirs_read (di);        if (e_dst)        {            if (sbuf.st_mtime > e_dst->mtime)                if (fileExtract (zh, &e_dst->sysno, src, rGroup, 0))                    dirs_add (di, src, e_dst->sysno, sbuf.st_mtime);        }        else        {            SYSNO sysno = 0;            if (fileExtract (zh, &sysno, src, rGroup, 0))                 dirs_add (di, src, sysno, sbuf.st_mtime);        }        dirs_free (&di);    }    else if (S_ISDIR(sbuf.st_mode))    {   
     if (src_len && src[src_len-1] != '/')        {            src[src_len] = '/';            src[++src_len] = '\0';        }        di = dirs_open (dict, src, rGroup->flagRw);        *dst = '\0';        fileUpdateR (zh, di, dirs_read (di), src, dst, rGroup, 0);        dirs_free (&di);    }    else    {        yaz_log (LOG_WARN, "Skipping path %s", src);    }}static void repositoryExtract (ZebraHandle zh,                               int deleteFlag, struct recordGroup *rGroup,                               const char *path){    struct stat sbuf;    char src[1024];    int ret;    assert (path);    if (zh->path_reg && !yaz_is_abspath(path))    {        strcpy (src, zh->path_reg);        strcat (src, "/");    }    else        *src = '\0';    strcat (src, path);    ret = zebra_file_stat (src, &sbuf, rGroup->followLinks);    strcpy (src, path);    if (ret == -1)        yaz_log (LOG_WARN|LOG_ERRNO, "Cannot access path %s", src);    else if (S_ISREG(sbuf.st_mode))        fileExtract (zh, NULL, src, rGroup, deleteFlag);    else if (S_ISDIR(sbuf.st_mode))	repositoryExtractR (zh, deleteFlag, src, rGroup, 0);    else        yaz_log (LOG_WARN, "Skipping path %s", src);}static void repositoryExtractG (ZebraHandle zh,                                int deleteFlag, struct recordGroup *rGroup){    if (*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))    {        char src[1024];        while (scanf ("%1020s", src) == 1)            repositoryExtract (zh, deleteFlag, rGroup, src);    }    else        repositoryExtract (zh, deleteFlag, rGroup, rGroup->path);}void repositoryUpdate (ZebraHandle zh){    struct recordGroup *rGroup = &zh->rGroup;    groupRes (zh, rGroup);    assert (rGroup->path);    if (rGroup->recordId && !strcmp (rGroup->recordId, "file"))    {        Dict dict;        if (!(dict = dict_open (zh->reg->bfs, FMATCH_DICT, 50,				rGroup->flagRw, 0)))        {            yaz_log (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);	    return ;        }        if 
(*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))        {            char src[1024];            while (scanf ("%s", src) == 1)                fileUpdate (zh, dict, rGroup, src);        }        else            fileUpdate (zh, dict, rGroup, rGroup->path);        dict_close (dict);    }    else         repositoryExtractG (zh, 0, rGroup);}void repositoryDelete (ZebraHandle zh){    struct recordGroup *rGroup = &zh->rGroup;    groupRes (zh, rGroup);    assert (rGroup->path);    repositoryExtractG (zh, 1, rGroup);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -