⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regxread.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
/* $Id: regxread.c,v 1.49 2003/09/16 13:56:52 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <yaz/tpath.h>#include <zebrautl.h>#include <dfa.h>#include "grsread.h"#if HAVE_TCL_H#include <tcl.h>#if MAJOR_VERSION >= 8#define HAVE_TCL_OBJECTS#endif#endif#define REGX_DEBUG 0#define F_WIN_EOF 2000000000#define F_WIN_READ 1#define REGX_EOF     0#define REGX_PATTERN 1#define REGX_BODY    2#define REGX_BEGIN   3#define REGX_END     4#define REGX_CODE    5#define REGX_CONTEXT 6#define REGX_INIT    7struct regxCode {    char *str;#if HAVE_TCL_OBJECTS    Tcl_Obj *tcl_obj;#endif};struct lexRuleAction {    int which;     union {        struct {            struct DFA *dfa;    /* REGX_PATTERN */            int body;        } pattern;        struct regxCode *code;  /* REGX_CODE */    } u;    struct lexRuleAction *next;};struct lexRuleInfo {    int no;    struct lexRuleAction *actionList;};struct lexRule {    struct lexRuleInfo info;    struct lexRule *next;};struct lexContext {    char *name;    struct DFA *dfa;    struct lexRule *rules;    struct lexRuleInfo **fastRule;    int ruleNo;    int initFlag;    struct lexRuleAction *beginActionList;    struct lexRuleAction *endActionList; 
   struct lexRuleAction *initActionList;    struct lexContext *next;};struct lexConcatBuf {    int max;    char *buf;};struct lexSpec {    char *name;    struct lexContext *context;    struct lexContext **context_stack;    int context_stack_size;    int context_stack_top;    int lineNo;    NMEM m;    data1_handle dh;#if HAVE_TCL_H    Tcl_Interp *tcl_interp;#endif    void *f_win_fh;    void (*f_win_ef)(void *, off_t);    int f_win_start;      /* first byte of buffer is this file offset */    int f_win_end;        /* last byte of buffer is this offset - 1 */    int f_win_size;       /* size of buffer */    char *f_win_buf;      /* buffer itself */    int (*f_win_rf)(void *, char *, size_t);    off_t (*f_win_sf)(void *, off_t);    struct lexConcatBuf *concatBuf;    int maxLevel;    data1_node **d1_stack;    int d1_level;    int stop_flag;        int *arg_start;    int *arg_end;    int arg_no;    int ptr;};struct lexSpecs {    struct lexSpec *spec;};static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,                        int *size){    int i, r, off = start_pos - spec->f_win_start;    if (off >= 0 && end_pos <= spec->f_win_end)    {        *size = end_pos - start_pos;        return spec->f_win_buf + off;    }    if (off < 0 || start_pos >= spec->f_win_end)    {        (*spec->f_win_sf)(spec->f_win_fh, start_pos);        spec->f_win_start = start_pos;        if (!spec->f_win_buf)            spec->f_win_buf = (char *) xmalloc (spec->f_win_size);        *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,                                  spec->f_win_size);        spec->f_win_end = spec->f_win_start + *size;        if (*size > end_pos - start_pos)            *size = end_pos - start_pos;        return spec->f_win_buf;    }    for (i = 0; i<spec->f_win_end - start_pos; i++)        spec->f_win_buf[i] = spec->f_win_buf[i + off];    r = (*spec->f_win_rf)(spec->f_win_fh,                          spec->f_win_buf + i,                          
spec->f_win_size - i);    spec->f_win_start = start_pos;    spec->f_win_end += r;    *size = i + r;    if (*size > end_pos - start_pos)        *size = end_pos - start_pos;    return spec->f_win_buf;}static int f_win_advance (struct lexSpec *spec, int *pos){    int size;    char *buf;        if (*pos >= spec->f_win_start && *pos < spec->f_win_end)        return spec->f_win_buf[(*pos)++ - spec->f_win_start];    if (*pos == F_WIN_EOF)        return 0;    buf = f_win_get (spec, *pos, *pos+1, &size);    if (size == 1)    {        (*pos)++;        return *buf;    }    *pos = F_WIN_EOF;    return 0;}static void regxCodeDel (struct regxCode **pp){    struct regxCode *p = *pp;    if (p)    {#if HAVE_TCL_OBJECTS	if (p->tcl_obj)	    Tcl_DecrRefCount (p->tcl_obj);#endif        xfree (p->str);         xfree (p);        *pp = NULL;    }}static void regxCodeMk (struct regxCode **pp, const char *buf, int len){    struct regxCode *p;    p = (struct regxCode *) xmalloc (sizeof(*p));    p->str = (char *) xmalloc (len+1);    memcpy (p->str, buf, len);    p->str[len] = '\0';#if HAVE_TCL_OBJECTS    p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);    if (p->tcl_obj)	Tcl_IncrRefCount (p->tcl_obj);#endif    *pp = p;}static struct DFA *lexSpecDFA (void){    struct DFA *dfa;    dfa = dfa_init ();    dfa_parse_cmap_del (dfa, ' ');    dfa_parse_cmap_del (dfa, '\t');    dfa_parse_cmap_add (dfa, '/', 0);    return dfa;}static void actionListDel (struct lexRuleAction **rap){    struct lexRuleAction *ra1, *ra;    for (ra = *rap; ra; ra = ra1)    {        ra1 = ra->next;        switch (ra->which)        {        case REGX_PATTERN:            dfa_delete (&ra->u.pattern.dfa);            break;        case REGX_CODE:            regxCodeDel (&ra->u.code);            break;        }        xfree (ra);    }    *rap = NULL;}static struct lexContext *lexContextCreate (const char *name){    struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));    p->name = xstrdup (name);    p->ruleNo = 1;    
p->initFlag = 0;    p->dfa = lexSpecDFA ();    p->rules = NULL;    p->fastRule = NULL;    p->beginActionList = NULL;    p->endActionList = NULL;    p->initActionList = NULL;    p->next = NULL;    return p;}static void lexContextDestroy (struct lexContext *p){    struct lexRule *rp, *rp1;    dfa_delete (&p->dfa);    xfree (p->fastRule);    for (rp = p->rules; rp; rp = rp1)    {	rp1 = rp->next;        actionListDel (&rp->info.actionList);        xfree (rp);    }    actionListDel (&p->beginActionList);    actionListDel (&p->endActionList);    actionListDel (&p->initActionList);    xfree (p->name);    xfree (p);}static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh){    struct lexSpec *p;    int i;        p = (struct lexSpec *) xmalloc (sizeof(*p));    p->name = (char *) xmalloc (strlen(name)+1);    strcpy (p->name, name);#if HAVE_TCL_H    p->tcl_interp = 0;#endif    p->dh = dh;    p->context = NULL;    p->context_stack_size = 100;    p->context_stack = (struct lexContext **)	xmalloc (sizeof(*p->context_stack) * p->context_stack_size);    p->f_win_buf = NULL;    p->maxLevel = 128;    p->concatBuf = (struct lexConcatBuf *)	xmalloc (sizeof(*p->concatBuf) * p->maxLevel);    for (i = 0; i < p->maxLevel; i++)    {	p->concatBuf[i].max = 0;	p->concatBuf[i].buf = 0;    }    p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);    p->d1_level = 0;    return p;}static void lexSpecDestroy (struct lexSpec **pp){    struct lexSpec *p;    struct lexContext *lt;    int i;    assert (pp);    p = *pp;    if (!p)        return ;    for (i = 0; i < p->maxLevel; i++)	xfree (p->concatBuf[i].buf);    xfree (p->concatBuf);    lt = p->context;    while (lt)    {	struct lexContext *lt_next = lt->next;	lexContextDestroy (lt);	lt = lt_next;    }#if HAVE_TCL_OBJECTS    if (p->tcl_interp)	Tcl_DeleteInterp (p->tcl_interp);#endif    xfree (p->name);    xfree (p->f_win_buf);    xfree (p->context_stack);    xfree (p->d1_stack);    xfree (p);    *pp = 
NULL;}static int readParseToken (const char **cpp, int *len){    const char *cp = *cpp;    char cmd[32];    int i, level;    while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')        cp++;    switch (*cp)    {    case '\0':        return 0;    case '/':        *cpp = cp+1;        return REGX_PATTERN;    case '{':        *cpp = cp+1;        level = 1;        while (*++cp)        {            if (*cp == '{')                level++;            else if (*cp == '}')            {                level--;                if (level == 0)                    break;            }        }        *len = cp - *cpp;        return REGX_CODE;    default:        i = 0;        while (1)        {            if (*cp >= 'a' && *cp <= 'z')                cmd[i] = *cp;            else if (*cp >= 'A' && *cp <= 'Z')                cmd[i] = *cp + 'a' - 'A';            else                break;            if (i < (int) sizeof(cmd)-2)		i++;            cp++;        }        cmd[i] = '\0';        if (i == 0)        {            logf (LOG_WARN, "bad character %d %c", *cp, *cp);            cp++;            while (*cp && *cp != ' ' && *cp != '\t' &&                   *cp != '\n' && *cp != '\r')                cp++;            *cpp = cp;            return 0;        }        *cpp = cp;        if (!strcmp (cmd, "begin"))            return REGX_BEGIN;        else if (!strcmp (cmd, "end"))            return REGX_END;        else if (!strcmp (cmd, "body"))            return REGX_BODY;	else if (!strcmp (cmd, "context"))	    return REGX_CONTEXT;	else if (!strcmp (cmd, "init"))	    return REGX_INIT;        else        {            logf (LOG_WARN, "bad command %s", cmd);            return 0;        }    }}static int actionListMk (struct lexSpec *spec, const char *s,                         struct lexRuleAction **ap){    int r, tok, len;    int bodyMark = 0;    const char *s0;    while ((tok = readParseToken (&s, &len)))    {        switch (tok)        {        case REGX_BODY:            bodyMark 
= 1;            continue;        case REGX_CODE:            *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));            (*ap)->which = tok;            regxCodeMk (&(*ap)->u.code, s, len);            s += len+1;            break;        case REGX_PATTERN:            *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));            (*ap)->which = tok;            (*ap)->u.pattern.body = bodyMark;            bodyMark = 0;            (*ap)->u.pattern.dfa = lexSpecDFA ();	    s0 = s;            r = dfa_parse ((*ap)->u.pattern.dfa, &s);            if (r || *s != '/')            {                xfree (*ap);                *ap = NULL;                logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);                return -1;            }	    if (debug_dfa_tran)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -