⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dfa.c

📁 harvest是一个下载html网页得机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* $Id: dfa.c,v 1.30 2003/06/18 21:32:44 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdio.h>#include <assert.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <zebrautl.h>#include "dfap.h"#include "imalloc.h"#define DFA_OPEN_RANGE 1#define CAT     16000#define OR      16001#define STAR    16002#define PLUS    16003#define EPSILON 16004struct Tnode {    union  {        struct Tnode *p[2];   /* CAT,OR,STAR,PLUS (left,right) */        short ch[2];          /* if ch[0] >= 0 then this Tnode represents */                              /*   a character in range [ch[0]..ch[1]]    */                              /* otherwise ch[0] represents */                              /*   accepting no (-acceptno) */    } u;    unsigned pos : 15;        /* CAT/OR/STAR/EPSILON or non-neg. position */    unsigned nullable : 1;    Set firstpos;             /* first positions */    Set lastpos;              /* last positions */};struct Tblock {               /* Tnode bucket (block) */    struct Tblock *next;      /* pointer to next bucket */    struct Tnode *tarray;     /* array of nodes in bucket */};#define TADD 64#define STATE_HASH 199#define POSET_CHUNK 100int debug_dfa_trav  = 0;int debug_dfa_tran  = 0;int debug_dfa_followpos = 0;int dfa_verbose = 0;static struct Tnode *mk_Tnode      (struct DFA_parse *parse_info);static struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info,				    BSet charset);static void        term_Tnode      (struct DFA_parse *parse_info);static void     del_followpos  (struct DFA_parse *parse_info),     init_pos       (struct DFA_parse *parse_info),     del_pos        (struct DFA_parse *parse_info),    mk_dfa_tran    (struct DFA_parse *parse_info, struct DFA_states *dfas),    add_follow     (struct DFA_parse *parse_info, Set lastpos, Set firstpos),    dfa_trav       (struct DFA_parse *parse_info, struct Tnode *n),    init_followpos (struct DFA_parse *parse_info),    pr_tran        (struct DFA_parse *parse_info, struct DFA_states *dfas),    pr_verbose     (struct DFA_parse *parse_info, struct DFA_states *dfas),    pr_followpos   (struct DFA_parse *parse_info),    out_char       (int c),    lex            (struct DFA_parse *parse_info);static int    nextchar       (struct DFA_parse *parse_info, int *esc),    read_charset   (struct DFA_parse *parse_info);static const char     *str_char      (unsigned c);#define L_LP 1#define L_RP 2#define L_CHAR 3#define L_CHARS 4#define L_ANY 5#define L_ALT 6#define L_ANYZ 7#define L_WILD 8#define L_QUEST 9#define L_CLOS1 10#define L_CLOS0 11#define L_END 12#define L_START 13static struct Tnode *expr_1 (struct DFA_parse *parse_info),                    *expr_2 (struct DFA_parse *parse_info),                    *expr_3 (struct DFA_parse *parse_info),                    *expr_4 (struct DFA_parse *parse_info);static struct Tnode *expr_1 (struct DFA_parse *parse_info){     struct Tnode *t1, *t2, *tn;    if (!(t1 = expr_2 (parse_info)))        return t1;    while (parse_info->lookahead == L_ALT)    {        lex (parse_info);        if (!(t2 = expr_2 (parse_info)))            return t2;                tn = mk_Tnode (parse_info);        tn->pos = OR;        tn->u.p[0] = t1;        tn->u.p[1] = t2;        t1 = tn;    }    return t1;}static struct Tnode *expr_2 (struct DFA_parse *parse_info){    struct Tnode *t1, *t2, *tn;        if (!(t1 = expr_3 (parse_info)))        return t1;    while (parse_info->lookahead == L_WILD ||           parse_info->lookahead == L_ANYZ ||           parse_info->lookahead == L_ANY ||           parse_info->lookahead == L_LP ||           parse_info->lookahead == L_CHAR ||           parse_info->lookahead == L_CHARS)    {        if (!(t2 = expr_3 (parse_info)))            return t2;                tn = mk_Tnode (parse_info);        tn->pos = CAT;        tn->u.p[0] = t1;        tn->u.p[1] = t2;        t1 = tn;    }    return t1;}static struct Tnode *expr_3 (struct DFA_parse *parse_info){    struct Tnode *t1, *tn;    if (!(t1 = expr_4 (parse_info)))        return t1;    if (parse_info->lookahead == L_CLOS0)    {        lex (parse_info);        tn = mk_Tnode (parse_info);        tn->pos = STAR;        tn->u.p[0] = t1;        t1 = tn;    }    else if (parse_info->lookahead == L_CLOS1)    {        lex (parse_info);        tn = mk_Tnode (parse_info);        tn->pos = PLUS;        tn->u.p[0] = t1;        t1 = tn;    }    else if (parse_info->lookahead == L_QUEST)    {        lex (parse_info);        tn = mk_Tnode(parse_info);        tn->pos = OR;        tn->u.p[0] = t1;        tn->u.p[1] = mk_Tnode(parse_info);        tn->u.p[1]->pos = EPSILON;        t1 = tn;    }    return t1;}static struct Tnode *expr_4 (struct DFA_parse *parse_info){    struct Tnode *t1;    switch (parse_info->lookahead)    {    case L_LP:        lex (parse_info);        if (!(t1 = expr_1 (parse_info)))            return t1;        if (parse_info->lookahead == L_RP)            lex (parse_info);        else            return NULL;        break;    case L_CHAR:        t1 = mk_Tnode(parse_info);        t1->pos = ++parse_info->position;        t1->u.ch[1] = t1->u.ch[0] = parse_info->look_ch;        lex (parse_info);        break;    case L_CHARS:        t1 = mk_Tnode_cset (parse_info, parse_info->look_chars);        lex (parse_info);        break;    case L_ANY:        t1 = mk_Tnode_cset (parse_info, parse_info->anyset);        lex (parse_info);        break;    case L_ANYZ:        t1 = mk_Tnode(parse_info);        t1->pos = OR;        t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset);        t1->u.p[1] = mk_Tnode(parse_info);        t1->u.p[1]->pos = EPSILON;        lex (parse_info);        break;    case L_WILD:        t1 = mk_Tnode(parse_info);        t1->pos = STAR;        t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset);        lex (parse_info);    default:        t1 = NULL;    }    return t1;}static void do_parse (struct DFA_parse *parse_info, const char **s,                      struct Tnode **tnp){    int start_anchor_flag = 0;    struct Tnode *t1, *t2, *tn;    parse_info->err_code = 0;    parse_info->expr_ptr =  * (const unsigned char **) s;    parse_info->inside_string = 0;    lex (parse_info);    if (parse_info->lookahead == L_START)    {        start_anchor_flag = 1;        lex (parse_info);    }    if (parse_info->lookahead == L_END)    {        t1 = mk_Tnode (parse_info);        t1->pos = ++parse_info->position;        t1->u.ch[1] = t1->u.ch[0] = '\n';        lex (parse_info);    }    else    {        t1 = expr_1 (parse_info);        if (t1 && parse_info->lookahead == L_END)        {            t2 = mk_Tnode (parse_info);            t2->pos = ++parse_info->position;            t2->u.ch[1] = t2->u.ch[0] = '\n';                        tn = mk_Tnode (parse_info);            tn->pos = CAT;            tn->u.p[0] = t1;            tn->u.p[1] = t2;            t1 = tn;                        lex (parse_info);        }    }    if (t1 && parse_info->lookahead == 0)    {        t2 = mk_Tnode(parse_info);        t2->pos = ++parse_info->position;        t2->u.ch[0] = -(++parse_info->rule);        t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);                *tnp = mk_Tnode(parse_info);        (*tnp)->pos = CAT;        (*tnp)->u.p[0] = t1;        (*tnp)->u.p[1] = t2;    }    else    {        if (!parse_info->err_code)        {            if (parse_info->lookahead == L_RP)                parse_info->err_code = DFA_ERR_RP;            else if (parse_info->lookahead == L_LP)                parse_info->err_code = DFA_ERR_LP;            else                parse_info->err_code = DFA_ERR_SYNTAX;        }    }    *s = (const char *) parse_info->expr_ptr;}static int nextchar (struct DFA_parse *parse_info, int *esc){    *esc = 0;    if (*parse_info->expr_ptr == '\0')        return 0;    else if (*parse_info->expr_ptr != '\\')        return *parse_info->expr_ptr++;    *esc = 1;    switch (*++parse_info->expr_ptr)    {    case '\r':    case '\n':    case '\0':        return '\\';    case '\t':        ++parse_info->expr_ptr;        return ' ';    case 'n':        ++parse_info->expr_ptr;        return '\n';    case 't':        ++parse_info->expr_ptr;        return '\t';    case 'r':        ++parse_info->expr_ptr;        return '\r';    case 'f':        ++parse_info->expr_ptr;        return '\f';    default:        return *parse_info->expr_ptr++;    }}static int nextchar_set (struct DFA_parse *parse_info, int *esc){    if (*parse_info->expr_ptr == ' ')    {        *esc = 0;        return *parse_info->expr_ptr++;    }    return nextchar (parse_info, esc);}static int read_charset (struct DFA_parse *parse_info){    int i, ch0, ch1, esc0, esc1, cc = 0;    parse_info->look_chars = mk_BSet (&parse_info->charset);    res_BSet (parse_info->charset, parse_info->look_chars);    ch0 = nextchar_set (parse_info, &esc0);    if (!esc0 && ch0 == '^')    {        cc = 1;        ch0 = nextchar_set (parse_info, &esc0);    }    while (ch0 != 0)    {        if (!esc0 && ch0 == ']')            break;	if (!esc0 && ch0 == '-')	{	    ch1 = ch0;	    esc1 = esc0;	    ch0 = 1;	    add_BSet (parse_info->charset, parse_info->look_chars, ch0);	}	else	{	    if (parse_info->cmap)	    {		const char **mapto;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -