⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 grepper.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* $Id: grepper.c,v 1.10 2002/08/02 19:26:55 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002   Index Data ApsThis file is part of the Zebra server.Zebra is free software; you can redistribute it and/or modify it underthe terms of the GNU General Public License as published by the FreeSoftware Foundation; either version 2, or (at your option) any laterversion.Zebra is distributed in the hope that it will be useful, but WITHOUT ANYWARRANTY; without even the implied warranty of MERCHANTABILITY orFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public Licensefor more details.You should have received a copy of the GNU General Public Licensealong with Zebra; see the file LICENSE.zebra.  If not, write to theFree Software Foundation, 59 Temple Place - Suite 330, Boston, MA02111-1307, USA.*/#include <stdlib.h>#include <string.h>#include <stdio.h>#include <ctype.h>#include <assert.h>#include <zebrautl.h>#include <dfa.h>#include "imalloc.h"char *prog;static int show_line = 0;typedef unsigned MatchWord;#define WORD_BITS 32typedef struct {    int n;           /* no of MatchWord needed */    int range;       /* max no. 
of errors */    MatchWord *Sc;   /* Mask Sc */} MatchContext;#define INFBUF_SIZE 16384#define INLINE static INLINE void set_bit (MatchContext *mc, MatchWord *m, int ch, int state){    int off = state & (WORD_BITS-1);    int wno = state / WORD_BITS;    m[mc->n * ch + wno] |= 1<<off;}static INLINE void reset_bit (MatchContext *mc, MatchWord *m, int ch,                              int state){    int off = state & (WORD_BITS-1);    int wno = state / WORD_BITS;    m[mc->n * ch + wno] &= ~(1<<off);}static INLINE MatchWord get_bit (MatchContext *mc, MatchWord *m, int ch,                                 int state){    int off = state & (WORD_BITS-1);    int wno = state / WORD_BITS;    return m[mc->n * ch + wno] & (1<<off);}static MatchContext *mk_MatchContext (struct DFA *dfa, int range){    MatchContext *mc = imalloc (sizeof(*mc));    int i;    mc->n = (dfa->no_states+WORD_BITS) / WORD_BITS;    mc->range = range;    mc->Sc = icalloc (sizeof(*mc->Sc) * 256 * mc->n);        for (i=0; i<dfa->no_states; i++)    {        int j;        struct DFA_state *state = dfa->states[i];        for (j=0; j<state->tran_no; j++)        {            int ch;            int ch0 = state->trans[j].ch[0];            int ch1 = state->trans[j].ch[1];            assert (ch0 >= 0 && ch1 >= 0);                        for (ch = ch0; ch <= ch1; ch++)                set_bit (mc, mc->Sc, ch, i);        }    }    return mc;}static void mask_shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc,                        struct DFA *dfa, int ch){    int j, s = 0;    MatchWord *Rsrc_p = Rsrc, mask;    Rdst[0] = 1;    for (j = 1; j<mc->n; j++)        Rdst[j] = 0;    while (1)    {        mask = *Rsrc_p++;        for (j = 0; j<WORD_BITS/4; j++)        {            if (mask & 15)            {                if (mask & 1)                {                    struct DFA_state *state = dfa->states[s];                    int i = state->tran_no;                    while (--i >= 0)                        if (ch >= 
state->trans[i].ch[0] &&                            ch <= state->trans[i].ch[1])                            set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 2)                {                    struct DFA_state *state = dfa->states[s+1];                    int i = state->tran_no;                    while (--i >= 0)                        if (ch >= state->trans[i].ch[0] &&                            ch <= state->trans[i].ch[1])                            set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 4)                {                    struct DFA_state *state = dfa->states[s+2];                    int i = state->tran_no;                    while (--i >= 0)                        if (ch >= state->trans[i].ch[0] &&                            ch <= state->trans[i].ch[1])                            set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 8)                {                    struct DFA_state *state = dfa->states[s+3];                    int i = state->tran_no;                    while (--i >= 0)                        if (ch >= state->trans[i].ch[0] &&                            ch <= state->trans[i].ch[1])                            set_bit (mc, Rdst, 0, state->trans[i].to);                }            }            s += 4;            if (s >= dfa->no_states)                return;            mask >>= 4;        }    }}static void shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc,                   struct DFA *dfa){    int j, s = 0;    MatchWord *Rsrc_p = Rsrc, mask;    for (j = 0; j<mc->n; j++)        Rdst[j] = 0;    while (1)    {        mask = *Rsrc_p++;        for (j = 0; j<WORD_BITS/4; j++)        {            if (mask & 15)            {                if (mask & 1)                {                    struct DFA_state *state = dfa->states[s];                    int i = state->tran_no;                    while (--i >= 0)              
          set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 2)                {                    struct DFA_state *state = dfa->states[s+1];                    int i = state->tran_no;                    while (--i >= 0)                        set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 4)                {                    struct DFA_state *state = dfa->states[s+2];                    int i = state->tran_no;                    while (--i >= 0)                        set_bit (mc, Rdst, 0, state->trans[i].to);                }                if (mask & 8)                {                    struct DFA_state *state = dfa->states[s+3];                    int i = state->tran_no;                    while (--i >= 0)                        set_bit (mc, Rdst, 0, state->trans[i].to);                }            }            s += 4;            if (s >= dfa->no_states)                return;            mask >>= 4;        }    }}static void or (MatchContext *mc, MatchWord *Rdst,                MatchWord *Rsrc1, MatchWord *Rsrc2){    int i;    for (i = 0; i<mc->n; i++)        Rdst[i] = Rsrc1[i] | Rsrc2[i];}static int go (MatchContext *mc, struct DFA *dfa, FILE *inf){    MatchWord *Rj, *Rj1, *Rj_a, *Rj_b, *Rj_c;    int s, d, ch;    int lineno = 1;    char *infbuf;    int inf_ptr = 1;    int no_match = 0;    infbuf = imalloc (INFBUF_SIZE);    infbuf[0] = '\n';    Rj = icalloc (mc->n * (mc->range+1) * sizeof(*Rj));    Rj1 = icalloc (mc->n * (mc->range+1) * sizeof(*Rj));    Rj_a = icalloc (mc->n * sizeof(*Rj));    Rj_b = icalloc (mc->n * sizeof(*Rj));    Rj_c = icalloc (mc->n * sizeof(*Rj));    set_bit (mc, Rj, 0, 0);    for (d = 1; d<=mc->range; d++)    {        int s;        memcpy (Rj + mc->n * d, Rj + mc->n * (d-1), mc->n * sizeof(*Rj));        for (s = 0; s<dfa->no_states; s++)        {            if (get_bit (mc, Rj, d-1, s))            {                struct DFA_state *state = dfa->states[s];    
            int i = state->tran_no;                while (--i >= 0)                    set_bit (mc, Rj, d, state->trans[i].to);            }        }    }    while ((ch = getc (inf)) != EOF)    {        MatchWord *Rj_t;                infbuf[inf_ptr] = ch;        if (ch == '\n')        {            if (no_match)            {                int i = inf_ptr;                if (show_line)                    printf ("%5d:", lineno);                do                {                    if (--i < 0)                        i = INFBUF_SIZE-1;                } while (infbuf[i] != '\n');                do                {                    if (++i == INFBUF_SIZE)                        i = 0;                    putchar (infbuf[i]);                } while (infbuf[i] != '\n');                no_match = 0;            }            lineno++;        }        if (++inf_ptr == INFBUF_SIZE)            inf_ptr = 0;        mask_shift (mc, Rj1, Rj, dfa, ch);        for (d = 1; d <= mc->range; d++)        {            mask_shift (mc, Rj_b, Rj+d*mc->n, dfa, ch);    /* 1 */            or (mc, Rj_a, Rj+(d-1)*mc->n, Rj1+(d-1)*mc->n); /* 2,3 */            shift (mc, Rj_c, Rj_a, dfa);            or (mc, Rj_a, Rj_b, Rj_c);                      /* 1,2,3*/            or (mc, Rj1+d*mc->n, Rj_a, Rj+(d-1)*mc->n);     /* 1,2,3,4 */        }        for (s = 0; s<dfa->no_states; s++)        {            if (dfa->states[s]->rule_no)                if (get_bit (mc, Rj1+mc->range*mc->n, 0, s))                    no_match++;        }        for (d = 0; d <= mc->range; d++)            reset_bit (mc, Rj1+d*mc->n, 0, dfa->no_states);        Rj_t = Rj1;        Rj1 = Rj;        Rj = Rj_t;    }    ifree (Rj);    ifree (Rj1);    ifree (Rj_a);    ifree (Rj_b);    ifree (Rj_c);    ifree (infbuf);    return 0;}static int grep_file (struct DFA *dfa, const char *fname, int range){    FILE *inf;    MatchContext *mc;    if (fname)    {        inf = fopen (fname, "r");        if (!inf)        {            logf 
(LOG_FATAL|LOG_ERRNO, "cannot open `%s'", fname);            exit (1);        }    }    else        inf = stdin;         mc = mk_MatchContext (dfa, range);    go (mc, dfa, inf);    if (fname)        fclose (inf);    return 0;}int main (int argc, char **argv){    int ret;    int range = 0;    char *arg;    const char *pattern = NULL;    int no_files = 0;    struct DFA *dfa = dfa_init();    prog = argv[0];    while ((ret = options ("nr:dsv:", argv, argc, &arg)) != -2)    {        if (ret == 0)        {            if (!pattern)            {                int i;                pattern = arg;                i = dfa_parse (dfa, &pattern);                if (i || *pattern)                {                    fprintf (stderr, "%s: illegal pattern\n", prog);                    return 1;                }                dfa_mkstate (dfa);            }            else            {                no_files++;                grep_file (dfa, arg, range);            }        }        else if (ret == 'v')        {            yaz_log_init (yaz_log_mask_str(arg), prog, NULL);        }        else if (ret == 's')        {            dfa_verbose = 1;        }        else if (ret == 'd')        {            debug_dfa_tran = 1;            debug_dfa_followpos = 1;            debug_dfa_trav = 1;        }        else if (ret == 'r')        {            range = atoi (arg);        }        else if (ret == 'n')        {            show_line = 1;        }        else        {            logf (LOG_FATAL, "Unknown option '-%s'", arg);            exit (1);        }    }    if (!pattern)    {        fprintf (stderr, "usage:\n "                 " %s [-d] [-n] [-r n] [-s] [-v n] pattern file ..\n", prog);        exit (1);    }    else if (no_files == 0)    {        grep_file (dfa, NULL, range);    }    dfa_delete (&dfa);    return 0;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -