⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regxread.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
	    else		logf (LOG_WARN, "bad keyword '%s' after end", p);	}        else if (!strcmp (p, "data"))        {            int textFlag = 0;            int element_len;            const char *element_str = NULL;                        while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)            {                if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))                    textFlag = 1;                else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))                {                    r = execTok (spec, &s, &element_str, &element_len);                    if (r < 2)                        break;                }                else                     logf (LOG_WARN, "bad data option: %.*s",                          cmd_len, cmd_str);            }            if (r != 2)            {                logf (LOG_WARN, "missing data item after data");                continue;            }            if (element_str)                tagBegin (spec, element_str, element_len);            do            {                execData (spec, cmd_str, cmd_len,textFlag);                r = execTok (spec, &s, &cmd_str, &cmd_len);            } while (r > 1);            if (element_str)                tagEnd (spec, 1, NULL, 0);        }        else if (!strcmp (p, "unread"))        {            int no, offset;            r = execTok (spec, &s, &cmd_str, &cmd_len);            if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))            {                r = execTok (spec, &s, &cmd_str, &cmd_len);                if (r < 2)                {                    logf (LOG_WARN, "missing number after -offset");                    continue;                }                p = regxStrz (cmd_str, cmd_len, ptmp);                offset = atoi (p);                r = execTok (spec, &s, &cmd_str, &cmd_len);            }            else                offset = 0;            if (r < 2)            {                logf (LOG_WARN, "missing index after unread 
command");                continue;            }            if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')            {                logf (LOG_WARN, "bad index after unread command");                continue;            }            else            {                no = *cmd_str - '0';                if (no >= spec->arg_no)                    no = spec->arg_no - 1;                spec->ptr = spec->arg_start[no] + offset;            }            r = execTok (spec, &s, &cmd_str, &cmd_len);        }	else if (!strcmp (p, "context"))	{            if (r > 1)	    {		struct lexContext *lc = spec->context;		r = execTok (spec, &s, &cmd_str, &cmd_len);		p = regxStrz (cmd_str, cmd_len, ptmp);				while (lc && strcmp (p, lc->name))		    lc = lc->next;		if (lc)		    spec->context_stack[spec->context_stack_top] = lc;		else		    logf (LOG_WARN, "unknown context %s", p);	    }	    r = execTok (spec, &s, &cmd_str, &cmd_len);	}        else        {            logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);            r = execTok (spec, &s, &cmd_str, &cmd_len);            continue;        }        if (r > 1)        {            logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);            do {                r = execTok (spec, &s, &cmd_str, &cmd_len);            } while (r > 1);        }    }}static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,                       int start_ptr, int *pptr){    int sptr;    int arg_start[20];    int arg_end[20];    int arg_no = 1;    if (!ap)	return 1;    arg_start[0] = start_ptr;    arg_end[0] = *pptr;    spec->arg_start = arg_start;    spec->arg_end = arg_end;    while (ap)    {        switch (ap->which)        {        case REGX_PATTERN:            if (ap->u.pattern.body)            {                arg_start[arg_no] = *pptr;                if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))                {                    arg_end[arg_no] = F_WIN_EOF;                    arg_no++;          
          arg_start[arg_no] = F_WIN_EOF;                    arg_end[arg_no] = F_WIN_EOF;		    yaz_log(LOG_DEBUG, "Pattern match rest of record");		    *pptr = F_WIN_EOF;                }                else                {                    arg_end[arg_no] = sptr;                    arg_no++;                    arg_start[arg_no] = sptr;                    arg_end[arg_no] = *pptr;                }            }            else            {                arg_start[arg_no] = *pptr;                if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))                    return 1;                if (sptr != arg_start[arg_no])                    return 1;                arg_end[arg_no] = *pptr;            }            arg_no++;            break;        case REGX_CODE:	    spec->arg_no = arg_no;	    spec->ptr = *pptr;#if HAVE_TCL_H	    if (spec->tcl_interp)		execTcl(spec, ap->u.code);	    else		execCode (spec, ap->u.code);#else	    execCode (spec, ap->u.code);#endif	    *pptr = spec->ptr;	    if (spec->stop_flag)		return 0;            break;        case REGX_END:            arg_start[arg_no] = *pptr;            arg_end[arg_no] = F_WIN_EOF;            arg_no++;            *pptr = F_WIN_EOF;        }        ap = ap->next;    }    return 1;}static int execRule (struct lexSpec *spec, struct lexContext *context,                     int ruleNo, int start_ptr, int *pptr){#if REGX_DEBUG    logf (LOG_LOG, "exec rule %d", ruleNo);#endif    return execAction (spec, context->fastRule[ruleNo]->actionList,                       start_ptr, pptr);}data1_node *lexNode (struct lexSpec *spec, int *ptr){    struct lexContext *context = spec->context_stack[spec->context_stack_top];    struct DFA_state *state = context->dfa->states[0];    struct DFA_tran *t;    unsigned char c;    unsigned char c_prev = '\n';    int i;    int last_rule = 0;        /* rule number of current match */    int last_ptr = *ptr;      /* last char of match */    int start_ptr = *ptr;     /* first char of match */  
  int skip_ptr = *ptr;      /* first char of run */    while (1)    {        c = f_win_advance (spec, ptr);        if (*ptr == F_WIN_EOF)        {	    /* end of file met */            if (last_rule)            {		/* there was a match */                if (skip_ptr < start_ptr)                {		    /* deal with chars that didn't match */                    int size;                    char *buf;                    buf = f_win_get (spec, skip_ptr, start_ptr, &size);                    execDataP (spec, buf, size, 0);                }		/* restore pointer */                *ptr = last_ptr;		/* execute rule */                if (!execRule (spec, context, last_rule, start_ptr, ptr))		    break;		/* restore skip pointer */                skip_ptr = *ptr;                last_rule = 0;            }            else if (skip_ptr < *ptr)            {		/* deal with chars that didn't match */                int size;                char *buf;                buf = f_win_get (spec, skip_ptr, *ptr, &size);                execDataP (spec, buf, size, 0);            }            if (*ptr == F_WIN_EOF)                break;        }        t = state->trans;        i = state->tran_no;        while (1)            if (--i < 0)            {   /* no transition for character c ... 
*/                if (last_rule)                {                    if (skip_ptr < start_ptr)                    {			/* deal with chars that didn't match */                        int size;                        char *buf;                        buf = f_win_get (spec, skip_ptr, start_ptr, &size);                        execDataP (spec, buf, size, 0);                    }		    /* restore pointer */                    *ptr = last_ptr;                    if (!execRule (spec, context, last_rule, start_ptr, ptr))                    {                        if (spec->f_win_ef && *ptr != F_WIN_EOF)			{#if REGX_DEBUG			    logf (LOG_LOG, "regx: endf ptr=%d", *ptr);#endif                            (*spec->f_win_ef)(spec->f_win_fh, *ptr);			}                        return NULL;                    }		    context = spec->context_stack[spec->context_stack_top];                    skip_ptr = *ptr;                    last_rule = 0;                    last_ptr = start_ptr = *ptr;                    if (start_ptr > 0)                    {                        --start_ptr;                        c_prev = f_win_advance (spec, &start_ptr);                    }                }                else                {                    c_prev = f_win_advance (spec, &start_ptr);                    *ptr = start_ptr;                }                state = context->dfa->states[0];                break;            }            else if (c >= t->ch[0] && c <= t->ch[1])            {   /* transition ... 
*/                state = context->dfa->states[t->to];                if (state->rule_no)                {                    if (c_prev == '\n')                    {                        last_rule = state->rule_no;                        last_ptr = *ptr;                    }                     else if (state->rule_nno)                    {                        last_rule = state->rule_nno;                        last_ptr = *ptr;                    }                }                break;            }            else                t++;    }    return NULL;}static data1_node *lexRoot (struct lexSpec *spec, off_t offset,			    const char *context_name){    struct lexContext *lt = spec->context;    int ptr = offset;    spec->stop_flag = 0;    spec->d1_level = 0;    spec->context_stack_top = 0;        while (lt)    {	if (!strcmp (lt->name, context_name))	    break;	lt = lt->next;    }    if (!lt)    {	logf (LOG_WARN, "cannot find context %s", context_name);	return NULL;    }    spec->context_stack[spec->context_stack_top] = lt;    spec->d1_stack[spec->d1_level] = NULL;#if 1    if (!lt->initFlag)    {	lt->initFlag = 1;	execAction (spec, lt->initActionList, ptr, &ptr);    }#endif    execAction (spec, lt->beginActionList, ptr, &ptr);    lexNode (spec, &ptr);    while (spec->d1_level)    {	tagDataRelease (spec);	(spec->d1_level)--;    }    execAction (spec, lt->endActionList, ptr, &ptr);    return spec->d1_stack[0];}void grs_destroy(void *clientData){    struct lexSpecs *specs = (struct lexSpecs *) clientData;    if (specs->spec)    {	lexSpecDestroy(&specs->spec);    }    xfree (specs);}void *grs_init(void){    struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));    specs->spec = 0;    return specs;}data1_node *grs_read_regx (struct grs_read_info *p){    int res;    struct lexSpecs *specs = (struct lexSpecs *) p->clientData;    struct lexSpec **curLexSpec = &specs->spec;#if REGX_DEBUG    logf (LOG_LOG, "grs_read_regx");#endif    if (!*curLexSpec || 
strcmp ((*curLexSpec)->name, p->type))    {        if (*curLexSpec)            lexSpecDestroy (curLexSpec);        *curLexSpec = lexSpecCreate (p->type, p->dh);        res = readFileSpec (*curLexSpec);        if (res)        {            lexSpecDestroy (curLexSpec);            return NULL;        }    }    (*curLexSpec)->dh = p->dh;    if (!p->offset)    {        (*curLexSpec)->f_win_start = 0;        (*curLexSpec)->f_win_end = 0;        (*curLexSpec)->f_win_rf = p->readf;        (*curLexSpec)->f_win_sf = p->seekf;        (*curLexSpec)->f_win_fh = p->fh;        (*curLexSpec)->f_win_ef = p->endf;        (*curLexSpec)->f_win_size = 500000;    }    (*curLexSpec)->m = p->mem;    return lexRoot (*curLexSpec, p->offset, "main");}static struct recTypeGrs regx_type = {    "regx",    grs_init,    grs_destroy,    grs_read_regx};RecTypeGrs recTypeGrs_regx = &regx_type;#if HAVE_TCL_Hdata1_node *grs_read_tcl (struct grs_read_info *p){    int res;    struct lexSpecs *specs = (struct lexSpecs *) p->clientData;    struct lexSpec **curLexSpec = &specs->spec;#if REGX_DEBUG    logf (LOG_LOG, "grs_read_tcl");#endif    if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))    {	Tcl_Interp *tcl_interp;        if (*curLexSpec)            lexSpecDestroy (curLexSpec);        *curLexSpec = lexSpecCreate (p->type, p->dh);	Tcl_FindExecutable("");	tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();	Tcl_Init(tcl_interp);	Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);	Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);	Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);	Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,			   *curLexSpec, 0);        res = readFileSpec (*curLexSpec);        if (res)        {            lexSpecDestroy (curLexSpec);            return NULL;        }    }    (*curLexSpec)->dh = p->dh;    if (!p->offset)    {        (*curLexSpec)->f_win_start = 0;        (*curLexSpec)->f_win_end = 0;        
(*curLexSpec)->f_win_rf = p->readf;        (*curLexSpec)->f_win_sf = p->seekf;        (*curLexSpec)->f_win_fh = p->fh;        (*curLexSpec)->f_win_ef = p->endf;        (*curLexSpec)->f_win_size = 500000;    }    (*curLexSpec)->m = p->mem;    return lexRoot (*curLexSpec, p->offset, "main");}static struct recTypeGrs tcl_type = {    "tcl",    grs_init,    grs_destroy,    grs_read_tcl};RecTypeGrs recTypeGrs_tcl = &tcl_type;#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -