⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regxread.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
		printf ("pattern: %.*s\n", s-s0, s0);            dfa_mkstate ((*ap)->u.pattern.dfa);            s++;            break;        case REGX_BEGIN:            logf (LOG_WARN, "cannot use BEGIN here");            continue;        case REGX_INIT:            logf (LOG_WARN, "cannot use INIT here");            continue;        case REGX_END:            *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));            (*ap)->which = tok;            break;        }        ap = &(*ap)->next;    }    *ap = NULL;    return 0;}int readOneSpec (struct lexSpec *spec, const char *s){    int len, r, tok;    struct lexRule *rp;    struct lexContext *lc;    tok = readParseToken (&s, &len);    if (tok == REGX_CONTEXT)    {	char context_name[32];	tok = readParseToken (&s, &len);	if (tok != REGX_CODE)	{	    logf (LOG_WARN, "missing name after CONTEXT keyword");	    return 0;	}	if (len > 31)	    len = 31;	memcpy (context_name, s, len);	context_name[len] = '\0';	lc = lexContextCreate (context_name);	lc->next = spec->context;	spec->context = lc;	return 0;    }    if (!spec->context)	spec->context = lexContextCreate ("main");           switch (tok)    {    case REGX_BEGIN:        actionListDel (&spec->context->beginActionList);        actionListMk (spec, s, &spec->context->beginActionList);	break;    case REGX_END:        actionListDel (&spec->context->endActionList);        actionListMk (spec, s, &spec->context->endActionList);	break;    case REGX_INIT:        actionListDel (&spec->context->initActionList);        actionListMk (spec, s, &spec->context->initActionList);	break;    case REGX_PATTERN:#if REGX_DEBUG	logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);#endif        r = dfa_parse (spec->context->dfa, &s);        if (r)        {            logf (LOG_WARN, "regular expression error. r=%d", r);            return -1;        }        if (*s != '/')        {            logf (LOG_WARN, "expects / at end of pattern. 
got %c", *s);            return -1;        }        s++;        rp = (struct lexRule *) xmalloc (sizeof(*rp));        rp->info.no = spec->context->ruleNo++;        rp->next = spec->context->rules;        spec->context->rules = rp;        actionListMk (spec, s, &rp->info.actionList);    }    return 0;}int readFileSpec (struct lexSpec *spec){    struct lexContext *lc;    int c, i, errors = 0;    FILE *spec_inf = 0;    WRBUF lineBuf;    char fname[256];#if HAVE_TCL_H    if (spec->tcl_interp)    {	sprintf (fname, "%s.tflt", spec->name);	spec_inf = data1_path_fopen (spec->dh, fname, "r");    }#endif    if (!spec_inf)    {	sprintf (fname, "%s.flt", spec->name);	spec_inf = data1_path_fopen (spec->dh, fname, "r");    }    if (!spec_inf)    {        logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);        return -1;    }    logf (LOG_LOG, "reading regx filter %s", fname);#if HAVE_TCL_H    if (spec->tcl_interp)	logf (LOG_LOG, "Tcl enabled");#endif#if 0    debug_dfa_trav = 0;    debug_dfa_tran = 1;    debug_dfa_followpos = 0;    dfa_verbose = 1;#endif    lineBuf = wrbuf_alloc();    spec->lineNo = 0;    c = getc (spec_inf);    while (c != EOF)    {	wrbuf_rewind (lineBuf);        if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')        {            while (c != '\n' && c != EOF)                c = getc (spec_inf);            spec->lineNo++;            if (c == '\n')                c = getc (spec_inf);        }        else        {            int addLine = 0;	                while (1)            {                int c1 = c;		wrbuf_putc(lineBuf, c);                c = getc (spec_inf);		while (c == '\r')		    c = getc (spec_inf);                if (c == EOF)                    break;                if (c1 == '\n')                {                    if (c != ' ' && c != '\t')                        break;                    addLine++;                }            }	    wrbuf_putc(lineBuf, '\0');            readOneSpec (spec, wrbuf_buf(lineBuf));       
     spec->lineNo += addLine;        }    }    fclose (spec_inf);    wrbuf_free(lineBuf, 1);    for (lc = spec->context; lc; lc = lc->next)    {	struct lexRule *rp;	lc->fastRule = (struct lexRuleInfo **)	    xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);	for (i = 0; i < lc->ruleNo; i++)	    lc->fastRule[i] = NULL;	for (rp = lc->rules; rp; rp = rp->next)	    lc->fastRule[rp->info.no] = &rp->info;	dfa_mkstate (lc->dfa);    }    if (errors)        return -1;        return 0;}#if 0static struct lexSpec *curLexSpec = NULL;#endifstatic void execData (struct lexSpec *spec,                      const char *ebuf, int elen, int formatted_text){    struct data1_node *res, *parent;    int org_len;    if (elen == 0) /* shouldn't happen, but it does! */	return ;#if REGX_DEBUG    if (elen > 80)        logf (LOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,	      ebuf, 40, ebuf + elen-40);    else if (elen == 1 && ebuf[0] == '\n')    {        logf (LOG_LOG, "data(new line)");    }    else if (elen > 0)        logf (LOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);    else         logf (LOG_LOG, "data(%d bytes)", elen);#endif            if (spec->d1_level <= 1)        return;    parent = spec->d1_stack[spec->d1_level -1];    assert (parent);    if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)	org_len = res->u.data.len;    else    {	org_len = 0;	res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);	res->u.data.what = DATA1I_text;	res->u.data.len = 0;	res->u.data.formatted_text = formatted_text;	res->u.data.data = 0;		if (spec->d1_stack[spec->d1_level])	    spec->d1_stack[spec->d1_level]->next = res;	spec->d1_stack[spec->d1_level] = res;    }    if (org_len + elen >= spec->concatBuf[spec->d1_level].max)    {	char *old_buf, *new_buf;	spec->concatBuf[spec->d1_level].max = org_len + elen + 256;	new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);	if ((old_buf = spec->concatBuf[spec->d1_level].buf))	{	    memcpy (new_buf, old_buf, org_len);	    
xfree (old_buf);	}	spec->concatBuf[spec->d1_level].buf = new_buf;    }    memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);    res->u.data.len += elen;}static void execDataP (struct lexSpec *spec,                       const char *ebuf, int elen, int formatted_text){    execData (spec, ebuf, elen, formatted_text);}static void tagDataRelease (struct lexSpec *spec){    data1_node *res;        if ((res = spec->d1_stack[spec->d1_level]) &&	res->which == DATA1N_data && 	res->u.data.what == DATA1I_text)    {	assert (!res->u.data.data);	assert (res->u.data.len > 0);	if (res->u.data.len > DATA1_LOCALDATA)	    res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);	else	    res->u.data.data = res->lbuf;	memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,		res->u.data.len);    }}static void variantBegin (struct lexSpec *spec, 			  const char *class_str, int class_len,			  const char *type_str, int type_len,			  const char *value_str, int value_len){    struct data1_node *parent = spec->d1_stack[spec->d1_level -1];    char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];    data1_vartype *tp;    int i;    data1_node *res;    if (spec->d1_level == 0)    {        logf (LOG_WARN, "in variant begin. 
No record type defined");        return ;    }    if (class_len >= DATA1_MAX_SYMBOL)	class_len = DATA1_MAX_SYMBOL-1;    memcpy (tclass, class_str, class_len);    tclass[class_len] = '\0';    if (type_len >= DATA1_MAX_SYMBOL)	type_len = DATA1_MAX_SYMBOL-1;    memcpy (ttype, type_str, type_len);    ttype[type_len] = '\0';#if REGX_DEBUG     logf (LOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,	  spec->d1_level);#endif    if (!(tp =	  data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,			       tclass, ttype)))	return;        if (parent->which != DATA1N_variant)    {	res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);	if (spec->d1_stack[spec->d1_level])	    tagDataRelease (spec);	spec->d1_stack[spec->d1_level] = res;	spec->d1_stack[++(spec->d1_level)] = NULL;    }    for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)	if (spec->d1_stack[i]->u.variant.type == tp)	{	    spec->d1_level = i;	    break;	}#if REGX_DEBUG     logf (LOG_LOG, "variant node(%d)", spec->d1_level);#endif    parent = spec->d1_stack[spec->d1_level-1];    res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);    res->u.variant.type = tp;    if (value_len >= DATA1_LOCALDATA)	value_len =DATA1_LOCALDATA-1;    memcpy (res->lbuf, value_str, value_len);    res->lbuf[value_len] = '\0';    res->u.variant.value = res->lbuf;        if (spec->d1_stack[spec->d1_level])	tagDataRelease (spec);    spec->d1_stack[spec->d1_level] = res;    spec->d1_stack[++(spec->d1_level)] = NULL;}static void tagStrip (const char **tag, int *len){    int i;    for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)        ;    *len = i;    for (i = 0; i < *len && isspace((*tag)[i]); i++)        ;    *tag += i;    *len -= i;}static void tagBegin (struct lexSpec *spec,                       const char *tag, int len){    if (spec->d1_level == 0)    {        logf (LOG_WARN, "in element begin. 
No record type defined");        return ;    }    tagStrip (&tag, &len);    if (spec->d1_stack[spec->d1_level])	tagDataRelease (spec);#if REGX_DEBUG     logf (LOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);#endif    spec->d1_stack[spec->d1_level] = data1_mk_tag_n (        spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);    spec->d1_stack[++(spec->d1_level)] = NULL;}static void tagEnd (struct lexSpec *spec, int min_level,                    const char *tag, int len){    tagStrip (&tag, &len);    while (spec->d1_level > min_level)    {	tagDataRelease (spec);        (spec->d1_level)--;        if (spec->d1_level == 0)	    break;        if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&	    (!tag ||	     (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==	      (size_t) len &&	      !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))            break;    }#if REGX_DEBUG    logf (LOG_LOG, "end tag(%d)", spec->d1_level);#endif}static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,                     struct DFA *dfa, int greedy){    struct DFA_state *state = dfa->states[0];    struct DFA_tran *t;    unsigned char c;    unsigned char c_prev = 0;    int ptr = *pptr;          /* current pointer */    int start_ptr = *pptr;    /* first char of match */    int last_ptr = 0;         /* last char of match */    int last_rule = 0;        /* rule number of current match */    int restore_ptr = 0;    int i;    if (ptr)    {	--ptr;        c = f_win_advance (spec, &ptr);    }    while (1)    {	if (dfa->states[0] == state)	{	    c_prev = c;	    restore_ptr = ptr;	}        c = f_win_advance (spec, &ptr);        if (ptr == F_WIN_EOF)        {            if (last_rule)            {                *mptr = start_ptr;                *pptr = last_ptr;                return 1;            }            break;        }        t = state->trans;        i = state->tran_no;        while (1)            if (--i < 0)    /* no transition 
for character c */            {                if (last_rule)                {                    *mptr = start_ptr;     /* match starts here */                    *pptr = last_ptr;      /* match end here (+1) */                    return 1;                }                state = dfa->states[0];		ptr = restore_ptr;		c = f_win_advance (spec, &ptr);                start_ptr = ptr;                break;            }            else if (c >= t->ch[0] && c <= t->ch[1])            {                state = dfa->states[t->to];                if (state->rule_no && c_prev == '\n')		{		    last_rule = state->rule_no;		    last_ptr = ptr;		}		else if (state->rule_nno)		{		    last_rule = state->rule_nno;		    last_ptr = ptr;		}		break;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -