⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zrpn.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
                if (limit > 20)                    limit = 20;                while (--limit >= 0)                {                    dst[i++] = '.';                    dst[i++] = '?';                }            }            else            {                dst[i++] = '.';                dst[i++] = '*';            }        }        else if (*s0 == '*')        {            dst[i++] = '.';            dst[i++] = '*';	    dst_term[j++] = *s0++;        }        else if (*s0 == '#')	{            dst[i++] = '.';	    dst_term[j++] = *s0++;	}        {            s1 = s0;            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));            if (space_split && **map == *CHR_SPACE)                break;            while (s1 < s0)            {                if (strchr(REGEX_CHARS, *s1))                    dst[i++] = '\\';		dst_term[j++] = *s1;                dst[i++] = *s1++;            }        }    }    dst[i] = '\0';    dst_term[j++] = '\0';    *src = s0;    return i;}/* term_105/106: handle term, where trunc=Process * and ! 
and right trunc */static int term_105 (ZebraMaps zebra_maps, int reg_type,		     const char **src, char *dst, int space_split,		     char *dst_term, int right_truncate){    const char *s0, *s1;    const char **map;    int i = 0;    int j = 0;    if (!term_pre (zebra_maps, reg_type, src, "*!", "*!"))        return 0;    s0 = *src;    while (*s0)    {        if (*s0 == '*')        {            dst[i++] = '.';            dst[i++] = '*';	    dst_term[j++] = *s0++;        }        else if (*s0 == '!')	{            dst[i++] = '.';	    dst_term[j++] = *s0++;	}        {            s1 = s0;            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));            if (space_split && **map == *CHR_SPACE)                break;            while (s1 < s0)            {                if (strchr(REGEX_CHARS, *s1))                    dst[i++] = '\\';		dst_term[j++] = *s1;                dst[i++] = *s1++;            }        }    }    if (right_truncate)    {        dst[i++] = '.';        dst[i++] = '*';    }    dst[i] = '\0';        dst_term[j++] = '\0';    *src = s0;    return i;}/* gen_regular_rel - generate regular expression from relation *  val:     border value (inclusive) *  islt:    1 if <=; 0 if >=. */static void gen_regular_rel (char *dst, int val, int islt){    int dst_p;    int w, d, i;    int pos = 0;    char numstr[20];    logf (LOG_DEBUG, "gen_regular_rel. 
val=%d, islt=%d", val, islt);    if (val >= 0)    {        if (islt)            strcpy (dst, "(-[0-9]+|(");        else            strcpy (dst, "((");    }     else    {        if (!islt)        {            strcpy (dst, "([0-9]+|-(");            dst_p = strlen (dst);            islt = 1;        }        else        {            strcpy (dst, "(-(");            islt = 0;        }        val = -val;    }    dst_p = strlen (dst);    sprintf (numstr, "%d", val);    for (w = strlen(numstr); --w >= 0; pos++)    {        d = numstr[w];        if (pos > 0)        {            if (islt)            {                if (d == '0')                    continue;                d--;            }             else            {                if (d == '9')                    continue;                d++;            }        }                strcpy (dst + dst_p, numstr);        dst_p = strlen(dst) - pos - 1;        if (islt)        {            if (d != '0')            {                dst[dst_p++] = '[';                dst[dst_p++] = '0';                dst[dst_p++] = '-';                dst[dst_p++] = d;                dst[dst_p++] = ']';            }            else                dst[dst_p++] = d;        }        else        {            if (d != '9')            {                 dst[dst_p++] = '[';                dst[dst_p++] = d;                dst[dst_p++] = '-';                dst[dst_p++] = '9';                dst[dst_p++] = ']';            }            else                dst[dst_p++] = d;        }        for (i = 0; i<pos; i++)        {            dst[dst_p++] = '[';            dst[dst_p++] = '0';            dst[dst_p++] = '-';            dst[dst_p++] = '9';            dst[dst_p++] = ']';        }        dst[dst_p++] = '|';    }    dst[dst_p] = '\0';    if (islt)    {	/* match everything less than 10^(pos-1) */	strcat (dst, "0*");	for (i=1; i<pos; i++)	    strcat (dst, "[0-9]?");    }    else    {	/* match everything greater than 10^pos */        for (i = 0; i <= pos; i++)  
          strcat (dst, "[0-9]");        strcat (dst, "[0-9]*");    }    strcat (dst, "))");}void string_rel_add_char (char **term_p, const char *src, int *indx){    if (src[*indx] == '\\')	*(*term_p)++ = src[(*indx)++];    *(*term_p)++ = src[(*indx)++];}/* *   >  abc     ([b-].*|a[c-].*|ab[d-].*|abc.+) *              ([^-a].*|a[^-b].*ab[^-c].*|abc.+) *   >= abc     ([b-].*|a[c-].*|ab[c-].*) *              ([^-a].*|a[^-b].*|ab[c-].*) *   <  abc     ([-0].*|a[-a].*|ab[-b].*) *              ([^a-].*|a[^b-].*|ab[^c-].*) *   <= abc     ([-0].*|a[-a].*|ab[-b].*|abc) *              ([^a-].*|a[^b-].*|ab[^c-].*|abc) */static int string_relation (ZebraHandle zh, Z_AttributesPlusTerm *zapt,			    const char **term_sub, char *term_dict,			    oid_value attributeSet,			    int reg_type, int space_split, char *term_dst){    AttrType relation;    int relation_value;    int i;    char *term_tmp = term_dict + strlen(term_dict);    char term_component[2*IT_MAX_WORD+20];    attr_init (&relation, zapt, 2);    relation_value = attr_find (&relation, NULL);    logf (LOG_DEBUG, "string relation value=%d", relation_value);    switch (relation_value)    {    case 1:        if (!term_100 (zh->reg->zebra_maps, reg_type,		       term_sub, term_component,		       space_split, term_dst))            return 0;        logf (LOG_DEBUG, "Relation <");		*term_tmp++ = '(';	for (i = 0; term_component[i]; )	{	    int j = 0;	    if (i)		*term_tmp++ = '|';	    while (j < i)		string_rel_add_char (&term_tmp, term_component, &j);	    *term_tmp++ = '[';	    *term_tmp++ = '^';	    string_rel_add_char (&term_tmp, term_component, &i);	    *term_tmp++ = '-';	    *term_tmp++ = ']';	    *term_tmp++ = '.';	    *term_tmp++ = '*';            if ((term_tmp - term_dict) > IT_MAX_WORD)                break;	}	*term_tmp++ = ')';	*term_tmp = '\0';        break;    case 2:        if (!term_100 (zh->reg->zebra_maps, reg_type,		       term_sub, term_component,		       space_split, term_dst))            return 0;        logf 
(LOG_DEBUG, "Relation <=");	*term_tmp++ = '(';	for (i = 0; term_component[i]; )	{	    int j = 0;	    while (j < i)		string_rel_add_char (&term_tmp, term_component, &j);	    *term_tmp++ = '[';	    *term_tmp++ = '^';	    string_rel_add_char (&term_tmp, term_component, &i);	    *term_tmp++ = '-';	    *term_tmp++ = ']';	    *term_tmp++ = '.';	    *term_tmp++ = '*';	    *term_tmp++ = '|';            if ((term_tmp - term_dict) > IT_MAX_WORD)                break;	}	for (i = 0; term_component[i]; )	    string_rel_add_char (&term_tmp, term_component, &i);	*term_tmp++ = ')';	*term_tmp = '\0';        break;    case 5:        if (!term_100 (zh->reg->zebra_maps, reg_type,		       term_sub, term_component, space_split, term_dst))            return 0;        logf (LOG_DEBUG, "Relation >");	*term_tmp++ = '(';	for (i = 0; term_component[i];)	{	    int j = 0;	    while (j < i)		string_rel_add_char (&term_tmp, term_component, &j);	    *term_tmp++ = '[';	    	    *term_tmp++ = '^';	    *term_tmp++ = '-';	    string_rel_add_char (&term_tmp, term_component, &i);	    *term_tmp++ = ']';	    *term_tmp++ = '.';	    *term_tmp++ = '*';	    *term_tmp++ = '|';            if ((term_tmp - term_dict) > IT_MAX_WORD)                break;	}	for (i = 0; term_component[i];)	    string_rel_add_char (&term_tmp, term_component, &i);	*term_tmp++ = '.';	*term_tmp++ = '+';	*term_tmp++ = ')';	*term_tmp = '\0';        break;    case 4:        if (!term_100 (zh->reg->zebra_maps, reg_type, term_sub,		       term_component, space_split, term_dst))            return 0;        logf (LOG_DEBUG, "Relation >=");	*term_tmp++ = '(';	for (i = 0; term_component[i];)	{	    int j = 0;	    if (i)		*term_tmp++ = '|';	    while (j < i)		string_rel_add_char (&term_tmp, term_component, &j);	    *term_tmp++ = '[';	    if (term_component[i+1])	    {		*term_tmp++ = '^';		*term_tmp++ = '-';		string_rel_add_char (&term_tmp, term_component, &i);	    }	    else	    {		string_rel_add_char (&term_tmp, term_component, &i);		*term_tmp++ 
= '-';	    }	    *term_tmp++ = ']';	    *term_tmp++ = '.';	    *term_tmp++ = '*';            if ((term_tmp - term_dict) > IT_MAX_WORD)                break;	}	*term_tmp++ = ')';	*term_tmp = '\0';        break;    case 3:    default:        logf (LOG_DEBUG, "Relation =");        if (!term_100 (zh->reg->zebra_maps, reg_type, term_sub,		       term_component, space_split, term_dst))            return 0;	strcat (term_tmp, "(");	strcat (term_tmp, term_component);	strcat (term_tmp, ")");    }    return 1;}static int string_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,                        const char **term_sub,                         oid_value attributeSet, NMEM stream,                        struct grep_info *grep_info,                        int reg_type, int complete_flag,                        int num_bases, char **basenames,                        char *term_dst, int xpath_use);static RSET term_trunc (ZebraHandle zh, Z_AttributesPlusTerm *zapt,                        const char **term_sub,                         oid_value attributeSet, NMEM stream,                        struct grep_info *grep_info,                        int reg_type, int complete_flag,                        int num_bases, char **basenames,                        char *term_dst,                        const char *rank_type, int xpath_use){    int r;    grep_info->isam_p_indx = 0;    r = string_term (zh, zapt, term_sub, attributeSet, stream, grep_info,                     reg_type, complete_flag, num_bases, basenames,                     term_dst, xpath_use);    if (r < 1)        return 0;    logf (LOG_DEBUG, "term: %s", term_dst);    return rset_trunc (zh, grep_info->isam_p_buf,                       grep_info->isam_p_indx, term_dst,                       strlen(term_dst), rank_type, 1 /* preserve pos */,                       zapt->term->which);}static int string_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,                        const char **term_sub,                         
oid_value attributeSet, NMEM stream,                        struct grep_info *grep_info,                        int reg_type, int complete_flag,                        int num_bases, char **basenames,                        char *term_dst, int xpath_use){    char term_dict[2*IT_MAX_WORD+4000];    int j, r, base_no;    AttrType truncation;    int truncation_value;    AttrType use;    int use_value;    const char *use_string = 0;    oid_value curAttributeSet = attributeSet;    const char *termp;    struct rpn_char_map_info rcmi;    int space_split = complete_flag ? 0 : 1;    int bases_ok = 0;     /* no of databases with OK attribute */    int errCode = 0;      /* err code (if any is not OK) */    char *errString = 0;  /* addinfo */    rpn_char_map_prepare (zh->reg, reg_type, &rcmi);    attr_init (&use, zapt, 1);    use_value = attr_find_ex (&use, &curAttributeSet, &use_string);    logf (LOG_DEBUG, "string_term, use value %d", use_value);    attr_init (&truncation, zapt, 5);    truncation_value = attr_find (&truncation, NULL);    logf (LOG_DEBUG, "truncation value %d", truncation_value);    if (use_value == -1)    /* no attribute - assumy "any" */        use_value = 1016;    for (base_no = 0; base_no < num_bases; base_no++)    {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -