⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zrpn.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
        attent attp;        data1_local_attribute id_xpath_attr;        data1_local_attribute *local_attr;        int max_pos, prefix_len = 0;        termp = *term_sub;        if (zebraExplain_curDatabase (zh->reg->zei, basenames[base_no]))        {            zh->errCode = 109; /* Database unavailable */            zh->errString = basenames[base_no];            return -1;        }        if (use_value == -2)  /* string attribute (assume IDXPATH/any) */        {            use_value = xpath_use;            attp.local_attributes = &id_xpath_attr;            attp.attset_ordinal = VAL_IDXPATH;            id_xpath_attr.next = 0;            id_xpath_attr.local = use_value;        }	else if (curAttributeSet == VAL_IDXPATH)        {            attp.local_attributes = &id_xpath_attr;            attp.attset_ordinal = VAL_IDXPATH;            id_xpath_attr.next = 0;            id_xpath_attr.local = use_value;        }        else        {            if ((r=att_getentbyatt (zh, &attp, curAttributeSet, use_value)))            {                logf (LOG_DEBUG, "att_getentbyatt fail. 
set=%d use=%d r=%d",                      curAttributeSet, use_value, r);                if (r == -1)                {                    /* set was found, but value wasn't defined */                    char val_str[32];                    sprintf (val_str, "%d", use_value);                    errCode = 114;                    errString = nmem_strdup (stream, val_str);                }                else                {                    int oid[OID_SIZE];                    struct oident oident;                                        oident.proto = PROTO_Z3950;                    oident.oclass = CLASS_ATTSET;                    oident.value = curAttributeSet;                    oid_ent_to_oid (&oident, oid);                                        errCode = 121;                    errString = nmem_strdup (stream, oident.desc);                }                continue;            }        }        for (local_attr = attp.local_attributes; local_attr;             local_attr = local_attr->next)        {            int ord;            char ord_buf[32];            int i, ord_len;                        ord = zebraExplain_lookupSU (zh->reg->zei, attp.attset_ordinal,                                         local_attr->local);            if (ord < 0)                continue;            if (prefix_len)                term_dict[prefix_len++] = '|';            else                term_dict[prefix_len++] = '(';                        ord_len = key_SU_encode (ord, ord_buf);            for (i = 0; i<ord_len; i++)            {                term_dict[prefix_len++] = 1;                term_dict[prefix_len++] = ord_buf[i];            }        }        if (!prefix_len)        {            char val_str[32];            sprintf (val_str, "%d", use_value);            errCode = 114;            errString = nmem_strdup (stream, val_str);	    continue;        }	bases_ok++; /* this has OK attributes */        term_dict[prefix_len++] = ')';        term_dict[prefix_len++] = 1;        
term_dict[prefix_len++] = reg_type;	logf (LOG_DEBUG, "reg_type = %d", term_dict[prefix_len-1]);        term_dict[prefix_len] = '\0';	j = prefix_len;	switch (truncation_value)	{	case -1:         /* not specified */	case 100:        /* do not truncate */	    if (!string_relation (zh, zapt, &termp, term_dict,				  attributeSet,				  reg_type, space_split, term_dst))		return 0;	    logf (LOG_LOG, "dict_lookup_grep: %s", term_dict+prefix_len);	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0,				  grep_info, &max_pos, 0, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep fail %d", r);	    break;	case 1:          /* right truncation */	    term_dict[j++] = '(';	    if (!term_100 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ".*)");	    dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,			      &max_pos, 0, grep_handle);	    break;	case 2:          /* keft truncation */	    term_dict[j++] = '('; term_dict[j++] = '.'; term_dict[j++] = '*';	    if (!term_100 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ")");	    dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,			      &max_pos, 0, grep_handle);	    break;	case 3:          /* left&right truncation */	    term_dict[j++] = '('; term_dict[j++] = '.'; term_dict[j++] = '*';	    if (!term_100 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ".*)");	    dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,			      &max_pos, 0, grep_handle);	    break;	    zh->errCode = 120;	    return -1;	case 101:        /* process # in term */	    term_dict[j++] = '(';	    if (!term_101 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ")");	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,				  &max_pos, 0, 
grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=#: %d", r);	    break;	case 102:        /* Regexp-1 */	    term_dict[j++] = '(';	    if (!term_102 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ")");	    logf (LOG_DEBUG, "Regexp-1 tolerance=%d", r);	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,				  &max_pos, 0, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=regular: %d",		      r);	    break;	case 103:       /* Regexp-2 */	    r = 1;	    term_dict[j++] = '(';	    if (!term_103 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, &r, space_split, term_dst))		return 0;	    strcat (term_dict, ")");	    logf (LOG_DEBUG, "Regexp-2 tolerance=%d", r);	    r = dict_lookup_grep (zh->reg->dict, term_dict, r, grep_info,				  &max_pos, 2, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=eregular: %d",		      r);	    break;	case 104:        /* process # and ! in term */	    term_dict[j++] = '(';	    if (!term_104 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst))		return 0;	    strcat (term_dict, ")");	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,				  &max_pos, 0, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=#/!: %d", r);	    break;	case 105:        /* process * and ! in term */	    term_dict[j++] = '(';	    if (!term_105 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst, 1))		return 0;	    strcat (term_dict, ")");	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,				  &max_pos, 0, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=*/!: %d", r);	    break;	case 106:        /* process * and ! 
in term */	    term_dict[j++] = '(';	    if (!term_105 (zh->reg->zebra_maps, reg_type,			   &termp, term_dict + j, space_split, term_dst, 0))		return 0;	    strcat (term_dict, ")");	    r = dict_lookup_grep (zh->reg->dict, term_dict, 0, grep_info,				  &max_pos, 0, grep_handle);	    if (r)		logf (LOG_WARN, "dict_lookup_grep err, trunc=*/!: %d", r);	    break;        }    }    if (!bases_ok)    {	zh->errCode = errCode;	zh->errString = errString;	return -1;    }    *term_sub = termp;    logf (LOG_DEBUG, "%d positions", grep_info->isam_p_indx);    return 1;}/* convert APT search term to UTF8 */static int zapt_term_to_utf8 (ZebraHandle zh, Z_AttributesPlusTerm *zapt,                              char *termz){    size_t sizez;    Z_Term *term = zapt->term;    switch (term->which)    {    case Z_Term_general:        if (zh->iconv_to_utf8 != 0)        {            char *inbuf = term->u.general->buf;            size_t inleft = term->u.general->len;            char *outbuf = termz;            size_t outleft = IT_MAX_WORD-1;            size_t ret;            ret = yaz_iconv(zh->iconv_to_utf8, &inbuf, &inleft,                        &outbuf, &outleft);            if (ret == (size_t)(-1))            {                ret = yaz_iconv(zh->iconv_to_utf8, 0, 0, 0, 0);                zh->errCode = 125;                return -1;            }            *outbuf = 0;        }        else        {            sizez = term->u.general->len;            if (sizez > IT_MAX_WORD-1)                sizez = IT_MAX_WORD-1;            memcpy (termz, term->u.general->buf, sizez);            termz[sizez] = '\0';        }        break;    case Z_Term_characterString:        sizez = strlen(term->u.characterString);        if (sizez > IT_MAX_WORD-1)            sizez = IT_MAX_WORD-1;        memcpy (termz, term->u.characterString, sizez);        termz[sizez] = '\0';        break;    default:        zh->errCode = 124;        return -1;    }    return 0;}/* convert APT SCAN term to internal cmap */static 
int trans_scan_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,                            char *termz, int reg_type){    char termz0[IT_MAX_WORD];    if (zapt_term_to_utf8(zh, zapt, termz0))        return -1;    /* error */    else    {        const char **map;        const char *cp = (const char *) termz0;        const char *cp_end = cp + strlen(cp);        const char *src;        int i = 0;        const char *space_map = NULL;        int len;                    while ((len = (cp_end - cp)) > 0)        {            map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len);            if (**map == *CHR_SPACE)                space_map = *map;            else            {                if (i && space_map)                    for (src = space_map; *src; src++)                        termz[i++] = *src;                space_map = NULL;                for (src = *map; *src; src++)                    termz[i++] = *src;            }        }        termz[i] = '\0';    }    return 0;}static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no,		      int ordered, int exclusion, int relation, int distance){    int i;    RSFD *rsfd;    int  *more;    struct it_key **buf;    RSET result;    char prox_term[1024];    int length_prox_term = 0;    int min_nn = 10000000;    int term_index;    int term_type = Z_Term_characterString;    const char *flags = NULL;        rsfd = (RSFD *) xmalloc (sizeof(*rsfd)*rset_no);    more = (int *) xmalloc (sizeof(*more)*rset_no);    buf = (struct it_key **) xmalloc (sizeof(*buf)*rset_no);    *prox_term = '\0';    for (i = 0; i<rset_no; i++)    {	int j;	for (j = 0; j<rset[i]->no_rset_terms; j++)	{	    const char *nflags = rset[i]->rset_terms[j]->flags;	    char *term = rset[i]->rset_terms[j]->name;	    int lterm = strlen(term);	    if (lterm + length_prox_term < sizeof(prox_term)-1)	    {		if (length_prox_term)		    prox_term[length_prox_term++] = ' ';		strcpy (prox_term + length_prox_term, term);		length_prox_term += lterm;	    }	    
if (min_nn > rset[i]->rset_terms[j]->nn)		min_nn = rset[i]->rset_terms[j]->nn;	    flags = nflags;            term_type = rset[i]->rset_terms[j]->type;            /* only if all term types are of type characterString .. */            /* the resulting term is of that type */            if (term_type != Z_Term_characterString)                term_type = Z_Term_general;	}    }    for (i = 0; i<rset_no; i++)    {	buf[i] = 0;	rsfd[i] = 0;    }    for (i = 0; i<rset_no; i++)    {	buf[i] = (struct it_key *) xmalloc (sizeof(**buf));	rsfd[i] = rset_open (rset[i], RSETF_READ);        if (!(more[i] = rset_read (rset[i], rsfd[i], buf[i], &term_index)))	    break;    }    if (i != rset_no)    {	/* at least one is empty ... return null set */	rset_null_parms parms;		parms.rset_term = rset_term_create (prox_term, length_prox_term,					    flags, term_type);	parms.rset_term->nn = 0;	result = rset_create (rset_kind_null, &parms);    }    else if (ordered && relation == 3 && exclusion == 0 && distance == 1)    {	/* special proximity case = phrase search ... 
*/	rset_temp_parms parms;	RSFD rsfd_result;	parms.rset_term = rset_term_create (prox_term, length_prox_term,					    flags, term_type);	parms.rset_term->nn = min_nn;        parms.cmp = key_compare_it;	parms.key_size = sizeof (struct it_key);	parms.temp_path = res_get (zh->res, "setTmpDir");	result = rset_create (rset_kind_temp, &parms);	rsfd_result = rset_open (result, RSETF_WRITE);	while (*more)	{	    for (i = 1; i<rset_no; i++)	    {		int cmp;				if (!more[i])		{		    *more = 0;		    break;		}		cmp = key_compare_it (buf[i], buf[i-1]);		if (cmp > 1)		{		    more[i-1] = rset_read (rset[i-1], rsfd[i-1],					   buf[i-1], &term_index);		    break;		}		else if (cmp == 1)		{		    if (buf[i-1]->seqno+1 != buf[i]->seqno)		    {			more[i-1] = rset_read (rset[i-1], rsfd[i-1],					       buf[i-1], &term_index);			break;		    }		}		else		{		    more[i] = rset_read (rset[i], rsfd[i], buf[i],					 &term_index);		    break;		}	    }	    if (i == rset_no)	    {		rset_write (result, rsfd_result, buf[0]);		more[0] = rset_read (*rset, *rsfd, *buf, &term_index);	    }	}	rset_close (result, rsfd_result);    }    else if (rset_no == 2)    {	/* generic proximity case (two input sets only) ... */	rset_temp_parms parms;	RSFD rsfd_result;	yaz_log (LOG_LOG, "generic prox, dist=%d, relation=%d, ordered=%d"			  ", exclusion=%d",			  distance, relation, ordered, exclusion);	parms.rset_term = rset_term_create (prox_term, length_prox_term,					    flags, term_type);	parms.rset_term->nn = min_nn;        parms.cmp = key_compare_it;	parms.key_size = sizeof (struct it_key);	parms.temp_path = res_get (zh->res, "setTmpDir");	result = rset_create (rset_kind_temp, &parms);	rsfd_result = rset_open (result, RSETF_WRITE);	while (more[0] && more[1]) 	{	    int cmp = key_compare_it (buf[0], buf[1]);	    if (cmp < -1)		more[0] = rset_read (rset[0], rsfd[0], buf[0], &term_index);	    else if (cmp > 1)		more[1] = rset_read (rset[1], rsfd[1], buf[1], &term_index);	    else

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -