⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 charmap.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
 */static void fun_add_map(const char *s, void *data, int num){    chrwork *arg = (chrwork *) data;    assert(arg->map->input);    logf (LOG_DEBUG, "set map %.*s", (int) strlen(s), s);    set_map_string(arg->map->input, arg->map->nmem, s, strlen(s), arg->string,                   0);    for (s = arg->string; *s; s++)	logf (LOG_DEBUG, " %3d", (unsigned char) *s);}/* * Add a query map to the string contained in the argument. */static void fun_add_qmap(const char *s, void *data, int num){    chrwork *arg = (chrwork *) data;    assert(arg->map->q_input);    logf (LOG_DEBUG, "set qmap %.*s", (int) strlen(s), s);    set_map_string(arg->map->q_input, arg->map->nmem, s,		   strlen(s), arg->string, 0);    for (s = arg->string; *s; s++)	logf (LOG_DEBUG, " %3d", (unsigned char) *s);}static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen,                        char *outbuf, size_t outbytesleft){    size_t inbytesleft = inlen * sizeof(ucs4_t);    char *inbuf = (char*) from;    size_t ret;       if (t == 0)        *outbuf++ = *from;  /* ISO-8859-1 is OK here */    else    {        ret = yaz_iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft);        if (ret == (size_t) (-1))        {            yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence");            return -1;        }    }    *outbuf = '\0';    return 0;}static int scan_string(char *s_native,                       yaz_iconv_t t_unicode, yaz_iconv_t t_utf8,		       void (*fun)(const char *c, void *data, int num),		       void *data, int *num){    char str[1024];    ucs4_t arg[512];    ucs4_t *s0, *s = arg;    ucs4_t c, begin, end;    size_t i;    if (t_unicode != 0)    {        char *outbuf = (char *) arg;        char *inbuf = s_native;        size_t outbytesleft = sizeof(arg)-4;        size_t inbytesleft = strlen(s_native);        size_t ret;        		ret = yaz_iconv(t_unicode, &inbuf, &inbytesleft,                        &outbuf, &outbytesleft);        if (ret == (size_t)(-1))            return -1;    
    i = (outbuf - (char*) arg)/sizeof(ucs4_t);    }    else    {         for (i = 0; s_native[i]; i++)            arg[i] = s_native[i] & 255; /* ISO-8859-1 conversion */    }    arg[i] = 0;      /* terminate */    if (s[0] == 0xfeff || s[0] == 0xfeff)  /* skip byte Order Mark */        s++;    while (*s)    {	switch (*s)	{	case '{':	    s++;	    begin = zebra_prim_w(&s);	    if (*s != '-')	    {		logf(LOG_FATAL, "Bad range in char-map");		return -1;	    }	    s++;	    end = zebra_prim_w(&s);	    if (end <= begin)	    {		logf(LOG_FATAL, "Bad range in char-map");		return -1;	    }	    s++;	    for (c = begin; c <= end; c++)	    {                if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1))                    return -1;		(*fun)(str, data, num ? (*num)++ : 0);	    }	    break;	case '[': s++; abort(); break;	case '(':            ++s;            s0 = s;            while (*s != ')' || s[-1] == '\\')                s++;	    *s = 0;            if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1))                return -1;	    (*fun)(str, data, num ? (*num)++ : 0);	    s++;	    break;	default:	    c = zebra_prim_w(&s);            if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1))                return -1;	    (*fun)(str, data, num ? (*num)++ : 0);	}    }    return 0;}chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only,                           const char *tabroot){    FILE *f;    char line[512], *argv[50];    chrmaptab res;    int lineno = 0;    int errors = 0;    int argc, num = (int) *CHR_BASE, i;    NMEM nmem;    yaz_iconv_t t_unicode = 0;    yaz_iconv_t t_utf8 = 0;    unsigned endian = 31;    const char *ucs4_native = "UCS-4";    if (*(char*) &endian == 31)      /* little endian? 
*/        ucs4_native = "UCS-4LE";    t_utf8 = yaz_iconv_open ("UTF-8", ucs4_native);    logf (LOG_DEBUG, "maptab %s open", name);    if (!(f = yaz_fopen(tabpath, name, "r", tabroot)))    {	logf(LOG_WARN|LOG_ERRNO, "%s", name);	return 0;    }    nmem = nmem_create ();    res = (chrmaptab) nmem_malloc(nmem, sizeof(*res));    res->nmem = nmem;    res->input = (chr_t_entry *) nmem_malloc(res->nmem, sizeof(*res->input));    res->input->target = (unsigned char **)	nmem_malloc(res->nmem, sizeof(*res->input->target) * 2);    res->input->target[0] = (unsigned char*) CHR_UNKNOWN;    res->input->target[1] = 0;    res->input->children = (chr_t_entry **)	nmem_malloc(res->nmem, sizeof(res->input) * 256);    for (i = 0; i < 256; i++)    {	res->input->children[i] = (chr_t_entry *)	    nmem_malloc(res->nmem, sizeof(*res->input));	res->input->children[i]->children = 0;	res->input->children[i]->target = (unsigned char **)	    nmem_malloc (res->nmem, 2 * sizeof(unsigned char *));	res->input->children[i]->target[1] = 0;	if (map_only)	{	    res->input->children[i]->target[0] = (unsigned char *)		nmem_malloc (res->nmem, 2 * sizeof(unsigned char));	    res->input->children[i]->target[0][0] = i;	    res->input->children[i]->target[0][1] = 0;	}	else	    res->input->children[i]->target[0] = (unsigned char*) CHR_UNKNOWN;    }    res->q_input = (chr_t_entry *)	nmem_malloc(res->nmem, sizeof(*res->q_input));    res->q_input->target = 0;    res->q_input->children = 0;    for (i = *CHR_BASE; i < 256; i++)	res->output[i] = 0;    res->output[(int) *CHR_SPACE] = (unsigned char *) " ";    res->output[(int) *CHR_UNKNOWN] = (unsigned char*) "@";    res->base_uppercase = 0;    while (!errors && (argc = readconf_line(f, &lineno, line, 512, argv, 50)))	if (!map_only && !yaz_matchstr(argv[0], "lowercase"))	{	    if (argc != 2)	    {		logf(LOG_FATAL, "Syntax error in charmap");		++errors;	    }	    if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry,                            res, &num) < 0)	    {		
logf(LOG_FATAL, "Bad value-set specification");		++errors;	    }	    res->base_uppercase = num;	    res->output[(int) *CHR_SPACE + num] = (unsigned char *) " ";	    res->output[(int) *CHR_UNKNOWN + num] = (unsigned char*) "@";	    num = (int) *CHR_BASE;	}	else if (!map_only && !yaz_matchstr(argv[0], "uppercase"))	{	    if (!res->base_uppercase)	    {		logf(LOG_FATAL, "Uppercase directive with no lowercase set");		++errors;	    }	    if (argc != 2)	    {		logf(LOG_FATAL, "Missing arg for uppercase directive");		++errors;	    }	    if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry,                            res, &num) < 0)	    {		logf(LOG_FATAL, "Bad value-set specification");		++errors;	    }	}	else if (!map_only && !yaz_matchstr(argv[0], "space"))	{	    if (argc != 2)	    {		logf(LOG_FATAL, "Syntax error in charmap");		++errors;	    }	    if (scan_string(argv[1], t_unicode, t_utf8,                            fun_addspace, res, 0) < 0)	    {		logf(LOG_FATAL, "Bad space specification");		++errors;	    }	}	else if (!yaz_matchstr(argv[0], "map"))	{	    chrwork buf;	    if (argc != 3)	    {		logf(LOG_FATAL, "charmap directive map requires 2 args");		++errors;	    }	    buf.map = res;	    buf.string[0] = '\0';	    if (scan_string(argv[2], t_unicode, t_utf8,                            fun_mkstring, &buf, 0) < 0)	    {		logf(LOG_FATAL, "Bad map target");		++errors;	    }	    if (scan_string(argv[1], t_unicode, t_utf8,                            fun_add_map, &buf, 0) < 0)	    {		logf(LOG_FATAL, "Bad map source");		++errors;	    }	}	else if (!yaz_matchstr(argv[0], "qmap"))	{	    chrwork buf;	    if (argc != 3)	    {		logf(LOG_FATAL, "charmap directive qmap requires 2 args");		++errors;	    }	    buf.map = res;	    buf.string[0] = '\0';	    if (scan_string(argv[2], t_unicode, t_utf8,                             fun_mkstring, &buf, 0) < 0)	    {		logf(LOG_FATAL, "Bad qmap target");		++errors;	    }	    if (scan_string(argv[1], t_unicode, t_utf8,                         
    fun_add_qmap, &buf, 0) < 0)	    {		logf(LOG_FATAL, "Bad qmap source");		++errors;	    }	}        else if (!yaz_matchstr(argv[0], "encoding"))        {	    /*	     * Fix me. When t_unicode==0 and use encoding directive in *.chr file the beheviour of the	     * zebra need to comment next part of code.	     */	    /*            if (t_unicode != 0)                yaz_iconv_close (t_unicode);            t_unicode = yaz_iconv_open (ucs4_native, argv[1]);	    */	    	    /*	     * Fix me. It is additional staff for conversion of characters from local encoding	     * of *.chr file to UTF-8 (internal encoding).	     * NOTE: The derective encoding must be first directive in *.chr file.	     */	    if (t_utf8 != 0)        	yaz_iconv_close(t_utf8);	    t_utf8 = yaz_iconv_open ("UTF-8", argv[1]);        }	else	{	    logf(LOG_WARN, "Syntax error at '%s' in %s", line, name);	}        yaz_fclose(f);    if (errors)    {	chrmaptab_destroy(res);	res = 0;    }    logf (LOG_DEBUG, "maptab %s close %d errors", name, errors);    if (t_utf8 != 0)        yaz_iconv_close(t_utf8);    if (t_unicode != 0)        yaz_iconv_close(t_unicode);    return res;}void chrmaptab_destroy(chrmaptab tab){    if (tab)	nmem_destroy (tab->nmem);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -