📄 wchar.c
字号:
typedef struct { mbstate_t tostate; mbstate_t fromstate; int tocodeset; int fromcodeset; int frombom; int tobom; int fromcodeset0; int frombom0; int tobom0; int skip_invalid_input; /* To support iconv -c option. */} _UC_iconv_t;#ifdef L_iconv#include <iconv.h>#include <string.h>#include <endian.h>#include <byteswap.h>#if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)#error unsupported endianness for iconv#endif#ifndef __CTYPE_HAS_8_BIT_LOCALES#error currently iconv requires 8 bit locales#endif#ifndef __CTYPE_HAS_UTF_8_LOCALES#error currently iconv requires UTF-8 locales#endifenum { IC_WCHAR_T = 0xe0, IC_MULTIBYTE = 0xe0,#if __BYTE_ORDER == __BIG_ENDIAN IC_UCS_4 = 0xec, IC_UTF_32 = 0xe4, IC_UCS_2 = 0xe2, IC_UTF_16 = 0xea,#else IC_UCS_4 = 0xed, IC_UTF_32 = 0xe5, IC_UCS_2 = 0xe3, IC_UTF_16 = 0xeb,#endif IC_UTF_8 = 2, IC_ASCII = 1};/* For the multibyte * bit 0 means swap endian * bit 1 means 2 byte * bit 2 means 4 byte * */const unsigned char codesets[] = "\x0a\xe0""WCHAR_T\x00" /* superset of UCS-4 but platform-endian */#if __BYTE_ORDER == __BIG_ENDIAN "\x08\xec""UCS-4\x00" /* always BE */ "\x0a\xec""UCS-4BE\x00" "\x0a\xed""UCS-4LE\x00" "\x09\fe4""UTF-32\x00" /* platform endian with BOM */ "\x0b\xe4""UTF-32BE\x00" "\x0b\xe5""UTF-32LE\x00" "\x08\xe2""UCS-2\x00" /* always BE */ "\x0a\xe2""UCS-2BE\x00" "\x0a\xe3""UCS-2LE\x00" "\x09\xea""UTF-16\x00" /* platform endian with BOM */ "\x0b\xea""UTF-16BE\x00" "\x0b\xeb""UTF-16LE\x00"#elif __BYTE_ORDER == __LITTLE_ENDIAN "\x08\xed""UCS-4\x00" /* always BE */ "\x0a\xed""UCS-4BE\x00" "\x0a\xec""UCS-4LE\x00" "\x09\xf4""UTF-32\x00" /* platform endian with BOM */ "\x0b\xe5""UTF-32BE\x00" "\x0b\xe4""UTF-32LE\x00" "\x08\xe3""UCS-2\x00" /* always BE */ "\x0a\xe3""UCS-2BE\x00" "\x0a\xe2""UCS-2LE\x00" "\x09\xfa""UTF-16\x00" /* platform endian with BOM */ "\x0b\xeb""UTF-16BE\x00" "\x0b\xea""UTF-16LE\x00"#endif "\x08\x02""UTF-8\x00" "\x0b\x01""US-ASCII\x00" "\x07\x01""ASCII"; /* Must be last! (special case to save a nul) */static int find_codeset(const char *name){ const unsigned char *s; int codeset; for (s = codesets ; *s ; s += *s) { if (!strcasecmp(s+2, name)) { return s[1]; } } /* The following is ripped from find_locale in locale.c. */ /* TODO: maybe CODESET_LIST + *s ??? */ /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */ codeset = 2; s = CODESET_LIST; do { ++codeset; /* Increment codeset first. */ if (!strcasecmp(CODESET_LIST+*s, name)) { return codeset; } } while (*++s); return 0; /* No matching codeset! */}iconv_t iconv_open(const char *tocode, const char *fromcode){ register _UC_iconv_t *px; int tocodeset, fromcodeset; if (((tocodeset = find_codeset(tocode)) != 0) && ((fromcodeset = find_codeset(fromcode)) != 0)) { if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) { px->tocodeset = tocodeset; px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4; px->fromcodeset0 = px->fromcodeset = fromcodeset; px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4; px->skip_invalid_input = px->tostate.mask = px->fromstate.mask = 0; return (iconv_t) px; } } else { __set_errno(EINVAL); } return (iconv_t)(-1);}int iconv_close(iconv_t cd){ free(cd); return 0;}size_t iconv(iconv_t cd, char **__restrict inbuf, size_t *__restrict inbytesleft, char **__restrict outbuf, size_t *__restrict outbytesleft){ _UC_iconv_t *px = (_UC_iconv_t *) cd; size_t nrcount, r; wchar_t wc, wc2; int inci, inco; assert(px != (_UC_iconv_t *)(-1)); assert(sizeof(wchar_t) == 4); if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */ /* Note: For shift-state encodings we possibly need to output the * shift sequence to return to initial state! */ if ((px->fromcodeset & 0xf0) == 0xe0) { } px->tostate.mask = px->fromstate.mask = 0; px->fromcodeset = px->fromcodeset0; px->tobom = px->tobom0; px->frombom = px->frombom0; return 0; } nrcount = 0; while (*inbytesleft) { if (!*outbytesleft) { TOO_BIG: __set_errno(E2BIG); return (size_t) -1; } inci = inco = 1; if (px->fromcodeset >= IC_MULTIBYTE) { inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6); if (*inbytesleft < inci) goto INVALID; wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8) + ((unsigned char)((*inbuf)[1])); if (inci == 4) { wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8) + ((unsigned char)((*inbuf)[3])) + (wc << 16); if (!(px->fromcodeset & 1)) wc = bswap_32(wc); } else { if (!(px->fromcodeset & 1)) wc = bswap_16(wc); if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16) && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U)) ) { /* surrogate */ wc =- 0xd800U; if (*inbytesleft < 4) goto INVALID; wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8) + ((unsigned char)((*inbuf)[3])); if (!(px->fromcodeset & 1)) wc = bswap_16(wc2); if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) { goto ILLEGAL; } inci = 4; /* Change inci here in case skipping illegals. */ wc = 0x10000UL + (wc << 10) + wc2; } } if (px->frombom) { px->frombom = 0; if ((wc == 0xfeffU) || (wc == ((inci == 4) ? (((wchar_t) 0xfffe0000UL)) : ((wchar_t)(0xfffeUL)))) ) { if (wc != 0xfeffU) { px->fromcodeset ^= 1; /* toggle endianness */ wc = 0xfeffU; } if (!px->frombom) { goto BOM_SKIP_OUTPUT; } goto GOT_BOM; } } if (px->fromcodeset != IC_WCHAR_T) { if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4) ? 0x7fffffffUL : 0x10ffffUL)#ifdef KUHN || (((__uwchar_t)(wc - 0xfffeU)) < 2) || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))#endif ) { goto ILLEGAL; } } } else if (px->fromcodeset == IC_UTF_8) { const char *p = *inbuf; r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0); if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */ if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */ assert((r == (size_t)(-1)) || (r == (size_t)(-2))); if (r == (size_t)(-2)) { INVALID: __set_errno(EINVAL); } else { px->fromstate.mask = 0; inci = 1; ILLEGAL: if (px->skip_invalid_input) { px->skip_invalid_input = 2; /* flag for iconv utility */ goto BOM_SKIP_OUTPUT; } __set_errno(EILSEQ); } return (size_t)(-1); }#ifdef __UCLIBC_MJN3_ONLY__#warning optimize this#endif if (p != NULL) { /* incomplet char case */ goto INVALID; } p = *inbuf + 1; /* nul */ } inci = p - *inbuf; } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */ if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */ goto ILLEGAL; } else { /* some other 8-bit ascii-extension codeset */ const codeset_8_bit_t *c8b = __locale_mmap->codeset_8_bit + px->fromcodeset - 3; wc -= 0x80; wc = __global_locale.tbl8c2wc[ (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT] << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))]; if (!wc) { goto ILLEGAL; } } } if (px->tobom) { inci = 0; wc = 0xfeffU; GOT_BOM: px->tobom = 0; } if (px->tocodeset >= IC_MULTIBYTE) { inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6); if (*outbytesleft < inci) goto TOO_BIG; if (px->tocodeset != IC_WCHAR_T) { if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4) ? 0x7fffffffUL : 0x10ffffUL)#ifdef KUHN || (((__uwchar_t)(wc - 0xfffeU)) < 2) || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))#endif ) { REPLACE_32: wc = 0xfffd; ++nrcount; } } if (inco == 4) { if (px->tocodeset & 1) wc = bswap_32(wc); } else { if (((__uwchar_t)wc ) > 0xffffU) { if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) { goto REPLACE_32; } if (*outbytesleft < (inco = 4)) goto TOO_BIG; wc2 = 0xdc00U + (wc & 0x3ff); wc = 0xd800U + ((wc >> 10) & 0x3ff); if (px->tocodeset & 1) { wc = bswap_16(wc); wc2 = bswap_16(wc2); } wc += (wc2 << 16); } else if (px->tocodeset & 1) wc = bswap_16(wc); } (*outbuf)[0] = (char)((unsigned char)(wc)); (*outbuf)[1] = (char)((unsigned char)(wc >> 8)); if (inco == 4) { (*outbuf)[2] = (char)((unsigned char)(wc >> 16)); (*outbuf)[3] = (char)((unsigned char)(wc >> 24)); } } else if (px->tocodeset == IC_UTF_8) { const wchar_t *pw = &wc; do { r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1); if (r != (size_t)(-1)) {#ifdef __UCLIBC_MJN3_ONLY__#warning what happens for a nul?#endif if (r == 0) { if (wc != 0) { goto TOO_BIG; } ++r; } break; } wc = 0xfffdU; ++nrcount; } while (1); inco = r; } else if (((__uwchar_t)(wc)) < 0x80) { CHAR_GOOD: **outbuf = wc; } else { if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) { const codeset_8_bit_t *c8b = __locale_mmap->codeset_8_bit + px->tocodeset - 3; __uwchar_t u; u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)]; u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + ((wc >> Cwc2c_TT_SHIFT) & ((1 << Cwc2c_TI_SHIFT)-1))]; wc = __global_locale.tbl8wc2c[Cwc2c_TI_LEN + (u << Cwc2c_TT_SHIFT) + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; if (wc) { goto CHAR_GOOD; } } **outbuf = '?'; ++nrcount; } *outbuf += inco; *outbytesleft -= inco; BOM_SKIP_OUTPUT: *inbuf += inci; *inbytesleft -= inci; } return nrcount;}#endif/**********************************************************************/#ifdef L_iconv_main#include <stdio.h>#include <stdlib.h>#include <string.h>#include <wchar.h>#include <iconv.h>#include <stdarg.h>#include <libgen.h>extern const unsigned char codesets[];#define IBUF BUFSIZ#define OBUF BUFSIZchar *progname;int hide_errors;static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2)));static void error_msg(const char *fmt, ...) { va_list arg; if (!hide_errors) { fprintf(stderr, "%s: ", progname); va_start(arg, fmt); vfprintf(stderr, fmt, arg); va_end(arg); } exit(EXIT_FAILURE);}int main(int argc, char **argv){ FILE *ifile; FILE *ofile = stdout; const char *p; const char *s; static const char opt_chars[] = "tfocsl"; /* 012345 */ const char *opts[sizeof(opt_chars)]; /* last is infile name */ iconv_t ic; char ibuf[IBUF]; char obuf[OBUF]; char *pi; char *po; size_t ni, no, r, pos; hide_errors = 0; for (s = opt_chars ; *s ; s++) { opts[ s - opt_chars ] = NULL; } progname = *argv; while (--argc) { p = *++argv; if ((*p != '-') || (*++p == 0)) { break; } do { if ((s = strchr(opt_chars,*p)) == NULL) { USAGE: s = basename(progname); fprintf(stderr, "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n" " or\n%s -l\n", s, s); return EXIT_FAILURE; } if ((s - opt_chars) < 3) { if ((--argc == 0) || opts[s - opt_chars]) { goto USAGE; } opts[s - opt_chars] = *++argv; } else { opts[s - opt_chars] = p; } } while (*++p); } if (opts[5]) { /* -l */ fprintf(stderr, "Recognized codesets:\n"); for (s = codesets ; *s ; s += *s) { fprintf(stderr," %s\n", s+2); } s = CODESET_LIST; do { fprintf(stderr," %s\n", CODESET_LIST+ (unsigned char)(*s)); } while (*++s); return EXIT_SUCCESS; } if (opts[4]) { hide_errors = 1; } if (!opts[0] || !opts[1]) { goto USAGE; } if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) { error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]); } if (opts[3]) { /* -c */ ((_UC_iconv_t *) ic)->skip_invalid_input = 1; } if ((s = opts[2]) != NULL) { if (!(ofile = fopen(s, "w"))) { error_msg( "couldn't open %s for writing\n", s); } } pos = ni = 0; do { if (!argc || ((**argv == '-') && !((*argv)[1]))) { ifile = stdin; /* we don't check for duplicates */ } else if (!(ifile = fopen(*argv, "r"))) { error_msg( "couldn't open %s for reading\n", *argv); } while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) { pos += r; ni += r; no = OBUF; pi = ibuf; po = obuf; if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) { if ((errno != EINVAL) && (errno != E2BIG)) { error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni)); } } if ((r = OBUF - no) > 0) { if (fwrite(obuf, 1, OBUF - no, ofile) < r) { error_msg( "write error\n"); } } if (ni) { /* still bytes in buffer! */ memmove(ibuf, pi, ni); } } if (ferror(ifile)) { error_msg( "read error\n"); } ++argv; if (ifile != stdin) { fclose(ifile); } } while (--argc > 0); iconv_close(ic); if (ni) { error_msg( "incomplete sequence\n"); } return (((_UC_iconv_t *) ic)->skip_invalid_input < 2) ? EXIT_SUCCESS : EXIT_FAILURE;}#endif/**********************************************************************/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -