📄 charcnv.c
字号:
/* Unix SMB/CIFS implementation. Character set conversion Extensions Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 Copyright (C) Andrew Tridgell 2001 Copyright (C) Simo Sorce 2001 Copyright (C) Martin Pool 2003 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*/#include "includes.h"/* We can parameterize this if someone complains.... JRA. */char lp_failed_convert_char(void){ return '_';}/** * @file * * @brief Character-set conversion routines built on our iconv. * * @note Samba's internal character set (at least in the 3.0 series) * is always the same as the one for the Unix filesystem. It is * <b>not</b> necessarily UTF-8 and may be different on machines that * need i18n filenames to be compatible with Unix software. It does * have to be a superset of ASCII. All multibyte sequences must start * with a byte with the high bit set. * * @sa lib/iconv.c */static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];static BOOL conv_silent; /* Should we do a debug if the conversion fails ? *//** * Return the name of a charset to give to iconv(). **/static const char *charset_name(charset_t ch){ const char *ret = NULL; if (ch == CH_UCS2) ret = "UTF-16LE"; else if (ch == CH_UNIX) ret = lp_unix_charset(); else if (ch == CH_DOS) ret = lp_dos_charset(); else if (ch == CH_DISPLAY) ret = lp_display_charset(); else if (ch == CH_UTF8) ret = "UTF8";#if defined(HAVE_NL_LANGINFO) && defined(CODESET) if (ret && !strcmp(ret, "LOCALE")) { const char *ln = NULL;#ifdef HAVE_SETLOCALE setlocale(LC_ALL, "");#endif ln = nl_langinfo(CODESET); if (ln) { /* Check whether the charset name is supported by iconv */ smb_iconv_t handle = smb_iconv_open(ln,"UCS-2LE"); if (handle == (smb_iconv_t) -1) { DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln)); ln = NULL; } else { DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln)); smb_iconv_close(handle); } } ret = ln; }#ifdef HAVE_SETLOCALE /* We set back the locale to C to get ASCII-compatible toupper/lower functions. For now we do not need any other POSIX localisations anyway. When we should really need localized string functions one day we need to write our own ascii_tolower etc. */ setlocale(LC_ALL, "C"); #endif#endif if (!ret || !*ret) ret = "ASCII"; return ret;}void lazy_initialize_conv(void){ static int initialized = False; if (!initialized) { initialized = True; load_case_tables(); init_iconv(); }}/** * Initialize iconv conversion descriptors. * * This is called the first time it is needed, and also called again * every time the configuration is reloaded, because the charset or * codepage might have changed. **/void init_iconv(void){ int c1, c2; BOOL did_reload = False; /* so that charset_name() works we need to get the UNIX<->UCS2 going first */ if (!conv_handles[CH_UNIX][CH_UCS2]) conv_handles[CH_UNIX][CH_UCS2] = smb_iconv_open(charset_name(CH_UCS2), "ASCII"); if (!conv_handles[CH_UCS2][CH_UNIX]) conv_handles[CH_UCS2][CH_UNIX] = smb_iconv_open("ASCII", charset_name(CH_UCS2)); for (c1=0;c1<NUM_CHARSETS;c1++) { for (c2=0;c2<NUM_CHARSETS;c2++) { const char *n1 = charset_name((charset_t)c1); const char *n2 = charset_name((charset_t)c2); if (conv_handles[c1][c2] && strcmp(n1, conv_handles[c1][c2]->from_name) == 0 && strcmp(n2, conv_handles[c1][c2]->to_name) == 0) continue; did_reload = True; if (conv_handles[c1][c2]) smb_iconv_close(conv_handles[c1][c2]); conv_handles[c1][c2] = smb_iconv_open(n2,n1); if (conv_handles[c1][c2] == (smb_iconv_t)-1) { DEBUG(0,("init_iconv: Conversion from %s to %s not supported\n", charset_name((charset_t)c1), charset_name((charset_t)c2))); if (c1 != CH_UCS2) { n1 = "ASCII"; } if (c2 != CH_UCS2) { n2 = "ASCII"; } DEBUG(0,("init_iconv: Attempting to replace with conversion from %s to %s\n", n1, n2 )); conv_handles[c1][c2] = smb_iconv_open(n2,n1); if (!conv_handles[c1][c2]) { DEBUG(0,("init_iconv: Conversion from %s to %s failed", n1, n2)); smb_panic("init_iconv: conv_handle initialization failed."); } } } } if (did_reload) { /* XXX: Does this really get called every time the dos * codepage changes? */ /* XXX: Is the did_reload test too strict? */ conv_silent = True; init_doschar_table(); init_valid_table(); conv_silent = False; }}/** * Convert string from one encoding to another, making error checking etc * Slow path version - uses (slow) iconv. * * @param src pointer to source string (multibyte or singlebyte) * @param srclen length of the source string in bytes * @param dest pointer to destination string (multibyte or singlebyte) * @param destlen maximal length allowed for string * @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors) * @returns the number of bytes occupied in the destination * * Ensure the srclen contains the terminating zero. * **/static size_t convert_string_internal(charset_t from, charset_t to, void const *src, size_t srclen, void *dest, size_t destlen, BOOL allow_bad_conv){ size_t i_len, o_len; size_t retval; const char* inbuf = (const char*)src; char* outbuf = (char*)dest; smb_iconv_t descriptor; lazy_initialize_conv(); descriptor = conv_handles[from][to]; if (srclen == (size_t)-1) { if (from == CH_UCS2) { srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2; } else { srclen = strlen((const char *)src)+1; } } if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { if (!conv_silent) DEBUG(0,("convert_string_internal: Conversion not supported.\n")); return (size_t)-1; } i_len=srclen; o_len=destlen; again: retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len); if(retval==(size_t)-1) { const char *reason="unknown error"; switch(errno) { case EINVAL: reason="Incomplete multibyte sequence"; if (!conv_silent) DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf)); if (allow_bad_conv) goto use_as_is; break; case E2BIG: reason="No more room"; if (!conv_silent) { if (from == CH_UNIX) { DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n", charset_name(from), charset_name(to), (unsigned int)srclen, (unsigned int)destlen, (const char *)src)); } else { DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n", charset_name(from), charset_name(to), (unsigned int)srclen, (unsigned int)destlen)); } } break; case EILSEQ: reason="Illegal multibyte sequence"; if (!conv_silent) DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf)); if (allow_bad_conv) goto use_as_is; break; default: if (!conv_silent) DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf)); break; } /* smb_panic(reason); */ } return destlen-o_len; use_as_is: /* * Conversion not supported. This is actually an error, but there are so * many misconfigured iconv systems and smb.conf's out there we can't just * fail. Do a very bad conversion instead.... JRA. */ { if (o_len == 0 || i_len == 0) return destlen - o_len; if (from == CH_UCS2 && to != CH_UCS2) { /* Can't convert from ucs2 to multibyte. Replace with the default fail char. */ if (i_len < 2) return destlen - o_len; if (i_len >= 2) { *outbuf = lp_failed_convert_char(); outbuf++; o_len--; inbuf += 2; i_len -= 2; } if (o_len == 0 || i_len == 0) return destlen - o_len; /* Keep trying with the next char... */ goto again; } else if (from != CH_UCS2 && to == CH_UCS2) { /* Can't convert to ucs2 - just widen by adding the default fail char then zero. */ if (o_len < 2) return destlen - o_len; outbuf[0] = lp_failed_convert_char(); outbuf[1] = '\0'; inbuf++; i_len--; outbuf += 2; o_len -= 2; if (o_len == 0 || i_len == 0) return destlen - o_len; /* Keep trying with the next char... */ goto again; } else if (from != CH_UCS2 && to != CH_UCS2) { /* Failed multibyte to multibyte. Just copy the default fail char and try again. */ outbuf[0] = lp_failed_convert_char(); inbuf++; i_len--; outbuf++; o_len--; if (o_len == 0 || i_len == 0) return destlen - o_len; /* Keep trying with the next char... */ goto again; } else { /* Keep compiler happy.... */ return destlen - o_len; } }}/** * Convert string from one encoding to another, making error checking etc * Fast path version - handles ASCII first. * * @param src pointer to source string (multibyte or singlebyte) * @param srclen length of the source string in bytes, or -1 for nul terminated. * @param dest pointer to destination string (multibyte or singlebyte) * @param destlen maximal length allowed for string - *NEVER* -1. * @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors) * @returns the number of bytes occupied in the destination * * Ensure the srclen contains the terminating zero. * * This function has been hand-tuned to provide a fast path. * Don't change unless you really know what you are doing. JRA. **/size_t convert_string(charset_t from, charset_t to, void const *src, size_t srclen, void *dest, size_t destlen, BOOL allow_bad_conv){ /* * NB. We deliberately don't do a strlen here if srclen == -1. * This is very expensive over millions of calls and is taken * care of in the slow path in convert_string_internal. JRA. */#ifdef DEVELOPER SMB_ASSERT(destlen != (size_t)-1);#endif if (srclen == 0) return 0; if (from != CH_UCS2 && to != CH_UCS2) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; size_t retval = 0; /* If all characters are ascii, fast path here. */ while (slen && dlen) { if ((lastp = *p) <= 0x7f) { *q++ = *p++; if (slen != (size_t)-1) { slen--; } dlen--; retval++; if (!lastp) break; } else {#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS goto general_case;#else return retval + convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);#endif } } if (!dlen) { /* Even if we fast path we should note if we ran out of room. */ if (((slen != (size_t)-1) && slen) || ((slen == (size_t)-1) && lastp)) { errno = E2BIG; } } return retval; } else if (from == CH_UCS2 && to != CH_UCS2) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t retval = 0; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; /* If all characters are ascii, fast path here. */ while (((slen == (size_t)-1) || (slen >= 2)) && dlen) { if (((lastp = *p) <= 0x7f) && (p[1] == 0)) { *q++ = *p; if (slen != (size_t)-1) { slen -= 2; } p += 2; dlen--; retval++; if (!lastp) break; } else {#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS goto general_case;#else return retval + convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);#endif } } if (!dlen) { /* Even if we fast path we should note if we ran out of room. */ if (((slen != (size_t)-1) && slen) || ((slen == (size_t)-1) && lastp)) { errno = E2BIG; } } return retval; } else if (from != CH_UCS2 && to == CH_UCS2) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t retval = 0; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; /* If all characters are ascii, fast path here. */ while (slen && (dlen >= 2)) { if ((lastp = *p) <= 0x7F) { *q++ = *p++; *q++ = '\0'; if (slen != (size_t)-1) { slen--; } dlen -= 2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -