iconv.c

来自「在Linux/Unix下面访问WINDOWS SQLSERVER 的ODBC驱动」· C语言 代码 · 共 1,453 行 · 第 1/3 页

C
1,453
字号
/* FreeTDS - Library of routines accessing Sybase and Microsoft databases * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005  Brian Bruns * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. *//* * iconv.c, handle all the conversion stuff without spreading #if HAVE_ICONV_ALWAYS  * all over the other code */#if HAVE_CONFIG_H#include <config.h>#endif#include <stdarg.h>#include <stdio.h>#include <assert.h>#if HAVE_STRING_H#include <string.h>#endif /* HAVE_STRING_H */#if HAVE_ERRNO_H#include <errno.h>#endif#include "tds.h"#include "tdsiconv.h"#if HAVE_ICONV#include <iconv.h>#endif#ifdef DMALLOC#include <dmalloc.h>#endif/* define this for now; remove when done testing */#define HAVE_ICONV_ALWAYS 1TDS_RCSID(var, "$Id: iconv.c,v 1.132 2007/01/02 20:47:05 jklowden Exp $");#define CHARSIZE(charset) ( ((charset)->min_bytes_per_char == (charset)->max_bytes_per_char )? \				(charset)->min_bytes_per_char : 0 )#if !HAVE_ICONV_ALWAYSstatic int bytes_per_char(TDS_ENCODING * charset);#endifstatic const char *collate2charset(int sql_collate, int lcid);static int skip_one_input_sequence(iconv_t cd, const TDS_ENCODING * charset, const char **input, size_t * input_size);static int tds_iconv_info_init(TDSICONV * char_conv, const char *client_name, const char *server_name);static int tds_iconv_init(void);static int tds_canonical_charset(const char *charset_name);static void _iconv_close(iconv_t * cd);static void tds_iconv_info_close(TDSICONV * char_conv);/** * \ingroup libtds * \defgroup conv Charset conversion * Convert between different charsets. */#include "encodings.h"/* this will contain real iconv names */static const char *iconv_names[sizeof(canonic_charsets) / sizeof(canonic_charsets[0])];static int iconv_initialized = 0;static const char *ucs2name;enum{ POS_ISO1, POS_UTF8, POS_UCS2LE, POS_UCS2BE };/** * Initialize charset searching for UTF-8, UCS-2 and ISO8859-1 */static inttds_iconv_init(void){	int i;	iconv_t cd;	/* first entries should be constants */	assert(strcmp(canonic_charsets[POS_ISO1].name, "ISO-8859-1") == 0);	assert(strcmp(canonic_charsets[POS_UTF8].name, "UTF-8") == 0);	assert(strcmp(canonic_charsets[POS_UCS2LE].name, "UCS-2LE") == 0);	assert(strcmp(canonic_charsets[POS_UCS2BE].name, "UCS-2BE") == 0);	/* fast tests for GNU-iconv */	cd = tds_sys_iconv_open("ISO-8859-1", "UTF-8");	if (cd != (iconv_t) - 1) {		iconv_names[POS_ISO1] = "ISO-8859-1";		iconv_names[POS_UTF8] = "UTF-8";		tds_sys_iconv_close(cd);	} else {		/* search names for ISO8859-1 and UTF-8 */		for (i = 0; iconv_aliases[i].alias; ++i) {			int j;			if (iconv_aliases[i].canonic != POS_ISO1)				continue;			for (j = 0; iconv_aliases[j].alias; ++j) {				if (iconv_aliases[j].canonic != POS_UTF8)					continue;				cd = tds_sys_iconv_open(iconv_aliases[i].alias, iconv_aliases[j].alias);				if (cd != (iconv_t) - 1) {					iconv_names[POS_ISO1] = iconv_aliases[i].alias;					iconv_names[POS_UTF8] = iconv_aliases[j].alias;					tds_sys_iconv_close(cd);					break;				}			}			if (iconv_names[POS_ISO1])				break;		}		/* required characters not found !!! */		if (!iconv_names[POS_ISO1])			return 1;	}	/* now search for UCS-2 */	cd = tds_sys_iconv_open(iconv_names[POS_ISO1], "UCS-2LE");	if (cd != (iconv_t) - 1) {		iconv_names[POS_UCS2LE] = "UCS-2LE";		tds_sys_iconv_close(cd);	}	cd = tds_sys_iconv_open(iconv_names[POS_ISO1], "UCS-2BE");	if (cd != (iconv_t) - 1) {		iconv_names[POS_UCS2BE] = "UCS-2BE";		tds_sys_iconv_close(cd);	}	/* long search needed ?? */	if (!iconv_names[POS_UCS2LE] || !iconv_names[POS_UCS2BE]) {		for (i = 0; iconv_aliases[i].alias; ++i) {			if (strncmp(canonic_charsets[iconv_aliases[i].canonic].name, "UCS-2", 5) != 0)				continue;			cd = tds_sys_iconv_open(iconv_aliases[i].alias, iconv_names[POS_ISO1]);			if (cd != (iconv_t) - 1) {				char ib[1];				char ob[4];				size_t il, ol;				ICONV_CONST char *pib;				char *pob;				int byte_sequence = 0;				/* try to convert 'A' and check result */				ib[0] = 0x41;				pib = ib;				pob = ob;				il = 1;				ol = 4;				ob[0] = ob[1] = 0;				if (tds_sys_iconv(cd, &pib, &il, &pob, &ol) != (size_t) - 1) {					/* byte order sequence ?? */					if (ol == 0) {						ob[0] = ob[2];						byte_sequence = 1;						/* TODO save somewhere */					}					/* save name without sequence (if present) */					if (ob[0])						il = POS_UCS2LE;					else						il = POS_UCS2BE;					if (!iconv_names[il] || !byte_sequence)						iconv_names[il] = iconv_aliases[i].alias;				}				tds_sys_iconv_close(cd);			}		}	}	/* we need a UCS-2 (big endian or little endian) */	if (!iconv_names[POS_UCS2LE] && !iconv_names[POS_UCS2BE])		return 2;	ucs2name = iconv_names[POS_UCS2LE] ? iconv_names[POS_UCS2LE] : iconv_names[POS_UCS2BE];	for (i = 0; i < 4; ++i)		tdsdump_log(TDS_DBG_INFO1, "names for %s: %s\n", canonic_charsets[i].name,			    iconv_names[i] ? iconv_names[i] : "(null)");	/* success (it should always occurs) */	return 0;}/** * Get iconv name given canonic */static voidtds_get_iconv_name(int charset){	int i;	iconv_t cd;	assert(iconv_initialized);	/* try using canonic name and UTF-8 and UCS2 */	cd = tds_sys_iconv_open(iconv_names[POS_UTF8], canonic_charsets[charset].name);	if (cd != (iconv_t) - 1) {		iconv_names[charset] = canonic_charsets[charset].name;		tds_sys_iconv_close(cd);		return;	}	cd = tds_sys_iconv_open(ucs2name, canonic_charsets[charset].name);	if (cd != (iconv_t) - 1) {		iconv_names[charset] = canonic_charsets[charset].name;		tds_sys_iconv_close(cd);		return;	}	/* try all alternatives */	for (i = 0; iconv_aliases[i].alias; ++i) {		if (iconv_aliases[i].canonic != charset)			continue;		cd = tds_sys_iconv_open(iconv_names[POS_UTF8], iconv_aliases[i].alias);		if (cd != (iconv_t) - 1) {			iconv_names[charset] = iconv_aliases[i].alias;			tds_sys_iconv_close(cd);			return;		}		cd = tds_sys_iconv_open(ucs2name, iconv_aliases[i].alias);		if (cd != (iconv_t) - 1) {			iconv_names[charset] = iconv_aliases[i].alias;			tds_sys_iconv_close(cd);			return;		}	}	/* charset not found, use memcpy */	iconv_names[charset] = "";}static voidtds_iconv_reset(TDSICONV *conv){	/*	 * (min|max)_bytes_per_char can be used to divide	 * so init to safe values	 */	conv->server_charset.min_bytes_per_char = 1;	conv->server_charset.max_bytes_per_char = 1;	conv->client_charset.min_bytes_per_char = 1;	conv->client_charset.max_bytes_per_char = 1;	conv->server_charset.name = conv->client_charset.name = "";	conv->to_wire = (iconv_t) - 1;	conv->to_wire2 = (iconv_t) - 1;	conv->from_wire = (iconv_t) - 1;	conv->from_wire2 = (iconv_t) - 1;}/** * Allocate iconv stuff * \return 0 for success */inttds_iconv_alloc(TDSSOCKET * tds){	int i;	TDSICONV *char_conv;	assert(!tds->char_convs);	if (!(tds->char_convs = (TDSICONV **) malloc(sizeof(TDSICONV *) * (initial_char_conv_count + 1))))	return 1;	char_conv = (TDSICONV *) malloc(sizeof(TDSICONV) * initial_char_conv_count);	if (!char_conv) {		TDS_ZERO_FREE(tds->char_convs);		return 1;	}	memset(char_conv, 0, sizeof(TDSICONV) * initial_char_conv_count);	tds->char_conv_count = initial_char_conv_count + 1;	for (i = 0; i < initial_char_conv_count; ++i) {		tds->char_convs[i] = &char_conv[i];		tds_iconv_reset(&char_conv[i]);	}	/* chardata is just a pointer to another iconv info */	tds->char_convs[initial_char_conv_count] = tds->char_convs[client2server_chardata];	return 0;}/** * \addtogroup conv * @{  * Set up the initial iconv conversion descriptors. * When the socket is allocated, three TDSICONV structures are attached to iconv.   * They have fixed meanings: * 	\li 0. Client <-> UCS-2 (client2ucs2) * 	\li 1. Client <-> server single-byte charset (client2server_chardata) *	\li 2. ISO8859-1  <-> server meta data	(iso2server_metadata) * * Other designs that use less data are possible, but these three conversion needs are  * very often needed.  By reserving them, we avoid searching the array for our most common purposes. * * To solve different iconv names and portability problems FreeTDS maintains  * a list of aliases each charset.   *  * First we discover the names of our minimum required charsets (UTF-8, ISO8859-1 and UCS2).   * Later, as and when it's needed, we try to discover others. * * There is one list of canonic names (GNU iconv names) and two sets of aliases * (one for other iconv implementations and another for Sybase). For every * canonic charset name we cache the iconv name found during discovery.  */voidtds_iconv_open(TDSSOCKET * tds, const char *charset){	static const char UCS_2LE[] = "UCS-2LE";	const char *name;	int fOK, ret;	TDS_ENCODING *client = &tds->char_convs[client2ucs2]->client_charset;	TDS_ENCODING *server = &tds->char_convs[client2ucs2]->server_charset;#if !HAVE_ICONV_ALWAYS	strcpy(client->name, "ISO-8859-1");	strcpy(server->name, UCS_2LE);	bytes_per_char(client);	bytes_per_char(server);	return;#else	/* initialize */	if (!iconv_initialized) {		if ((ret = tds_iconv_init()) > 0) {			static const char names[][12] = { "ISO 8859-1", "UTF-8" };			assert(ret < 3);			tdsdump_log(TDS_DBG_FUNC, "error: tds_iconv_init() returned %d; "						  "could not find a name for %s that your iconv accepts.\n"						  "use: \"configure --disable-libiconv\"", ret, names[ret-1]);			assert(ret == 0);			return;		}		iconv_initialized = 1;	}	/* 	 * Client <-> UCS-2 (client2ucs2)	 */	tdsdump_log(TDS_DBG_FUNC, "iconv to convert client-side data to the \"%s\" character set\n", charset);	fOK = tds_iconv_info_init(tds->char_convs[client2ucs2], charset, UCS_2LE);	if (!fOK)		return;	/* 	 * How many UTF-8 bytes we need is a function of what the input character set is.	 * TODO This could definitely be more sophisticated, but it deals with the common case.	 */	if (client->min_bytes_per_char == 1 && client->max_bytes_per_char == 4 && server->max_bytes_per_char == 1) {		/* ie client is UTF-8 and server is ISO-8859-1 or variant. */		client->max_bytes_per_char = 3;	}	/* 	 * Client <-> server single-byte charset	 * TODO: the server hasn't reported its charset yet, so this logic can't work here.  	 *       not sure what to do about that yet.  	 */	tds->char_convs[client2server_chardata]->flags = TDS_ENCODING_MEMCPY;	if (tds->env.charset) {		fOK = tds_iconv_info_init(tds->char_convs[client2server_chardata], charset, tds->env.charset);		if (!fOK)			return;	}	/* 	 * ISO8859-1 <-> server meta data	 */	name = UCS_2LE;	if (tds->major_version < 7) {		name = "ISO-8859-1";		if (tds->env.charset)			name = tds->env.charset;	}	fOK = tds_iconv_info_init(tds->char_convs[iso2server_metadata], "ISO-8859-1", name);#endif}/** * Open iconv descriptors to convert between character sets (both directions). * 1.  Look up the canonical names of the character sets. * 2.  Look up their widths. * 3.  Ask iconv to open a conversion descriptor. * 4.  Fail if any of the above offer any resistance.   * \remarks The charset names written to \a iconv will be the canonical names,  *          not necessarily the names passed in.  */static inttds_iconv_info_init(TDSICONV * char_conv, const char *client_name, const char *server_name){	TDS_ENCODING *client = &char_conv->client_charset;	TDS_ENCODING *server = &char_conv->server_charset;	int server_canonical, client_canonical;	assert(client_name && server_name);	assert(char_conv->to_wire == (iconv_t) - 1);	assert(char_conv->to_wire2 == (iconv_t) - 1);	assert(char_conv->from_wire == (iconv_t) - 1);	assert(char_conv->from_wire2 == (iconv_t) - 1);	client_canonical = tds_canonical_charset(client_name);	server_canonical = tds_canonical_charset(server_name);	if (client_canonical < 0) {		tdsdump_log(TDS_DBG_FUNC, "tds_iconv_info_init: client charset name \"%s\" unrecognized\n", client->name);		return 0;	}	if (server_canonical < 0) {		tdsdump_log(TDS_DBG_FUNC, "tds_iconv_info_init: server charset name \"%s\" unrecognized\n", client->name);		return 0;	}	*client = canonic_charsets[client_canonical];	*server = canonic_charsets[server_canonical];	/* special case, same charset, no conversion */	if (client_canonical == server_canonical) {		char_conv->to_wire = (iconv_t) - 1;		char_conv->from_wire = (iconv_t) - 1;		char_conv->flags = TDS_ENCODING_MEMCPY;		return 1;	}	char_conv->flags = 0;	if (!iconv_names[server_canonical]) {		switch (server_canonical) {		case POS_UCS2LE:			server_canonical = POS_UCS2BE;			char_conv->flags = TDS_ENCODING_SWAPBYTE;			break;		case POS_UCS2BE:			server_canonical = POS_UCS2LE;			char_conv->flags = TDS_ENCODING_SWAPBYTE;			break;		}	}	/* get iconv names */	if (!iconv_names[client_canonical])		tds_get_iconv_name(client_canonical);	if (!iconv_names[server_canonical])		tds_get_iconv_name(server_canonical);	/* names available ?? */	if (!iconv_names[client_canonical][0] || !iconv_names[server_canonical][0]) {		char_conv->to_wire = (iconv_t) - 1;		char_conv->from_wire = (iconv_t) - 1;		char_conv->flags = TDS_ENCODING_MEMCPY;		tdsdump_log(TDS_DBG_FUNC, "tds_iconv_info_init: use memcpy to convert \"%s\"->\"%s\"\n", client->name,			    server->name);		return 0;	}	char_conv->to_wire = tds_sys_iconv_open(iconv_names[server_canonical], iconv_names[client_canonical]);	if (char_conv->to_wire == (iconv_t) - 1) {		tdsdump_log(TDS_DBG_FUNC, "tds_iconv_info_init: cannot convert \"%s\"->\"%s\"\n", client->name, server->name);	}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?