wchar.c

来自「从一个开源软件中摘取的正则表达式模块」· C语言代码 · 共 1,571 行 · 第 1/3 页
1,571 行
/*
 * conversion functions between pg_wchar and multibyte streams.
 * Tatsuo Ishii
 * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.66 2007/11/15 21:14:40 momjian Exp $
 *
 */
/* can be used in either frontend or backend */
#include "postgres.h"

#include "pg_wchar.h"


/*
 * conversion to pg_wchar is done by "table driven."
 * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
 * for the particular encoding. Note that if the encoding is only
 * supported in the client, you don't need to define
 * mb2wchar_with_len() function (SJIS is the case).
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.  We expect that mblen() does not need to examine more than
 * the first byte of the character to discover the correct length.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard. In particular
 * the NUL character is zero width and control characters are generally
 * width -1. It is recommended that non-ASCII encodings refer their ASCII
 * subset to the ASCII routines to ensure consistency.
*//* * SQL/ASCII */static intpg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	int			cnt = 0;	while (len > 0 && *from)	{		*to++ = *from++;		len--;		cnt++;	}	*to = 0;	return cnt;}static intpg_ascii_mblen(const unsigned char *s){	return 1;}static intpg_ascii_dsplen(const unsigned char *s){	if (*s == '\0')		return 0;	if (*s < 0x20 || *s == 0x7f)		return -1;	return 1;}/* * EUC */static intpg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	int			cnt = 0;	while (len > 0 && *from)	{		if (*from == SS2 && len >= 2)	/* JIS X 0201 (so called "1 byte										 * KANA") */		{			from++;			*to = (SS2 << 8) | *from++;			len -= 2;		}		else if (*from == SS3 && len >= 3)		/* JIS X 0212 KANJI */		{			from++;			*to = (SS3 << 16) | (*from++ << 8);			*to |= *from++;			len -= 3;		}		else if (IS_HIGHBIT_SET(*from) && len >= 2)		/* JIS X 0208 KANJI */		{			*to = *from++ << 8;			*to |= *from++;			len -= 2;		}		else			/* must be ASCII */		{			*to = *from++;			len--;		}		to++;		cnt++;	}	*to = 0;	return cnt;}static inline intpg_euc_mblen(const unsigned char *s){	int			len;	if (*s == SS2)		len = 2;	else if (*s == SS3)		len = 3;	else if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = 1;	return len;}static inline intpg_euc_dsplen(const unsigned char *s){	int			len;	if (*s == SS2)		len = 2;	else if (*s == SS3)		len = 2;	else if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = pg_ascii_dsplen(s);	return len;}/* * EUC_JP */static intpg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	return pg_euc2wchar_with_len(from, to, len);}static intpg_eucjp_mblen(const unsigned char *s){	return pg_euc_mblen(s);}static intpg_eucjp_dsplen(const unsigned char *s){	int			len;	if (*s == SS2)		len = 1;	else if (*s == SS3)		len = 2;	else if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = pg_ascii_dsplen(s);	return len;}/* * EUC_KR */static intpg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	return pg_euc2wchar_with_len(from, to, 
len);}static intpg_euckr_mblen(const unsigned char *s){	return pg_euc_mblen(s);}static intpg_euckr_dsplen(const unsigned char *s){	return pg_euc_dsplen(s);}/* * EUC_CN * */static intpg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	int			cnt = 0;	while (len > 0 && *from)	{		if (*from == SS2 && len >= 3)	/* code set 2 (unused?) */		{			from++;			*to = (SS2 << 16) | (*from++ << 8);			*to |= *from++;			len -= 3;		}		else if (*from == SS3 && len >= 3)		/* code set 3 (unsed ?) */		{			from++;			*to = (SS3 << 16) | (*from++ << 8);			*to |= *from++;			len -= 3;		}		else if (IS_HIGHBIT_SET(*from) && len >= 2)		/* code set 1 */		{			*to = *from++ << 8;			*to |= *from++;			len -= 2;		}		else		{			*to = *from++;			len--;		}		to++;		cnt++;	}	*to = 0;	return cnt;}static intpg_euccn_mblen(const unsigned char *s){	int			len;	if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = 1;	return len;}static intpg_euccn_dsplen(const unsigned char *s){	int			len;	if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = pg_ascii_dsplen(s);	return len;}/* * EUC_TW * */static intpg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len){	int			cnt = 0;	while (len > 0 && *from)	{		if (*from == SS2 && len >= 4)	/* code set 2 */		{			from++;			*to = (((uint32) SS2) << 24) | (*from++ << 16);			*to |= *from++ << 8;			*to |= *from++;			len -= 4;		}		else if (*from == SS3 && len >= 3)		/* code set 3 (unused?) 
*/		{			from++;			*to = (SS3 << 16) | (*from++ << 8);			*to |= *from++;			len -= 3;		}		else if (IS_HIGHBIT_SET(*from) && len >= 2)		/* code set 2 */		{			*to = *from++ << 8;			*to |= *from++;			len -= 2;		}		else		{			*to = *from++;			len--;		}		to++;		cnt++;	}	*to = 0;	return cnt;}static intpg_euctw_mblen(const unsigned char *s){	int			len;	if (*s == SS2)		len = 4;	else if (*s == SS3)		len = 3;	else if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = 1;	return len;}static intpg_euctw_dsplen(const unsigned char *s){	int			len;	if (*s == SS2)		len = 2;	else if (*s == SS3)		len = 2;	else if (IS_HIGHBIT_SET(*s))		len = 2;	else		len = pg_ascii_dsplen(s);	return len;}/* * JOHAB */static intpg_johab_mblen(const unsigned char *s){	return pg_euc_mblen(s);}static intpg_johab_dsplen(const unsigned char *s){	return pg_euc_dsplen(s);}/* * convert UTF8 string to pg_wchar (UCS-4) * caller must allocate enough space for "to", including a trailing zero! * len: length of from. * "from" not necessarily null terminated. 
*/
/*
 * Decoding rules, chosen from the lead byte:
 *	  0xxxxxxx	1 byte,  copied as is
 *	  110xxxxx	2 bytes, 5 + 6 payload bits
 *	  1110xxxx	3 bytes, 4 + 6 + 6 payload bits
 *	  11110xxx	4 bytes, 3 + 6 + 6 + 6 payload bits
 *	  other		copied as a single unit without raising an error
 *
 * Continuation bytes are only masked with 0x3f, not validated.  If fewer
 * bytes remain (per len) than the lead byte calls for, conversion stops and
 * the incomplete trailing character is dropped.  Conversion also stops at a
 * NUL in lead-byte position.  "to" is always zero-terminated; the return
 * value is the number of pg_wchars stored, not counting the terminator.
 */
static int
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
	int			cnt = 0;
	uint32		c1,
				c2,
				c3,
				c4;

	while (len > 0 && *from)
	{
		if ((*from & 0x80) == 0)
		{
			*to = *from++;
			len--;
		}
		else if ((*from & 0xe0) == 0xc0)
		{
			if (len < 2)
				break;			/* drop trailing incomplete char */
			c1 = *from++ & 0x1f;
			c2 = *from++ & 0x3f;
			*to = (c1 << 6) | c2;
			len -= 2;
		}
		else if ((*from & 0xf0) == 0xe0)
		{
			if (len < 3)
				break;			/* drop trailing incomplete char */
			c1 = *from++ & 0x0f;
			c2 = *from++ & 0x3f;
			c3 = *from++ & 0x3f;
			*to = (c1 << 12) | (c2 << 6) | c3;
			len -= 3;
		}
		else if ((*from & 0xf8) == 0xf0)
		{
			if (len < 4)
				break;			/* drop trailing incomplete char */
			c1 = *from++ & 0x07;
			c2 = *from++ & 0x3f;
			c3 = *from++ & 0x3f;
			c4 = *from++ & 0x3f;
			*to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
			len -= 4;
		}
		else
		{
			/* treat a bogus char as length 1; not ours to raise error */
			*to = *from++;
			len--;
		}
		to++;
		cnt++;
	}
	*to = 0;
	return cnt;
}

/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
*/intpg_utf_mblen(const unsigned char *s){	int			len;	if ((*s & 0x80) == 0)		len = 1;	else if ((*s & 0xe0) == 0xc0)		len = 2;	else if ((*s & 0xf0) == 0xe0)		len = 3;	else if ((*s & 0xf8) == 0xf0)		len = 4;#ifdef NOT_USED	else if ((*s & 0xfc) == 0xf8)		len = 5;	else if ((*s & 0xfe) == 0xfc)		len = 6;#endif	else		len = 1;	return len;}/* * This is an implementation of wcwidth() and wcswidth() as defined in * "The Single UNIX Specification, Version 2, The Open Group, 1997" * <http://www.UNIX-systems.org/online.html> * * Markus Kuhn -- 2001-09-08 -- public domain * * customised for PostgreSQL * * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c */struct mbinterval{	unsigned short first;	unsigned short last;};/* auxiliary function for binary search in interval table */static intmbbisearch(pg_wchar ucs, const struct mbinterval * table, int max){	int			min = 0;	int			mid;	if (ucs < table[0].first || ucs > table[max].last)		return 0;	while (max >= min)	{		mid = (min + max) / 2;		if (ucs > table[mid].last)			min = mid + 1;		else if (ucs < table[mid].first)			max = mid - 1;		else			return 1;	}	return 0;}/* The following functions define the column width of an ISO 10646 * character as follows: * *	  - The null character (U+0000) has a column width of 0. * *	  - Other C0/C1 control characters and DEL will lead to a return *		value of -1. * *	  - Non-spacing and enclosing combining characters (general *		category code Mn or Me in the Unicode database) have a *		column width of 0. * *	  - Other format characters (general category code Cf in the Unicode *		database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. * *	  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) *		have a column width of 0. * *	  - Spacing characters in the East Asian Wide (W) or East Asian *		FullWidth (F) category as defined in Unicode Technical *		Report #11 have a column width of 2. 
* *	  - All remaining characters (including all printable *		ISO 8859-1 and WGL4 characters, Unicode control characters, *		etc.) have a column width of 1. * * This implementation assumes that wchar_t characters are encoded * in ISO 10646. */static int
wchar.c - 源码说明

本页面展示了「从一个开源软件中摘取的正则表达式模块」中的 wchar.c 源码文件，采用 C语言编程语言编写，共 1,571 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与开源软件相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?