📄 utf8.cpp

📁 一个开源的Flash 播放器,可以在Windows/Linux 上运行
💻 CPP
字号:
// utf8.cpp	-- Thatcher Ulrich 2004   -*- coding: utf-8;-*-

// This source code has been donated to the Public Domain.  Do
// whatever you want with it.  THE AUTHOR DOES NOT WARRANT THIS CODE.

// Utility code for dealing with UTF-8 encoded text.
//
// Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html


#include "base/utf8.h"


// Decode one character from the zero-terminated UTF-8 text at
// *utf8_buffer and advance *utf8_buffer past the bytes consumed.
//
// Returns:
//   - the decoded UCS value for a well-formed sequence (sequences of
//     one to six bytes are accepted, i.e. values up to 0x7FFFFFFF);
//   - 0 when a zero byte is reached; the pointer is left on the zero
//     byte, even if that byte cuts a multi-byte sequence short;
//   - 0xFFFD for malformed input: a bad lead byte, a bad continuation
//     byte (which is left unconsumed so decoding can resync on it),
//     an overlong encoding, a surrogate { 0xD800 .. 0xDFFF }, or
//     0xFFFE / 0xFFFF.
Uint32	utf8::decode_next_unicode_character(const char** utf8_buffer)
{
	// Security considerations:
	//
	// If we hit a zero byte, we want to return 0 without stepping
	// the buffer pointer past the 0.
	//
	// If we hit an "overlong sequence"; i.e. a character encoded
	// in a longer multibyte string than is necessary, then we
	// need to discard the character.  This is so attackers can't
	// disguise dangerous characters or character sequences --
	// there is only one valid encoding for each character.
	//
	// If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
	// 0xFFFF } then we ignore them; they are not valid in UTF-8.

	// Value returned for malformed input.  This isn't actually an
	// invalid character; it's a valid char (U+FFFD) used as a
	// stand-in.  Kept as a typed local constant rather than a
	// macro so that nothing leaks out of this function.
	const Uint32	invalid_char = 0x0FFFD;

	char	c = **utf8_buffer;
	if (c == 0) return 0;	// End of buffer.  Do not advance.

	(*utf8_buffer)++;
	if ((c & 0x80) == 0) return (Uint32) c;	// Conventional 7-bit ASCII.

	// Multi-byte sequence.  The lead byte tells us how many
	// continuation bytes follow, carries the top payload bits,
	// and fixes the smallest value that legitimately needs a
	// sequence of this length (anything smaller is overlong).
	int	continuation_count;
	Uint32	uc;
	Uint32	min_value;
	if ((c & 0xE0) == 0xC0)
	{
		// Two-byte sequence.
		continuation_count = 1;
		uc = c & 0x1F;
		min_value = 0x80;
	}
	else if ((c & 0xF0) == 0xE0)
	{
		// Three-byte sequence.
		continuation_count = 2;
		uc = c & 0x0F;
		min_value = 0x800;
	}
	else if ((c & 0xF8) == 0xF0)
	{
		// Four-byte sequence.
		continuation_count = 3;
		uc = c & 0x07;
		min_value = 0x010000;
	}
	else if ((c & 0xFC) == 0xF8)
	{
		// Five-byte sequence.
		continuation_count = 4;
		uc = c & 0x03;
		min_value = 0x0200000;
	}
	else if ((c & 0xFE) == 0xFC)
	{
		// Six-byte sequence.
		continuation_count = 5;
		uc = c & 0x01;
		min_value = 0x04000000;
	}
	else
	{
		// Stray continuation byte, or 0xFE / 0xFF.
		return invalid_char;
	}

	// Move the lead byte's bits above the 6 bits contributed by
	// each continuation byte.
	uc <<= 6 * continuation_count;

	for (int remaining = continuation_count; remaining > 0; remaining--)
	{
		c = **utf8_buffer;
		if (c == 0) return 0;	// end of buffer, do not advance
		if ((c & 0xC0) != 0x80) return invalid_char;	// not a continuation byte; leave it unconsumed
		(*utf8_buffer)++;
		uc |= (Uint32) (c & 0x3F) << (6 * (remaining - 1));
	}

	if (uc < min_value) return invalid_char;	// overlong

	// These values can only come from a three-byte sequence:
	// shorter sequences cannot reach them, and longer ones were
	// already rejected above as overlong.
	if (uc >= 0x0D800 && uc <= 0x0DFFF) return invalid_char;	// not valid ISO 10646
	if (uc == 0x0FFFE || uc == 0x0FFFF) return invalid_char;	// not valid ISO 10646

	return uc;
}


void	utf8::encode_unicode_character(char* buffer, int* index, Uint32 ucs_character)
{
	if (ucs_character <= 0x7F)
	{
		// Plain single-byte ASCII.
		buffer[(*index)++] = (char) ucs_character;
	}
	else if (ucs_character <= 0x7FF)
	{
		// Two bytes.
		buffer[(*index)++] = 0xC0 | (ucs_character >> 6);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 0) & 0x3F);
	}
	else if (ucs_character <= 0xFFFF)
	{
		// Three bytes.
		buffer[(*index)++] = 0xE0 | (ucs_character >> 12);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 6) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 0) & 0x3F);
	}
	else if (ucs_character <= 0x1FFFFF)
	{
		// Four bytes.
		buffer[(*index)++] = 0xF0 | (ucs_character >> 18);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 12) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 6) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 0) & 0x3F);
	}
	else if (ucs_character <= 0x3FFFFFF)
	{
		// Five bytes.
		buffer[(*index)++] = 0xF8 | (ucs_character >> 24);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 18) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 12) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 6) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 0) & 0x3F);
	}
	else if (ucs_character <= 0x7FFFFFFF)
	{
		// Six bytes.
		buffer[(*index)++] = 0xFC | (ucs_character >> 30);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 24) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 18) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 12) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 6) & 0x3F);
		buffer[(*index)++] = 0x80 | ((ucs_character >> 0) & 0x3F);
	}
	else
	{
		// Invalid char; don't encode anything.
	}
}


#ifdef UTF8_UNIT_TEST

// Compile this test case with something like:
//
// gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
//
//    or
//
// cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
//
// If possible, try running the test program with the first arg
// pointing at the file:
//
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
// 
// and examine the results by eye to make sure they are acceptable to
// you.


#include "base/utility.h"
#include <stdio.h>


bool	check_equal(const char* utf8_in, const Uint32* ucs_in)
{
	for (;;)
	{
		Uint32	next_ucs = *ucs_in++;
		Uint32	next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
		if (next_ucs != next_ucs_from_utf8)
		{
			return false;
		}
		if (next_ucs == 0)
		{
			assert(next_ucs_from_utf8 == 0);
			break;
		}
	}

	return true;
}


// Print the zero-terminated byte string `line` to stdout.  Newlines
// and printable ASCII (0x20 .. 0x7E) are printed as-is; every other
// byte is shown as "<0xNN>" so raw UTF-8 bytes and control characters
// are visible in the log.
void	log_ascii(const char* line)
{
	for (;;)
	{
		unsigned char	c = (unsigned char) *line++;
		if (c == 0)
		{
			// End of line.
			return;
		}
		else if (c != '\n'
			 && (c < 32 || c >= 127))
		{
			// Non-printable as plain ASCII.  This includes
			// DEL (0x7F), which is a control character.
			printf("<0x%02X>", (int) c);
		}
		else
		{
			printf("%c", c);
		}
	}
}


// Print the zero-terminated UCS string `line` to stdout.  Newlines
// and printable ASCII (0x20 .. 0x7E) are printed as-is; every other
// character is shown as "<U-XXXX>" with its value in hex.
void	log_ucs(const Uint32* line)
{
	for (;;)
	{
		Uint32	uc = *line++;
		if (uc == 0)
		{
			// End of line.
			return;
		}
		else if (uc != '\n'
			 && (uc < 32 || uc >= 127))
		{
			// Non-printable as plain ASCII.  This includes
			// DEL (0x7F), which is a control character.
			// %X expects an unsigned int; cast explicitly
			// since the underlying type of Uint32 isn't
			// visible here.
			printf("<U-%04X>", (unsigned int) uc);
		}
		else
		{
			printf("%c", (char) uc);
		}
	}
}


int main(int argc, const char* argv[])
{
	// Simple canned test.
	{
		const char*	test8 = "Ignacio Castaño";
		const Uint32	test32[] =
			{
				0x49, 0x67, 0x6E, 0x61, 0x63,
				0x69, 0x6F, 0x20, 0x43, 0x61,
				0x73, 0x74, 0x61, 0xF1, 0x6F,
				0x00
			};

		assert(check_equal(test8, test32));
	}

	// If user passed an arg, try reading the file as UTF-8 encoded text.
	if (argc > 1)
	{
		const char*	filename = argv[1];
		FILE*	fp = fopen(filename, "rb");
		if (fp == NULL)
		{
			printf("Can't open file '%s'\n", filename);
			return 1;
		}

		// Read lines from the file, encode/decode them, and highlight discrepancies.
		const int LINE_SIZE = 200;	// max line size
		char	line_buffer_utf8[LINE_SIZE];
		char	reencoded_utf8[6 * LINE_SIZE];
		Uint32	line_buffer_ucs[LINE_SIZE];

		int	byte_counter = 0;
		for (;;)
		{
			int	c = fgetc(fp);
			if (c == EOF)
			{
				// Done.
				break;
			}
			line_buffer_utf8[byte_counter++] = c;
			if (c == '\n' || byte_counter >= LINE_SIZE - 2)
			{
				// End of line.  Process the line.
				line_buffer_utf8[byte_counter++] = 0;	// terminate.

				// Decode into UCS.
				const char*	p = line_buffer_utf8;
				Uint32*	q = line_buffer_ucs;
				for (;;)
				{
					Uint32	uc = utf8::decode_next_unicode_character(&p);
					*q++ = uc;

					assert(q < line_buffer_ucs + LINE_SIZE);
					assert(p < line_buffer_utf8 + LINE_SIZE);

					if (uc == 0) break;
				}

				// Encode back into UTF-8.
				q = line_buffer_ucs;
				int	index = 0;
				for (;;)
				{
					Uint32	uc = *q++;
					assert(index < LINE_SIZE * 6 - 6);
					int	last_index = index;
					utf8::encode_unicode_character(reencoded_utf8, &index, uc);
					assert(index <= last_index + 6);
					if (uc == 0) break;
				}

// This can be useful for debugging.
#if 0
				// Show the UCS and the re-encoded UTF-8.
				log_ucs(line_buffer_ucs);
				log_ascii(reencoded_utf8);
#endif // 0

				assert(check_equal(line_buffer_utf8, line_buffer_ucs));
				assert(check_equal(reencoded_utf8, line_buffer_ucs));

				// Start next line.
				byte_counter = 0;
			}
		}

		fclose(fp);
	}

	return 0;
}


#endif // UTF8_UNIT_TEST


// Local Variables:
// mode: C++
// c-basic-offset: 8 
// tab-width: 8
// indent-tabs-mode: t
// End:
💿 文件大小 604 K
👤 上传用户 maple_78
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#Windows #Flash #Linux #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -