utf8encoding.cs

来自「没的没的没的没的没的没的没的没的没的没的没的没的没的没」· CS 代码 · 共 1,015 行 · 第 1/2 页
1,015 行
/*
 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
 *
 * Copyright (C) 2001, 2002  Southern Storm Software, Pty Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

namespace System.Text
{

using System;

[Serializable]
public class UTF8Encoding : Encoding
{
	// Magic number used by Windows for UTF-8.
	internal const int UTF8_CODE_PAGE = 65001;

	// Internal state.  Both flags are captured from the constructor
	// arguments; the code that consumes them is not in this part of
	// the file.
	private bool emitIdentifier;
	private bool throwOnInvalid;

	// Constructors.  The shorter forms default the omitted flags to "false".
	public UTF8Encoding()
		: this(false, false) {}

	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
		: this(encoderShouldEmitUTF8Identifier, false) {}

	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier,
						bool throwOnInvalidBytes)
		: base(UTF8_CODE_PAGE)
	{
		emitIdentifier = encoderShouldEmitUTF8Identifier;
		throwOnInvalid = throwOnInvalidBytes;
	}

	// Internal version of "GetByteCount" which can handle a rolling
	// state between multiple calls to this method.
	//
	// "leftOver" is a surrogate start carried in from a previous call,
	// or zero if there is none.  When "flush" is true, a surrogate start
	// that is still pending at the end of the buffer is counted as a
	// three-byte sequence.
	//
	// Throws ArgumentNullException if "chars" is null, and
	// ArgumentOutOfRangeException if "index" or "count" do not describe
	// a valid range within "chars".
	private static int InternalGetByteCount(char[] chars, int index,
											int count, uint leftOver,
											bool flush)
	{
		// Validate the parameters.
		if(chars == null)
		{
			throw new ArgumentNullException("chars");
		}
		if(index < 0 || index > chars.Length)
		{
			throw new ArgumentOutOfRangeException
				("index", _("ArgRange_Array"));
		}
		if(count < 0 || count > (chars.Length - index))
		{
			throw new ArgumentOutOfRangeException
				("count", _("ArgRange_Array"));
		}

		// Determine the lengths of all characters.
		char ch;
		int length = 0;
		uint pair = leftOver;
		while(count > 0)
		{
			ch = chars[index];
			if(pair == 0)
			{
				if(ch < '\u0080')
				{
					++length;
				}
				else if(ch < '\u0800')
				{
					length += 2;
				}
				else if(ch >= '\uD800' && ch <= '\uDBFF')
				{
					// This is the start of a surrogate pair.  Nothing
					// is counted until we see what follows it.
					pair = (uint)ch;
				}
				else
				{
					length += 3;
				}
			}
			else if(ch >= '\uDC00' && ch <= '\uDFFF')
			{
				// We have a surrogate pair.
				length += 4;
				pair = 0;
			}
			else
			{
				// We have a surrogate start followed by a
				// regular character.  Technically, this is
				// invalid, but we have to do something.
				// We write out the surrogate start and then
				// re-visit the current character again.
				length += 3;
				pair = 0;
				continue;	// "index" and "count" are deliberately unchanged.
			}
			++index;
			--count;
		}
		if(flush && pair != 0)
		{
			// Flush the left-over surrogate pair start.
			length += 3;
		}

		// Return the final length to the caller.
		return length;
	}

	// Get the number of bytes needed to encode a character buffer.
	// The buffer is treated as self-contained: no carried-in surrogate
	// start, and a trailing one is flushed.
	public override int GetByteCount(char[] chars, int index, int count)
	{
		return InternalGetByteCount(chars, index, count, 0, true);
	}

	// Convenience wrappers for "GetByteCount".
	//
	// Counts the bytes needed to encode the whole of "s".  A surrogate
	// start immediately followed by a surrogate end counts as four bytes;
	// any other surrogate counts as three.
	//
	// Throws ArgumentNullException if "s" is null.
	public override int GetByteCount(String s)
	{
		// Validate the parameters.
		if(s == null)
		{
			throw new ArgumentNullException("s");
		}

		// Determine the lengths of all characters.
		char ch;
		int index = 0;
		int count = s.Length;
		int length = 0;
		uint pair;
		while(count > 0)
		{
			ch = s[index++];
			if(ch < '\u0080')
			{
				++length;
			}
			else if(ch < '\u0800')
			{
				length += 2;
			}
			else if(ch >= '\uD800' && ch <= '\uDBFF' && count > 1)
			{
				// This may be the start of a surrogate pair.  "count"
				// still includes "ch" here, so "count > 1" means that
				// at least one more character follows it.
				pair = (uint)(s[index]);
				if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
				{
					length += 4;
					++index;
					--count;
				}
				else
				{
					length += 3;
				}
			}
			else
			{
				length += 3;
			}
			--count;
		}

		// Return the final length to the caller.
		return length;
	}

	// Internal version of "GetBytes" which can handle a rolling
	// state between multiple calls to this method.
	//
	// On entry, "leftOver" holds a surrogate start carried in from a
	// previous call, or zero.  On a normal return it holds the surrogate
	// start that is still pending, or zero; it is not updated if an
	// exception is thrown.  When "flush" is true, a pending surrogate
	// start is written out as a three-byte sequence at the end.
	//
	// Returns the number of bytes written, starting at "byteIndex".
	//
	// Throws ArgumentNullException if "chars" or "bytes" is null,
	// ArgumentOutOfRangeException if the index/count arguments are out
	// of range, and ArgumentException if "bytes" is too small.  Bytes
	// for earlier characters may already have been written by then.
	private static int InternalGetBytes(char[] chars, int charIndex,
										int charCount, byte[] bytes,
										int byteIndex, ref uint leftOver,
										bool flush)
	{
		// Validate the parameters.
		if(chars == null)
		{
			throw new ArgumentNullException("chars");
		}
		if(bytes == null)
		{
			throw new ArgumentNullException("bytes");
		}
		if(charIndex < 0 || charIndex > chars.Length)
		{
			throw new ArgumentOutOfRangeException
				("charIndex", _("ArgRange_Array"));
		}
		if(charCount < 0 || charCount > (chars.Length - charIndex))
		{
			throw new ArgumentOutOfRangeException
				("charCount", _("ArgRange_Array"));
		}
		if(byteIndex < 0 || byteIndex > bytes.Length)
		{
			throw new ArgumentOutOfRangeException
				("byteIndex", _("ArgRange_Array"));
		}

		// Convert the characters into bytes.
		char ch;
		int length = bytes.Length;
		uint pair;
		uint left = leftOver;
		int posn = byteIndex;
		while(charCount > 0)
		{
			// Fetch the next UTF-16 character pair value.
			ch = chars[charIndex++];
			--charCount;
			if(left == 0)
			{
				if(ch >= '\uD800' && ch <= '\uDBFF')
				{
					// This is the start of a surrogate pair.
					left = (uint)ch;
					continue;
				}
				else
				{
					// This is a regular character.
					pair = (uint)ch;
				}
			}
			else if(ch >= '\uDC00' && ch <= '\uDFFF')
			{
				// We have a surrogate pair.
				pair = ((left - (uint)0xD800) << 10) +
					   (((uint)ch) - (uint)0xDC00) +
					   (uint)0x10000;
				left = 0;
			}
			else
			{
				// We have a surrogate start followed by a
				// regular character.  Technically, this is
				// invalid, but we have to do something.
				// We write out the surrogate start and then
				// re-visit the current character again.
				pair = (uint)left;
				left = 0;
				--charIndex;
				++charCount;
			}

			// Encode the character pair value.
			if(pair < (uint)0x0080)
			{
				if(posn >= length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)pair;
			}
			else if(pair < (uint)0x0800)
			{
				if((posn + 2) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xC0 | (pair >> 6));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
			else if(pair < (uint)0x10000)
			{
				if((posn + 3) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xE0 | (pair >> 12));
				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
			else
			{
				if((posn + 4) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xF0 | (pair >> 18));
				bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
		}
		if(flush && left != 0)
		{
			// Flush the left-over surrogate pair start.
			if((posn + 3) > length)
			{
				throw new ArgumentException
					(_("Arg_InsufficientSpace"), "bytes");
			}
			bytes[posn++] = (byte)(0xE0 | (left >> 12));
			bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
			bytes[posn++] = (byte)(0x80 | (left & 0x3F));
			left = 0;
		}
		leftOver = left;

		// Return the final count to the caller.
		return posn - byteIndex;
	}

	// Get the bytes that result from encoding a character buffer.
	// The buffer is treated as self-contained: no carried-in surrogate
	// start, and a trailing one is flushed.
	public override int GetBytes(char[] chars, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
	{
		uint leftOver = 0;
		return InternalGetBytes(chars, charIndex, charCount,
								bytes, byteIndex, ref leftOver, true);
	}

	// Convenience wrappers for "GetBytes".
	//
	// Encodes "charCount" characters of "s", starting at "charIndex",
	// into "bytes" starting at "byteIndex".  A surrogate start that is
	// immediately followed by a surrogate end within the range is written
	// as one four-byte sequence; any other surrogate is written as a
	// three-byte sequence.
	//
	// Returns the number of bytes written.
	//
	// Throws ArgumentNullException if "s" or "bytes" is null,
	// ArgumentOutOfRangeException if the index/count arguments are out
	// of range, and ArgumentException if "bytes" is too small.  Bytes
	// for earlier characters may already have been written by then.
	public override int GetBytes(String s, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
	{
		// Validate the parameters.
		if(s == null)
		{
			throw new ArgumentNullException("s");
		}
		if(bytes == null)
		{
			throw new ArgumentNullException("bytes");
		}
		if(charIndex < 0 || charIndex > s.Length)
		{
			throw new ArgumentOutOfRangeException
				("charIndex", _("ArgRange_StringIndex"));
		}
		if(charCount < 0 || charCount > (s.Length - charIndex))
		{
			throw new ArgumentOutOfRangeException
				("charCount", _("ArgRange_StringRange"));
		}
		if(byteIndex < 0 || byteIndex > bytes.Length)
		{
			throw new ArgumentOutOfRangeException
				("byteIndex", _("ArgRange_Array"));
		}

		// Convert the characters into bytes.
		char ch;
		int length = bytes.Length;
		uint pair;
		int posn = byteIndex;
		while(charCount > 0)
		{
			// Fetch the next UTF-16 character pair value.
			ch = s[charIndex++];
			--charCount;

			// "charCount" no longer includes "ch" at this point, so a
			// single remaining character is enough for the look-ahead.
			// Testing for more than one would leave a surrogate pair at
			// the very end of the range unmerged, writing six bytes where
			// "GetByteCount(String)" reports four.
			if(ch >= '\uD800' && ch <= '\uDBFF' && charCount > 0)
			{
				// This may be the start of a surrogate pair.
				pair = (uint)(s[charIndex]);
				if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
				{
					pair = (pair - (uint)0xDC00) +
						   ((((uint)ch) - (uint)0xD800) << 10) +
						   (uint)0x10000;
					++charIndex;
					--charCount;
				}
				else
				{
					pair = (uint)ch;
				}
			}
			else
			{
				pair = (uint)ch;
			}

			// Encode the character pair value.
			if(pair < (uint)0x0080)
			{
				if(posn >= length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)pair;
			}
			else if(pair < (uint)0x0800)
			{
				if((posn + 2) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xC0 | (pair >> 6));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
			else if(pair < (uint)0x10000)
			{
				if((posn + 3) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xE0 | (pair >> 12));
				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
			else
			{
				if((posn + 4) > length)
				{
					throw new ArgumentException
						(_("Arg_InsufficientSpace"), "bytes");
				}
				bytes[posn++] = (byte)(0xF0 | (pair >> 18));
				bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
			}
		}

		// Return the final count to the caller.
		return posn - byteIndex;
	}

	// Encode an entire string; simply defers to the base class.
	public override byte[] GetBytes(String s)
	{
		return base.GetBytes(s);
	}

	// Internal version of "GetCharCount" which can handle a rolling
	// state between multiple calls to this method.
private static int InternalGetCharCount(byte[] bytes, int index, int count,										    uint leftOverBits,										    uint leftOverCount,										    bool throwOnInvalid, bool flush)			{				// Validate the parameters.				if(bytes == null)				{					throw new ArgumentNullException("bytes");				}				if(index < 0 || index > bytes.Length)				{					throw new ArgumentOutOfRangeException						("index", _("ArgRange_Array"));				}				if(count < 0 || count > (bytes.Length - index))				{					throw new ArgumentOutOfRangeException						("count", _("ArgRange_Array"));				}				// Determine the number of characters that we have.				uint ch;				int length = 0;				uint leftBits = leftOverBits;				uint leftSoFar = (leftOverCount & (uint)0x0F);				uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);				while(count > 0)				{					ch = (uint)(bytes[index++]);					--count;					if(leftSize == 0)					{						// Process a UTF-8 start character.						if(ch < (uint)0x0080)						{							// Single-byte UTF-8 character.							++length;						}						else if((ch & (uint)0xE0) == (uint)0xC0)						{							// Double-byte UTF-8 character.							leftBits = (ch & (uint)0x1F);							leftSoFar = 1;							leftSize = 2;						}						else if((ch & (uint)0xF0) == (uint)0xE0)						{							// Three-byte UTF-8 character.							leftBits = (ch & (uint)0x0F);							leftSoFar = 1;							leftSize = 3;						}						else if((ch & (uint)0xF8) == (uint)0xF0)						{							// Four-byte UTF-8 character.							leftBits = (ch & (uint)0x07);
utf8encoding.cs - 源码说明

本页面展示了「没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的没的」中的 utf8encoding.cs 源码文件，采用 CS 编程语言编写，共 1,015 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?