utf8encoding.cs

来自「没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没」· CS 代码 · 共 1,015 行 · 第 1/2 页

CS
1,015
字号
/*
 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
 *
 * Copyright (C) 2001, 2002  Southern Storm Software, Pty Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

namespace System.Text
{

using System;

[Serializable]
public class UTF8Encoding : Encoding
{
	// Magic number used by Windows for UTF-8.
	internal const int UTF8_CODE_PAGE = 65001;

	// Internal state.  Both flags are set once by the constructor from
	// the arguments of the same meaning.
	private bool emitIdentifier;
	private bool throwOnInvalid;

	// Constructors.  The shorter forms default the missing flags to false.
	public UTF8Encoding()
			: this(false, false) {}
	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
			: this(encoderShouldEmitUTF8Identifier, false) {}
	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier,
						bool throwOnInvalidBytes)
			: base(UTF8_CODE_PAGE)
			{
				emitIdentifier = encoderShouldEmitUTF8Identifier;
				throwOnInvalid = throwOnInvalidBytes;
			}

	// Internal version of "GetByteCount" which can handle a rolling
	// state between multiple calls to this method.
	//
	// "leftOver" is a pending surrogate start (0xD800..0xDBFF) carried
	// over from a previous call, or zero if there is none.  If "flush"
	// is true, a surrogate start that is still pending at the end of
	// the buffer is counted as a 3-byte sequence.
	//
	// Throws ArgumentNullException if "chars" is null, and
	// ArgumentOutOfRangeException if "index" or "count" do not
	// describe a valid range within "chars".
	private static int InternalGetByteCount(char[] chars, int index,
											int count, uint leftOver,
											bool flush)
			{
				// Validate the parameters.
				if(chars == null)
				{
					throw new ArgumentNullException("chars");
				}
				if(index < 0 || index > chars.Length)
				{
					throw new ArgumentOutOfRangeException
						("index", _("ArgRange_Array"));
				}
				if(count < 0 || count > (chars.Length - index))
				{
					throw new ArgumentOutOfRangeException
						("count", _("ArgRange_Array"));
				}

				// Determine the lengths of all characters.
				char ch;
				int length = 0;
				uint pair = leftOver;
				while(count > 0)
				{
					ch = chars[index];
					if(pair == 0)
					{
						if(ch < '\u0080')
						{
							++length;
						}
						else if(ch < '\u0800')
						{
							length += 2;
						}
						else if(ch >= '\uD800' && ch <= '\uDBFF')
						{
							// This is the start of a surrogate pair.
							// Nothing is counted until we see what follows.
							pair = (uint)ch;
						}
						else
						{
							length += 3;
						}
					}
					else if(ch >= '\uDC00' && ch <= '\uDFFF')
					{
						// We have a surrogate pair.
						length += 4;
						pair = 0;
					}
					else
					{
						// We have a surrogate start followed by a
						// regular character.  Technically, this is
						// invalid, but we have to do something.
						// We write out the surrogate start and then
						// re-visit the current character again
						// ("index" and "count" are left untouched).
						length += 3;
						pair = 0;
						continue;
					}
					++index;
					--count;
				}
				if(flush && pair != 0)
				{
					// Flush the left-over surrogate pair start.
					length += 3;
				}

				// Return the final length to the caller.
				return length;
			}

	// Get the number of bytes needed to encode a character buffer.
	// No state is carried in, and a trailing surrogate start is flushed.
	public override int GetByteCount(char[] chars, int index, int count)
			{
				return InternalGetByteCount(chars, index, count, 0, true);
			}

	// Convenience wrappers for "GetByteCount".
	//
	// Counts the bytes needed to encode the whole of "s".  Throws
	// ArgumentNullException if "s" is null.
	public override int GetByteCount(String s)
			{
				// Validate the parameters.
				if(s == null)
				{
					throw new ArgumentNullException("s");
				}

				// Determine the lengths of all characters.
				char ch;
				int index = 0;
				int count = s.Length;
				int length = 0;
				uint pair;
				while(count > 0)
				{
					ch = s[index++];
					if(ch < '\u0080')
					{
						++length;
					}
					else if(ch < '\u0800')
					{
						length += 2;
					}
					else if(ch >= '\uD800' && ch <= '\uDBFF' && count > 1)
					{
						// This may be the start of a surrogate pair.
						// "count" still includes the current character
						// here, so "count > 1" means one more follows.
						pair = (uint)(s[index]);
						if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
						{
							length += 4;
							++index;
							--count;
						}
						else
						{
							length += 3;
						}
					}
					else
					{
						length += 3;
					}
					--count;
				}

				// Return the final length to the caller.
				return length;
			}

	// Internal version of "GetBytes" which can handle a rolling
	// state between multiple calls to this method.
	//
	// On entry, "leftOver" is a pending surrogate start from a previous
	// call (or zero).  On normal return it holds the surrogate start
	// that is still pending, which is always zero when "flush" is true.
	// Returns the number of bytes written at "byteIndex".
	//
	// Throws ArgumentNullException if "chars" or "bytes" is null,
	// ArgumentOutOfRangeException if an index or count is out of range,
	// and ArgumentException if "bytes" is too small for the output.
	private static int InternalGetBytes(char[] chars, int charIndex,
									    int charCount, byte[] bytes,
									    int byteIndex, ref uint leftOver,
										bool flush)
			{
				// Validate the parameters.
				if(chars == null)
				{
					throw new ArgumentNullException("chars");
				}
				if(bytes == null)
				{
					throw new ArgumentNullException("bytes");
				}
				if(charIndex < 0 || charIndex > chars.Length)
				{
					throw new ArgumentOutOfRangeException
						("charIndex", _("ArgRange_Array"));
				}
				if(charCount < 0 || charCount > (chars.Length - charIndex))
				{
					throw new ArgumentOutOfRangeException
						("charCount", _("ArgRange_Array"));
				}
				if(byteIndex < 0 || byteIndex > bytes.Length)
				{
					throw new ArgumentOutOfRangeException
						("byteIndex", _("ArgRange_Array"));
				}

				// Convert the characters into bytes.
				char ch;
				int length = bytes.Length;
				uint pair;
				uint left = leftOver;
				int posn = byteIndex;
				while(charCount > 0)
				{
					// Fetch the next UTF-16 character pair value.
					ch = chars[charIndex++];
					--charCount;
					if(left == 0)
					{
						if(ch >= '\uD800' && ch <= '\uDBFF')
						{
							// This is the start of a surrogate pair.
							left = (uint)ch;
							continue;
						}
						else
						{
							// This is a regular character.
							pair = (uint)ch;
						}
					}
					else if(ch >= '\uDC00' && ch <= '\uDFFF')
					{
						// We have a surrogate pair.
						pair = ((left - (uint)0xD800) << 10) +
							   (((uint)ch) - (uint)0xDC00) +
							   (uint)0x10000;
						left = 0;
					}
					else
					{
						// We have a surrogate start followed by a
						// regular character.  Technically, this is
						// invalid, but we have to do something.
						// We write out the surrogate start and then
						// re-visit the current character again.
						pair = (uint)left;
						left = 0;
						--charIndex;
						++charCount;
					}

					// Encode the character pair value.
					if(pair < (uint)0x0080)
					{
						if(posn >= length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)pair;
					}
					else if(pair < (uint)0x0800)
					{
						if((posn + 2) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xC0 | (pair >> 6));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else if(pair < (uint)0x10000)
					{
						if((posn + 3) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xE0 | (pair >> 12));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else
					{
						if((posn + 4) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xF0 | (pair >> 18));
						bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
				}
				if(flush && left != 0)
				{
					// Flush the left-over surrogate pair start.
					if((posn + 3) > length)
					{
						throw new ArgumentException
							(_("Arg_InsufficientSpace"), "bytes");
					}
					bytes[posn++] = (byte)(0xE0 | (left >> 12));
					bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
					bytes[posn++] = (byte)(0x80 | (left & 0x3F));
					left = 0;
				}
				leftOver = left;

				// Return the final count to the caller.
				return posn - byteIndex;
			}

	// Get the bytes that result from encoding a character buffer.
	// No state is carried in, and a trailing surrogate start is flushed.
	public override int GetBytes(char[] chars, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
			{
				uint leftOver = 0;
				return InternalGetBytes(chars, charIndex, charCount,
									    bytes, byteIndex, ref leftOver, true);
			}

	// Convenience wrappers for "GetBytes".
	//
	// Encodes "charCount" characters of "s", starting at "charIndex",
	// into "bytes" at "byteIndex", and returns the number of bytes
	// written.  The byte lengths produced here match those reported
	// by "GetByteCount(String)".
	//
	// Throws ArgumentNullException if "s" or "bytes" is null,
	// ArgumentOutOfRangeException if an index or count is out of range,
	// and ArgumentException if "bytes" is too small for the output.
	public override int GetBytes(String s, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
			{
				// Validate the parameters.
				if(s == null)
				{
					throw new ArgumentNullException("s");
				}
				if(bytes == null)
				{
					throw new ArgumentNullException("bytes");
				}
				if(charIndex < 0 || charIndex > s.Length)
				{
					throw new ArgumentOutOfRangeException
						("charIndex", _("ArgRange_StringIndex"));
				}
				if(charCount < 0 || charCount > (s.Length - charIndex))
				{
					throw new ArgumentOutOfRangeException
						("charCount", _("ArgRange_StringRange"));
				}
				if(byteIndex < 0 || byteIndex > bytes.Length)
				{
					throw new ArgumentOutOfRangeException
						("byteIndex", _("ArgRange_Array"));
				}

				// Convert the characters into bytes.
				char ch;
				int length = bytes.Length;
				uint pair;
				int posn = byteIndex;
				while(charCount > 0)
				{
					// Fetch the next UTF-16 character pair value.
					ch = s[charIndex++];
					--charCount;

					// "charCount" no longer includes the current
					// character, so "charCount > 0" means that at least
					// one more character of the requested range follows.
					// (Testing "> 1" here would miss a surrogate pair
					// that ends the range and emit it as two separate
					// 3-byte sequences.)
					if(ch >= '\uD800' && ch <= '\uDBFF' && charCount > 0)
					{
						// This may be the start of a surrogate pair.
						pair = (uint)(s[charIndex]);
						if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
						{
							pair = (pair - (uint)0xDC00) +
								   ((((uint)ch) - (uint)0xD800) << 10) +
								   (uint)0x10000;
							++charIndex;
							--charCount;
						}
						else
						{
							// Not a pair: encode the surrogate start
							// on its own and visit the next character
							// on the following iteration.
							pair = (uint)ch;
						}
					}
					else
					{
						pair = (uint)ch;
					}

					// Encode the character pair value.
					if(pair < (uint)0x0080)
					{
						if(posn >= length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)pair;
					}
					else if(pair < (uint)0x0800)
					{
						if((posn + 2) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xC0 | (pair >> 6));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else if(pair < (uint)0x10000)
					{
						if((posn + 3) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xE0 | (pair >> 12));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else
					{
						if((posn + 4) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xF0 | (pair >> 18));
						bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
				}

				// Return the final count to the caller.
				return posn - byteIndex;
			}

	// Encode a whole string into a new byte array by deferring to
	// the base class implementation.
	public override byte[] GetBytes(String s)
			{
				return base.GetBytes(s);
			}

	// Internal version of "GetCharCount" which can handle a rolling
	// state between multiple calls to this method.
private static int InternalGetCharCount(byte[] bytes, int index, int count,										    uint leftOverBits,										    uint leftOverCount,										    bool throwOnInvalid, bool flush)			{				// Validate the parameters.				if(bytes == null)				{					throw new ArgumentNullException("bytes");				}				if(index < 0 || index > bytes.Length)				{					throw new ArgumentOutOfRangeException						("index", _("ArgRange_Array"));				}				if(count < 0 || count > (bytes.Length - index))				{					throw new ArgumentOutOfRangeException						("count", _("ArgRange_Array"));				}				// Determine the number of characters that we have.				uint ch;				int length = 0;				uint leftBits = leftOverBits;				uint leftSoFar = (leftOverCount & (uint)0x0F);				uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);				while(count > 0)				{					ch = (uint)(bytes[index++]);					--count;					if(leftSize == 0)					{						// Process a UTF-8 start character.						if(ch < (uint)0x0080)						{							// Single-byte UTF-8 character.							++length;						}						else if((ch & (uint)0xE0) == (uint)0xC0)						{							// Double-byte UTF-8 character.							leftBits = (ch & (uint)0x1F);							leftSoFar = 1;							leftSize = 2;						}						else if((ch & (uint)0xF0) == (uint)0xE0)						{							// Three-byte UTF-8 character.							leftBits = (ch & (uint)0x0F);							leftSoFar = 1;							leftSize = 3;						}						else if((ch & (uint)0xF8) == (uint)0xF0)						{							// Four-byte UTF-8 character.							leftBits = (ch & (uint)0x07);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?