utf8encoding.cs
来自「没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没的 没」· CS 代码 · 共 1,015 行 · 第 1/2 页
CS
1,015 行
/*
 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
 *
 * Copyright (C) 2001, 2002  Southern Storm Software, Pty Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

namespace System.Text
{

using System;

[Serializable]
public class UTF8Encoding : Encoding
{
	// Magic number used by Windows for UTF-8.
	internal const int UTF8_CODE_PAGE = 65001;

	// Internal state.
	private bool emitIdentifier;
	private bool throwOnInvalid;

	// Constructors.  The shorter forms default the omitted flags to "false".
	public UTF8Encoding() : this(false, false) {}
	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
			: this(encoderShouldEmitUTF8Identifier, false) {}
	public UTF8Encoding(bool encoderShouldEmitUTF8Identifier,
						bool throwOnInvalidBytes)
			: base(UTF8_CODE_PAGE)
			{
				emitIdentifier = encoderShouldEmitUTF8Identifier;
				throwOnInvalid = throwOnInvalidBytes;
			}

	// Internal version of "GetByteCount" which can handle a rolling
	// state between multiple calls to this method.
	//
	// "leftOver" is a surrogate start carried over from a previous call
	// (zero if there is none).  When "flush" is true, a surrogate start
	// that is still pending at the end of the buffer is counted as a
	// stand-alone three-byte sequence.
	//
	// Throws ArgumentNullException if "chars" is null, and
	// ArgumentOutOfRangeException if "index"/"count" do not describe
	// a valid range within "chars".
	private static int InternalGetByteCount(char[] chars, int index,
											int count, uint leftOver,
											bool flush)
			{
				// Validate the parameters.
				if(chars == null)
				{
					throw new ArgumentNullException("chars");
				}
				if(index < 0 || index > chars.Length)
				{
					throw new ArgumentOutOfRangeException
						("index", _("ArgRange_Array"));
				}
				if(count < 0 || count > (chars.Length - index))
				{
					throw new ArgumentOutOfRangeException
						("count", _("ArgRange_Array"));
				}

				// Determine the lengths of all characters.
				char ch;
				int length = 0;
				uint pair = leftOver;
				while(count > 0)
				{
					ch = chars[index];
					if(pair == 0)
					{
						if(ch < '\u0080')
						{
							++length;
						}
						else if(ch < '\u0800')
						{
							length += 2;
						}
						else if(ch >= '\uD800' && ch <= '\uDBFF')
						{
							// This is the start of a surrogate pair.
							pair = (uint)ch;
						}
						else
						{
							length += 3;
						}
					}
					else if(ch >= '\uDC00' && ch <= '\uDFFF')
					{
						// We have a surrogate pair.
						length += 4;
						pair = 0;
					}
					else
					{
						// We have a surrogate start followed by a
						// regular character.  Technically, this is
						// invalid, but we have to do something.
						// We write out the surrogate start and then
						// re-visit the current character again.
						// ("continue" skips the index/count update below.)
						length += 3;
						pair = 0;
						continue;
					}
					++index;
					--count;
				}
				if(flush && pair != 0)
				{
					// Flush the left-over surrogate pair start.
					length += 3;
				}

				// Return the final length to the caller.
				return length;
			}

	// Get the number of bytes needed to encode a character buffer.
	// No state is carried in, and a trailing surrogate start is flushed.
	public override int GetByteCount(char[] chars, int index, int count)
			{
				return InternalGetByteCount(chars, index, count, 0, true);
			}

	// Convenience wrappers for "GetByteCount".
	//
	// Counts the bytes needed to encode the whole of "s".
	// Throws ArgumentNullException if "s" is null.
	public override int GetByteCount(String s)
			{
				// Validate the parameters.
				if(s == null)
				{
					throw new ArgumentNullException("s");
				}

				// Determine the lengths of all characters.
				char ch;
				int index = 0;
				int count = s.Length;
				int length = 0;
				uint pair;
				while(count > 0)
				{
					ch = s[index++];
					if(ch < '\u0080')
					{
						++length;
					}
					else if(ch < '\u0800')
					{
						length += 2;
					}
					else if(ch >= '\uD800' && ch <= '\uDBFF' && count > 1)
					{
						// This may be the start of a surrogate pair.
						// "count" still includes "ch" at this point, so
						// "count > 1" means another character follows.
						pair = (uint)(s[index]);
						if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
						{
							length += 4;
							++index;
							--count;
						}
						else
						{
							length += 3;
						}
					}
					else
					{
						length += 3;
					}
					--count;
				}

				// Return the final length to the caller.
				return length;
			}

	// Internal version of "GetBytes" which can handle a rolling
	// state between multiple calls to this method.
	//
	// "leftOver" carries a pending surrogate start in and out of the
	// call (zero if there is none).  When "flush" is true, a surrogate
	// start still pending at the end of the input is written out as a
	// stand-alone three-byte sequence and "leftOver" is reset to zero.
	//
	// Returns the number of bytes written to "bytes".  Throws
	// ArgumentNullException for null arrays, ArgumentOutOfRangeException
	// for invalid indices or counts, and ArgumentException if "bytes"
	// does not have enough room for the encoded output.
	private static int InternalGetBytes(char[] chars, int charIndex,
										int charCount, byte[] bytes,
										int byteIndex, ref uint leftOver,
										bool flush)
			{
				// Validate the parameters.
				if(chars == null)
				{
					throw new ArgumentNullException("chars");
				}
				if(bytes == null)
				{
					throw new ArgumentNullException("bytes");
				}
				if(charIndex < 0 || charIndex > chars.Length)
				{
					throw new ArgumentOutOfRangeException
						("charIndex", _("ArgRange_Array"));
				}
				if(charCount < 0 || charCount > (chars.Length - charIndex))
				{
					throw new ArgumentOutOfRangeException
						("charCount", _("ArgRange_Array"));
				}
				if(byteIndex < 0 || byteIndex > bytes.Length)
				{
					throw new ArgumentOutOfRangeException
						("byteIndex", _("ArgRange_Array"));
				}

				// Convert the characters into bytes.
				char ch;
				int length = bytes.Length;
				uint pair;
				uint left = leftOver;
				int posn = byteIndex;
				while(charCount > 0)
				{
					// Fetch the next UTF-16 character pair value.
					ch = chars[charIndex++];
					--charCount;
					if(left == 0)
					{
						if(ch >= '\uD800' && ch <= '\uDBFF')
						{
							// This is the start of a surrogate pair.
							left = (uint)ch;
							continue;
						}
						else
						{
							// This is a regular character.
							pair = (uint)ch;
						}
					}
					else if(ch >= '\uDC00' && ch <= '\uDFFF')
					{
						// We have a surrogate pair.
						pair = ((left - (uint)0xD800) << 10) +
							   (((uint)ch) - (uint)0xDC00) +
							   (uint)0x10000;
						left = 0;
					}
					else
					{
						// We have a surrogate start followed by a
						// regular character.  Technically, this is
						// invalid, but we have to do something.
						// We write out the surrogate start and then
						// re-visit the current character again.
						pair = (uint)left;
						left = 0;
						--charIndex;
						++charCount;
					}

					// Encode the character pair value.
					if(pair < (uint)0x0080)
					{
						if(posn >= length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)pair;
					}
					else if(pair < (uint)0x0800)
					{
						if((posn + 2) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xC0 | (pair >> 6));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else if(pair < (uint)0x10000)
					{
						if((posn + 3) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xE0 | (pair >> 12));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else
					{
						if((posn + 4) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xF0 | (pair >> 18));
						bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
				}
				if(flush && left != 0)
				{
					// Flush the left-over surrogate pair start.
					if((posn + 3) > length)
					{
						throw new ArgumentException
							(_("Arg_InsufficientSpace"), "bytes");
					}
					bytes[posn++] = (byte)(0xE0 | (left >> 12));
					bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
					bytes[posn++] = (byte)(0x80 | (left & 0x3F));
					left = 0;
				}
				leftOver = left;

				// Return the final count to the caller.
				return posn - byteIndex;
			}

	// Get the bytes that result from encoding a character buffer.
	// No state is carried in, and a trailing surrogate start is flushed.
	public override int GetBytes(char[] chars, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
			{
				uint leftOver = 0;
				return InternalGetBytes(chars, charIndex, charCount,
										bytes, byteIndex, ref leftOver, true);
			}

	// Convenience wrappers for "GetBytes".
	//
	// Encodes "charCount" characters of "s", starting at "charIndex",
	// into "bytes" starting at "byteIndex", and returns the number of
	// bytes written.  Throws ArgumentNullException for null arguments,
	// ArgumentOutOfRangeException for invalid indices or counts, and
	// ArgumentException if "bytes" does not have enough room.
	public override int GetBytes(String s, int charIndex, int charCount,
								 byte[] bytes, int byteIndex)
			{
				// Validate the parameters.
				if(s == null)
				{
					throw new ArgumentNullException("s");
				}
				if(bytes == null)
				{
					throw new ArgumentNullException("bytes");
				}
				if(charIndex < 0 || charIndex > s.Length)
				{
					throw new ArgumentOutOfRangeException
						("charIndex", _("ArgRange_StringIndex"));
				}
				if(charCount < 0 || charCount > (s.Length - charIndex))
				{
					throw new ArgumentOutOfRangeException
						("charCount", _("ArgRange_StringRange"));
				}
				if(byteIndex < 0 || byteIndex > bytes.Length)
				{
					throw new ArgumentOutOfRangeException
						("byteIndex", _("ArgRange_Array"));
				}

				// Convert the characters into bytes.
				char ch;
				int length = bytes.Length;
				uint pair;
				int posn = byteIndex;
				while(charCount > 0)
				{
					// Fetch the next UTF-16 character pair value.
					ch = s[charIndex++];
					--charCount;
					// "charCount" no longer includes "ch", so it is the
					// number of characters that remain after it.  A low
					// surrogate can follow whenever at least one remains;
					// testing for more than one would split a surrogate
					// pair that ends the range into two three-byte
					// sequences, disagreeing with "GetByteCount(String)".
					if(ch >= '\uD800' && ch <= '\uDBFF' && charCount > 0)
					{
						// This may be the start of a surrogate pair.
						pair = (uint)(s[charIndex]);
						if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
						{
							pair = (pair - (uint)0xDC00) +
								   ((((uint)ch) - (uint)0xD800) << 10) +
								   (uint)0x10000;
							++charIndex;
							--charCount;
						}
						else
						{
							pair = (uint)ch;
						}
					}
					else
					{
						pair = (uint)ch;
					}

					// Encode the character pair value.
					if(pair < (uint)0x0080)
					{
						if(posn >= length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)pair;
					}
					else if(pair < (uint)0x0800)
					{
						if((posn + 2) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xC0 | (pair >> 6));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else if(pair < (uint)0x10000)
					{
						if((posn + 3) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xE0 | (pair >> 12));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
					else
					{
						if((posn + 4) > length)
						{
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "bytes");
						}
						bytes[posn++] = (byte)(0xF0 | (pair >> 18));
						bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
						bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
						bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
					}
				}

				// Return the final count to the caller.
				return posn - byteIndex;
			}

	// Encode the whole of "s"; defers to the base class implementation.
	public override byte[] GetBytes(String s)
			{
				return base.GetBytes(s);
			}

	// Internal version of "GetCharCount" which can handle a rolling
	// state between multiple calls to this method.
	//
	// "leftOverCount" is unpacked below as two 4-bit fields: the low
	// four bits go to "leftSoFar" and the next four bits to "leftSize".
	private static int InternalGetCharCount(byte[] bytes, int index, int count,
											uint leftOverBits,
											uint leftOverCount,
											bool throwOnInvalid, bool flush)
			{
				// Validate the parameters.
				if(bytes == null)
				{
					throw new ArgumentNullException("bytes");
				}
				if(index < 0 || index > bytes.Length)
				{
					throw new ArgumentOutOfRangeException
						("index", _("ArgRange_Array"));
				}
				if(count < 0 || count > (bytes.Length - index))
				{
					throw new ArgumentOutOfRangeException
						("count", _("ArgRange_Array"));
				}

				// Determine the number of characters that we have.
				uint ch;
				int length = 0;
				uint leftBits = leftOverBits;
				uint leftSoFar = (leftOverCount & (uint)0x0F);
				uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
				while(count > 0)
				{
					ch = (uint)(bytes[index++]);
					--count;
					if(leftSize == 0)
					{
						// Process a UTF-8 start character.
						if(ch < (uint)0x0080)
						{
							// Single-byte UTF-8 character.
							++length;
						}
						else if((ch & (uint)0xE0) == (uint)0xC0)
						{
							// Double-byte UTF-8 character.
							leftBits = (ch & (uint)0x1F);
							leftSoFar = 1;
							leftSize = 2;
						}
						else if((ch & (uint)0xF0) == (uint)0xE0)
						{
							// Three-byte UTF-8 character.
							leftBits = (ch & (uint)0x0F);
							leftSoFar = 1;
							leftSize = 3;
						}
						else if((ch & (uint)0xF8) == (uint)0xF0)
						{
							// Four-byte UTF-8 character.
							leftBits = (ch & (uint)0x07);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?