⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 identifyencoding.cs

📁 一)需求 很多情况下我们需要知道字节流的编码
💻 CS
📖 第 1 页 / 共 5 页
字号:
using System;

namespace Lion.Text
{
	#region Class IdentifyEncoding.....
	/// <summary>
	/// Detects the character encoding of a sequence of bytes.
	/// <seealso cref="System.IO.Stream"/>
	/// <seealso cref="System.Uri"/>
	/// <seealso cref="System.IO.FileInfo"/>
	/// </summary>
	/// <remarks>
	/// <![CDATA[
	/// <strong>IdentifyEncoding</strong> detects the encoding of content supplied as a <see cref="Uri"/>, a <see cref="System.IO.FileInfo"/>, or an <see cref="sbyte"/> array.
	/// Create By lion  <br/>
	/// 2005-02-21 22:00  <br/>
	///	Support .Net Framework v1.1.4322 <br/> 
	///	WebSite:www.lionsky.net(lion-a AT sohu.com) <br/> 
	/// ]]>
	/// </remarks>
	public class IdentifyEncoding
	{
		#region Fields.....

		// Frequency tables to hold the GB, Big5, and EUC-TW character
		// frequencies.
		// Only the outer (row) dimension is allocated here; the inner arrays are
		// null until something else fills them in (presumably
		// Initialize_Frequencies, which the constructor calls -- TODO confirm).
		// The probability methods index these tables as [row][column], where row
		// and column are derived from the lead and trail byte of a two-byte
		// character.
		internal static int[][] GBFreq = new int[94][];
		internal static int[][] GBKFreq = new int[126][];
		internal static int[][] Big5Freq = new int[94][];
		internal static int[][] EUC_TWFreq = new int[94][];

		// Display names returned by GetEncodingString. The position of each name
		// matches the index of the corresponding entry in that method's score
		// array; the last entry ("OTHER") is the fallback.
		internal static string[] nicename = new string[]
			{
				"GB2312", "GBK", "HZ", "Big5", "CNS 11643", 
				"ISO 2022CN", "UTF-8", "Unicode", "ASCII", "OTHER"
			};

		#endregion

		#region Methods.....

		/// <summary>
		/// Initializes a new instance of <see cref="IdentifyEncoding"/>.
		/// </summary>
		/// <remarks>
		/// Construction calls <c>Initialize_Frequencies</c> (defined elsewhere in
		/// this class), which is expected to populate the static frequency tables.
		/// </remarks>
		public IdentifyEncoding()
		{
			Initialize_Frequencies();
		}

		#region GetEncodingString.....
		/// <summary>
		/// Guesses the character encoding of the resource at the given <see cref="Uri"/>.
		/// </summary>
		/// <param name="testurl">The <see cref="Uri"/> whose content should be examined.</param>
		/// <returns>
		/// The name of the detected encoding: one of "GB2312", "GBK", "HZ", "Big5",
		/// "CNS 11643", "ISO 2022CN", "UTF-8", "Unicode", "ASCII" or "OTHER".
		/// </returns>
		/// <remarks>
		/// At most the first 1024 bytes of the response are sampled. Any exception
		/// raised while requesting or reading the resource is written to
		/// <see cref="System.Console.Error"/> and detection then proceeds on
		/// whatever part of the buffer was filled (the rest stays zero).
		/// </remarks>
		/// <example>
		/// The following example shows how to call <see cref="GetEncodingString"/>:
		/// <code>
		///		IdentifyEncoding ide = new IdentifyEncoding();
		///		Response.Write(ide.GetEncodingString(new Uri("http://china5.nikkeibp.co.jp/china/news/com/200307/pr_com200307170131.html")));		
		/// </code>
		/// </example>
		public virtual string GetEncodingString(System.Uri testurl)
		{
			sbyte[] rawtext = new sbyte[1024];
			int bytesread = 0, byteoffset = 0;
			try
			{
				// The using blocks release the response and its stream even when
				// a read throws; previously the stream was only closed on success
				// and the response was never disposed.
				using (System.Net.WebResponse response = System.Net.WebRequest.Create(testurl.AbsoluteUri).GetResponse())
				using (System.IO.Stream chinesestream = response.GetResponseStream())
				{
					// Keep reading until the sample buffer is full or the stream
					// stops yielding bytes.
					while (byteoffset < rawtext.Length
						&& (bytesread = ReadInput(chinesestream, ref rawtext, byteoffset, rawtext.Length - byteoffset)) > 0)
					{
						byteoffset += bytesread;
					}
				}
			}
			catch (System.Exception e)
			{
				// Best effort: report the failure and fall through to detection.
				System.Console.Error.WriteLine("Error loading or using URL " + e.ToString());
			}
			return GetEncodingString(rawtext);
		}

		/// <summary>
		/// Guesses the character encoding of the given file.
		/// </summary>
		/// <param name="testfile">The <see cref="System.IO.FileInfo"/> describing the file to examine.</param>
		/// <returns>
		/// The name of the detected encoding: one of "GB2312", "GBK", "HZ", "Big5",
		/// "CNS 11643", "ISO 2022CN", "UTF-8", "Unicode", "ASCII" or "OTHER".
		/// </returns>
		/// <remarks>
		/// The buffer is sized from <c>FileLength(testfile)</c> and the whole file is
		/// read into it. Any exception raised while opening or reading the file is
		/// written to <see cref="System.Console.Error"/> and detection then
		/// proceeds on whatever part of the buffer was filled.
		/// </remarks>
		/// <example>
		/// The following example shows how to call <see cref="GetEncodingString"/>:
		/// <code>
		///		IdentifyEncoding ide = new IdentifyEncoding();
		///		Response.Write(ide.GetEncodingString(new System.IO.FileInfo(@"C:\test.txt")));		
		/// </code>
		/// </example>
		public virtual string GetEncodingString(System.IO.FileInfo testfile)
		{
			sbyte[] rawtext = new sbyte[(int) FileLength(testfile)];
			int bytesread = 0, byteoffset = 0;
			try
			{
				// The using block closes the file handle on every path; the
				// stream was previously never closed.
				using (System.IO.FileStream chinesefile = new System.IO.FileStream(testfile.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read))
				{
					// A single read is not guaranteed to fill the buffer, so keep
					// reading until it is full or no more bytes arrive (same loop
					// shape as the Uri overload).
					while (byteoffset < rawtext.Length
						&& (bytesread = ReadInput(chinesefile, ref rawtext, byteoffset, rawtext.Length - byteoffset)) > 0)
					{
						byteoffset += bytesread;
					}
				}
			}
			catch (System.Exception e)
			{
				// Best effort: report the failure and fall through to detection.
				System.Console.Error.WriteLine("Error: " + e);
			}

			return GetEncodingString(rawtext);
		}


		/// <summary>
		/// Guesses the character encoding of the given <see cref="sbyte"/> array.
		/// </summary>
		/// <param name="rawtext">The raw bytes to examine.</param>
		/// <returns>
		/// The name of the best-scoring encoding: one of "GB2312", "GBK", "HZ",
		/// "Big5", "CNS 11643", "ISO 2022CN", "UTF-8", "Unicode" or "ASCII"; or
		/// "OTHER" when no candidate scores above 50.
		/// </returns>
		/// <remarks>
		/// Every candidate encoding is scored by its own probability method. The
		/// highest score wins; on a tie the candidate listed first is kept.
		/// </remarks>
		/// <example>
		/// The following example shows how to call <see cref="GetEncodingString"/>:
		/// <code>
		///		IdentifyEncoding ide = new IdentifyEncoding();
		///		Response.Write(ide.GetEncodingString(IdentifyEncoding.ToSByteArray(System.Text.Encoding.GetEncoding("gb2312").GetBytes("Lion互动网络(www.lionsky.net)"))));	
		/// </code>
		/// </example>
		public virtual string GetEncodingString(sbyte[] rawtext)
		{
			// Index of the "OTHER" entry in nicename.
			const int fallbackIndex = 9;

			// One score per candidate, in the same order as nicename.
			int[] scores = new int[]
				{
					GB2312Probability(rawtext),
					GBKProbability(rawtext),
					HZProbability(rawtext),
					BIG5Probability(rawtext),
					ENCTWProbability(rawtext),
					ISO2022CNProbability(rawtext),
					UTF8Probability(rawtext),
					UnicodeProbability(rawtext),
					ASCIIProbability(rawtext),
					0
				};

			// Pick the first candidate holding the strictly highest score.
			int winner = 0;
			int topScore = 0;
			for (int candidate = 0; candidate < scores.Length; candidate++)
			{
				if (scores[candidate] > topScore)
				{
					winner = candidate;
					topScore = scores[candidate];
				}
			}

			// Nothing scored above 50: report OTHER instead of a weak guess.
			return nicename[topScore <= 50 ? fallbackIndex : winner];
		}
		#endregion

		#region About Probability.....
		#region GB2312Probability
		/// <summary>
		/// Estimates how likely it is that the bytes are GB2312-encoded text.
		/// </summary>
		/// <param name="rawtext">The raw bytes to examine.</param>
		/// <returns>
		/// A score made of two halves: up to 50 for the share of high-bit byte
		/// pairs that fall inside the GB2312 lead/trail ranges, plus a
		/// frequency-weighted half taken from <c>GBFreq</c> (nominally 0 to 100
		/// overall).
		/// </returns>
		internal virtual int GB2312Probability(sbyte[] rawtext)
		{
			// Counters start at 1 (and the frequency ceiling at 1) so the
			// divisions below are always defined.
			int highBitLeads = 1;
			int inRangePairs = 1;
			long freqSum = 0;
			long freqCeiling = 1;

			// The last byte can never be a lead byte, so stop one short.
			int lastLead = rawtext.Length - 1;
			for (int pos = 0; pos < lastLead; pos++)
			{
				// Bytes without the high bit set are single-byte characters.
				if (rawtext[pos] >= 0)
				{
					continue;
				}

				highBitLeads++;
				sbyte lead = rawtext[pos];
				sbyte trail = rawtext[pos + 1];

				// Lead 0xA1-0xF7 and trail 0xA1-0xFE (compared as signed bytes).
				if ((sbyte) Identity(0xA1) <= lead && lead <= (sbyte) Identity(0xF7)
					&& (sbyte) Identity(0xA1) <= trail && trail <= (sbyte) Identity(0xFE))
				{
					inRangePairs++;
					freqCeiling += 500;

					// Adding 256 turns the negative signed byte back into 0x80-0xFF.
					int row = lead + 256 - 0xA1;
					int column = trail + 256 - 0xA1;

					int weight = GBFreq[row][column];
					if (weight != 0)
					{
						freqSum += weight;
					}
					else if (15 <= row && row < 55)
					{
						// No table entry, but the row lies in the 15-54 band:
						// grant a flat default weight.
						freqSum += 200;
					}
				}

				// Skip the trail byte of the pair just examined.
				pos++;
			}

			float rangeval = 50*((float) inRangePairs/(float) highBitLeads);
			float freqval = 50*((float) freqSum/(float) freqCeiling);

			return (int) (rangeval + freqval);
		}

		#endregion

		#region GBKProbability.....
		/// <summary>
		/// Estimates how likely it is that the bytes are GBK-encoded text.
		/// </summary>
		/// <param name="rawtext">The raw bytes to examine.</param>
		/// <returns>
		/// A score built like the GB2312 score (range half plus frequency half,
		/// nominally 0 to 100) and then reduced by 1, so that an otherwise equal
		/// GB2312 score ranks higher.
		/// </returns>
		internal virtual int GBKProbability(sbyte[] rawtext)
		{
			// Counters start at 1 (and the frequency ceiling at 1) so the
			// divisions below are always defined.
			int highBitLeads = 1;
			int inRangePairs = 1;
			long freqSum = 0;
			long freqCeiling = 1;

			// The last byte can never be a lead byte, so stop one short.
			int lastLead = rawtext.Length - 1;
			for (int pos = 0; pos < lastLead; pos++)
			{
				// Bytes without the high bit set are single-byte characters.
				if (rawtext[pos] >= 0)
				{
					continue;
				}

				highBitLeads++;
				sbyte lead = rawtext[pos];
				sbyte trail = rawtext[pos + 1];

				if ((sbyte) Identity(0xA1) <= lead && lead <= (sbyte) Identity(0xF7)
					&& (sbyte) Identity(0xA1) <= trail && trail <= (sbyte) Identity(0xFE))
				{
					// Plain GB2312 range: scored against GBFreq.
					inRangePairs++;
					freqCeiling += 500;

					// Adding 256 turns the negative signed byte back into 0x80-0xFF.
					int row = lead + 256 - 0xA1;
					int column = trail + 256 - 0xA1;

					int weight = GBFreq[row][column];
					if (weight != 0)
					{
						freqSum += weight;
					}
					else if (15 <= row && row < 55)
					{
						// No table entry, but the row lies in the 15-54 band:
						// grant a flat default weight.
						freqSum += 200;
					}
				}
				else if ((sbyte) Identity(0x81) <= lead && lead <= (sbyte) Identity(0xFE)
					&& (((sbyte) Identity(0x80) <= trail && trail <= (sbyte) Identity(0xFE))
						|| ((sbyte) 0x40 <= trail && trail <= (sbyte) 0x7E)))
				{
					// Extended GBK range: lead 0x81-0xFE, trail 0x40-0x7E or
					// 0x80-0xFE; scored against GBKFreq.
					inRangePairs++;
					freqCeiling += 500;

					int row = lead + 256 - 0x81;

					// NOTE(review): both trail ranges map to columns starting at
					// 0, so they share GBKFreq cells -- confirm this matches the
					// layout of the table.
					int column = (0x40 <= trail && trail <= 0x7E)
						? trail - 0x40
						: trail + 256 - 0x80;

					int weight = GBKFreq[row][column];
					if (weight != 0)
					{
						freqSum += weight;
					}
				}

				// Skip the trail byte of the pair just examined.
				pos++;
			}

			float rangeval = 50*((float) inRangePairs/(float) highBitLeads);
			float freqval = 50*((float) freqSum/(float) freqCeiling);

			return (int) (rangeval + freqval) - 1;
		}

		#endregion

		#region HZProbability.....
		/// <summary>
		/// Estimates how likely it is that the bytes are HZ-encoded text.
		/// </summary>
		/// <param name="rawtext">The raw bytes to examine.</param>
		/// <returns>
		/// A score made of two halves: a range half that depends only on how many
		/// <c>~{</c> shift-in sequences were seen (0, 39, 41 or 50), plus a
		/// frequency-weighted half taken from <c>GBFreq</c> for the byte pairs
		/// found inside the shifted sections (nominally 0 to 100 overall).
		/// </returns>
		/// <remarks>
		/// A trailing <c>~</c> with no following byte is ignored rather than
		/// causing an out-of-range read.
		/// </remarks>
		internal virtual int HZProbability(sbyte[] rawtext)
		{
			int i, rawtextlen;
			long hzfreq = 0, totalfreq = 1;
			float rangeval = 0, freqval = 0;
			int hzstart = 0;
			int row, column;

			rawtextlen = rawtext.Length;

			// Every '~' is interpreted together with the byte that follows it,
			// so the scan stops one short of the end. (Scanning to the very last
			// byte used to index past the array when the input ended in '~'.)
			for (i = 0; i < rawtextlen - 1; i++)
			{
				if (rawtext[i] != '~')
				{
					continue;
				}

				if (rawtext[i + 1] == '{')
				{
					// "~{" switches to two-byte mode; consume pairs until "~}",
					// a line break, or the end of the data.
					hzstart++;
					i += 2;
					while (i < rawtextlen - 1)
					{
						if (rawtext[i] == 0x0A || rawtext[i] == 0x0D)
						{
							break;
						}
						else if (rawtext[i] == '~' && rawtext[i + 1] == '}')
						{
							// "~}" switches back to single-byte mode.
							i++;
							break;
						}
						else if ((0x21 <= rawtext[i] && rawtext[i] <= 0x77) && (0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77))
						{
							// 7-bit pair: same table as GB2312 with the high bit stripped.
							row = rawtext[i] - 0x21;
							column = rawtext[i + 1] - 0x21;
							totalfreq += 500;
							if (GBFreq[row][column] != 0)
							{
								hzfreq += GBFreq[row][column];
							}
							else if (15 <= row && row < 55)
							{
								hzfreq += 200;
							}
						}
						else if ((unchecked((sbyte) 0xA1) <= rawtext[i] && rawtext[i] <= unchecked((sbyte) 0xF7)) && (unchecked((sbyte) 0xA1) <= rawtext[i + 1] && rawtext[i + 1] <= unchecked((sbyte) 0xF7)))
						{
							// 8-bit pair inside a shifted section. The bounds must
							// be signed bytes: comparing an sbyte against
							// (byte) 0xA1 (= 161) can never succeed, which made
							// this branch unreachable.
							row = rawtext[i] + 256 - 0xA1;
							column = rawtext[i + 1] + 256 - 0xA1;
							totalfreq += 500;
							if (GBFreq[row][column] != 0)
							{
								hzfreq += GBFreq[row][column];
							}
							else if (15 <= row && row < 55)
							{
								hzfreq += 200;
							}
						}
						i += 2;
					}
				}
				else if (rawtext[i + 1] == '}' || rawtext[i + 1] == '~')
				{
					// Stray "~}" or escaped "~~": skip the second byte.
					i++;
				}
			}

			if (hzstart > 4)
			{
				rangeval = 50;
			}
			else if (hzstart > 1)
			{
				rangeval = 41;
			}
			else if (hzstart > 0)
			{
				// Only 39 in case the sequence happened to occur
				// in otherwise non-Hz text
				rangeval = 39;
			}
			else
			{
				rangeval = 0;
			}
			freqval = 50*((float) hzfreq/(float) totalfreq);

			return (int) (rangeval + freqval);
		}

		#endregion

		#region BIG5Probability.....
		/// <summary>
		/// Estimates how likely it is that the bytes are Big5-encoded text.
		/// </summary>
		/// <param name="rawtext">The raw bytes to examine.</param>
		/// <returns>
		/// A score made of two halves: up to 50 for the share of high-bit byte
		/// pairs that fall inside the Big5 lead/trail ranges, plus a
		/// frequency-weighted half taken from <c>Big5Freq</c> (nominally 0 to 100
		/// overall).
		/// </returns>
		internal virtual int BIG5Probability(sbyte[] rawtext)
		{
			// Counters start at 1 (and the frequency ceiling at 1) so the
			// divisions below are always defined.
			int highBitLeads = 1;
			int inRangePairs = 1;
			long freqSum = 0;
			long freqCeiling = 1;

			// The last byte can never be a lead byte, so stop one short.
			int lastLead = rawtext.Length - 1;
			for (int pos = 0; pos < lastLead; pos++)
			{
				// Bytes without the high bit set are single-byte characters.
				if (rawtext[pos] >= 0)
				{
					continue;
				}

				highBitLeads++;
				sbyte lead = rawtext[pos];
				sbyte trail = rawtext[pos + 1];

				// Lead 0xA1-0xF9; trail 0x40-0x7E or 0xA1-0xFE.
				if ((sbyte) Identity(0xA1) <= lead && lead <= (sbyte) Identity(0xF9)
					&& (((sbyte) 0x40 <= trail && trail <= (sbyte) 0x7E)
						|| ((sbyte) Identity(0xA1) <= trail && trail <= (sbyte) Identity(0xFE))))
				{
					inRangePairs++;
					freqCeiling += 500;

					// Adding 256 turns the negative signed byte back into 0x80-0xFF.
					int row = lead + 256 - 0xA1;

					// Low trail bytes map to columns 0-62; high trail bytes
					// continue after them (0xA1 - 0x61 = 64).
					int column = (0x40 <= trail && trail <= 0x7E)
						? trail - 0x40
						: trail + 256 - 0x61;

					int weight = Big5Freq[row][column];
					if (weight != 0)
					{
						freqSum += weight;
					}
					else if (3 <= row && row <= 37)
					{
						// No table entry, but the row lies in the 3-37 band:
						// grant a flat default weight.
						freqSum += 200;
					}
				}

				// Skip the trail byte of the pair just examined.
				pos++;
			}

			float rangeval = 50*((float) inRangePairs/(float) highBitLeads);
			float freqval = 50*((float) freqSum/(float) freqCeiling);

			return (int) (rangeval + freqval);
		}

		#endregion

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -