📄 identifyencoding.cs
字号:
#region ENCTWProbability.....
/// <summary>
/// Scores how likely it is that the given bytes are text in the CNS 11643 (Taiwan) encoding.
/// </summary>
/// <remarks>
/// Half of the score reflects how many high-bit characters fall inside the accepted byte
/// ranges; the other half reflects the EUC_TWFreq weights of the two-byte characters found.
/// </remarks>
/// <param name="rawtext">The <see cref="sbyte"/> array to examine.</param>
/// <returns>A likelihood score, nominally between 0 and 100.</returns>
internal virtual int ENCTWProbability(sbyte[] rawtext)
{
    int length = rawtext.Length;

    // Counters start at 1 so the ratios computed at the end never divide by zero.
    int highBitChars = 1;
    int acceptedChars = 1;
    long observedFreq = 0;
    long possibleFreq = 1;

    // The last byte is never examined as a lead byte, so pos + 1 is always a valid index.
    int pos = 0;
    while (pos < length - 1)
    {
        if (rawtext[pos] >= 0)
        {
            // ASCII range: contributes nothing to the score.
            pos++;
            continue;
        }

        // High bit set.
        highBitChars++;

        if (pos + 3 < length
            && (sbyte) Identity(0x8E) == rawtext[pos]
            && (sbyte) Identity(0xA1) <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte) Identity(0xB0)
            && (sbyte) Identity(0xA1) <= rawtext[pos + 2] && rawtext[pos + 2] <= (sbyte) Identity(0xFE)
            && (sbyte) Identity(0xA1) <= rawtext[pos + 3] && rawtext[pos + 3] <= (sbyte) Identity(0xFE))
        {
            // Four-byte form (planes 1 - 16): these are all less frequent characters,
            // so no frequency weight is added.
            acceptedChars++;
            pos += 4;
            continue;
        }

        if ((sbyte) Identity(0xA1) <= rawtext[pos] && rawtext[pos] <= (sbyte) Identity(0xFE)
            && (sbyte) Identity(0xA1) <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte) Identity(0xFE))
        {
            // Two-byte form: weight it with the frequency table.
            acceptedChars++;
            possibleFreq += 500;

            // Signed bytes are negative here; adding 256 recovers the unsigned value.
            int row = rawtext[pos] + 256 - 0xA1;
            int column = rawtext[pos + 1] + 256 - 0xA1;

            long weight = EUC_TWFreq[row][column];
            if (weight != 0)
            {
                observedFreq += weight;
            }
            else if (35 <= row && row <= 92)
            {
                // No table entry, but the row lies in the band that still earns a fixed weight.
                observedFreq += 150;
            }

            pos += 2;
            continue;
        }

        // High-bit byte that matches neither form.
        pos++;
    }

    float rangeScore = 50*((float) acceptedChars/(float) highBitChars);
    float freqScore = 50*((float) observedFreq/(float) possibleFreq);
    return (int) (rangeScore + freqScore);
}
#endregion
#region ISO2022CNProbability.....
/// <summary>
/// Scores how likely it is that the given bytes are ISO-2022-CN encoded text.
/// </summary>
/// <remarks>
/// Looks for the escape sequences ESC $ ) A (GB) and ESC $ ) G (CNS). The bytes that follow
/// such a sequence, up to the next ESC or the end of the input, are read as two-byte
/// characters and weighted with GBFreq or EUC_TWFreq respectively. Because the run also ends
/// at the end of the input, a missing closing ESC cannot cause an out-of-range read.
/// </remarks>
/// <param name="rawtext">The <see cref="sbyte"/> array to examine.</param>
/// <returns>A likelihood score, nominally between 0 and 100.</returns>
internal virtual int ISO2022CNProbability(sbyte[] rawtext)
{
    int i;
    int rawtextlen = rawtext.Length;

    // Counters start at 1 so the ratios computed at the end never divide by zero.
    int dbchars = 1, isochars = 1;
    long isofreq = 0, totalfreq = 1;
    int row, column;

    for (i = 0; i < rawtextlen - 1; i++)
    {
        // Only an ESC followed by at least three more bytes can start a designation.
        if (rawtext[i] != (sbyte) 0x1B || i + 3 >= rawtextlen)
        {
            continue;
        }

        // Both designations share the prefix ESC $ ) and differ in the final byte.
        bool designation = rawtext[i + 1] == (sbyte) 0x24 && rawtext[i + 2] == (sbyte) 0x29;

        if (designation && rawtext[i + 3] == (sbyte) 0x41)
        {
            // GB Escape $ ) A
            i += 4;
            while (i < rawtextlen && rawtext[i] != (sbyte) 0x1B)
            {
                dbchars++;
                // A pair needs a second byte; a lone byte at the end of the input is
                // counted in dbchars but cannot be an accepted character.
                if (i + 1 < rawtextlen
                    && 0x21 <= rawtext[i] && rawtext[i] <= 0x77
                    && 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77)
                {
                    isochars++;
                    row = rawtext[i] - 0x21;
                    column = rawtext[i + 1] - 0x21;
                    totalfreq += 500;
                    int weight = GBFreq[row][column];
                    if (weight != 0)
                    {
                        isofreq += weight;
                    }
                    else if (15 <= row && row < 55)
                    {
                        // No table entry, but the row lies in the band that earns a fixed weight.
                        isofreq += 200;
                    }
                    i++;
                }
                i++;
            }
        }
        else if (designation && rawtext[i + 3] == (sbyte) 0x47)
        {
            // CNS Escape $ ) G
            i += 4;
            while (i < rawtextlen && rawtext[i] != (sbyte) 0x1B)
            {
                dbchars++;
                if (i + 1 < rawtextlen
                    && (sbyte) 0x21 <= rawtext[i] && rawtext[i] <= (sbyte) 0x7E
                    && (sbyte) 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= (sbyte) 0x7E)
                {
                    isochars++;
                    totalfreq += 500;
                    row = rawtext[i] - 0x21;
                    column = rawtext[i + 1] - 0x21;
                    long weight = EUC_TWFreq[row][column];
                    if (weight != 0)
                    {
                        isofreq += weight;
                    }
                    else if (35 <= row && row <= 92)
                    {
                        isofreq += 150;
                    }
                    i++;
                }
                i++;
            }
        }

        // ASCII: ESC ( B -- step over the sequence. The bounds test comes first because
        // i may equal rawtextlen when a run above reached the end of the input.
        if (i + 2 < rawtextlen
            && rawtext[i] == (sbyte) 0x1B
            && rawtext[i + 1] == (sbyte) 0x28
            && rawtext[i + 2] == (sbyte) 0x42)
        {
            i += 2;
        }
    }

    float rangeval = 50*((float) isochars/(float) dbchars);
    float freqval = 50*((float) isofreq/(float) totalfreq);
    return (int) (rangeval + freqval);
}
#endregion
#region UTF8Probability.....
/// <summary>
/// Scores how likely it is that the given bytes are UTF-8 encoded text.
/// </summary>
/// <remarks>
/// Counts the bytes that belong to well-formed two-, three- and four-byte sequences (a lead
/// byte followed by the right number of continuation bytes) and compares that count with the
/// number of non-ASCII bytes. ASCII bytes are ignored. Scores that are not high enough are
/// reduced to 0 to prevent coincidental matches.
/// </remarks>
/// <param name="rawtext">The <see cref="sbyte"/> array to examine.</param>
/// <returns>
/// 0 when the input is empty or pure ASCII, or when the score is too low; otherwise the
/// percentage (up to 100) of non-ASCII bytes that are part of a well-formed sequence.
/// </returns>
internal virtual int UTF8Probability(sbyte[] rawtext)
{
    int rawtextlen = rawtext.Length;
    int goodbytes = 0, asciibytes = 0;

    // Maybe also use UTF8 Byte Order Mark: EF BB BF
    for (int i = 0; i < rawtextlen; i++)
    {
        sbyte lead = rawtext[i];
        if (lead >= 0)
        {
            // One byte (ASCII): counted separately so it does not dilute the ratio.
            asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        // The comparisons use the signed view of the byte values.
        int trailing;
        if (-64 <= lead && lead <= -33)
        {
            trailing = 1; // lead 0xC0..0xDF
        }
        else if (-32 <= lead && lead <= -17)
        {
            trailing = 2; // lead 0xE0..0xEF
        }
        else if (-16 <= lead && lead <= -12)
        {
            trailing = 3; // lead 0xF0..0xF4 (code points above U+FFFF)
        }
        else
        {
            // Stray continuation byte or a value that never starts a sequence.
            continue;
        }

        if (i + trailing >= rawtextlen)
        {
            // Sequence is cut off by the end of the input.
            continue;
        }

        // Continuation bytes are 0x80..0xBF, i.e. -128..-65 as sbyte.
        bool wellFormed = true;
        for (int k = 1; k <= trailing; k++)
        {
            if (rawtext[i + k] > -65)
            {
                wellFormed = false;
                break;
            }
        }

        if (wellFormed)
        {
            goodbytes += trailing + 1;
            i += trailing;
        }
    }

    if (asciibytes == rawtextlen)
    {
        return 0;
    }

    int score = (int) (100*((float) goodbytes/(float) (rawtextlen - asciibytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    // Allows for some (few) badly formed sequences when there is enough evidence.
    if (score > 98)
    {
        return score;
    }
    if (score > 95 && goodbytes > 30)
    {
        return score;
    }
    return 0;
}
#endregion
#region UnicodeProbability.....
/// <summary>
/// Scores how likely it is that the given bytes are Unicode text, based only on the first
/// two bytes.
/// </summary>
/// <param name="rawtext">The <see cref="sbyte"/> array to examine.</param>
/// <returns>
/// 100 when the input starts with the byte pair FE FF or FF FE (assuming Identity returns
/// its argument unchanged); 0 otherwise, including when the input has fewer than two bytes.
/// </returns>
internal virtual int UnicodeProbability(sbyte[] rawtext)
{
    // The marker is two bytes long; shorter input cannot contain it and must not be indexed.
    if (rawtext.Length < 2)
    {
        return 0;
    }

    if (((sbyte) Identity(0xFE) == rawtext[0] && (sbyte) Identity(0xFF) == rawtext[1])
        || ((sbyte) Identity(0xFF) == rawtext[0] && (sbyte) Identity(0xFE) == rawtext[1]))
    {
        return 100;
    }
    return 0;
}
#endregion
#region ASCIIProbability.....
/// <summary>
/// Scores how likely it is that the given bytes are plain ASCII text.
/// </summary>
/// <remarks>
/// Starts from 70 and subtracts 5 for every byte with the high bit set and for every ESC
/// byte (0x1B, used by ISO 2022). The score never drops below 0.
/// </remarks>
/// <param name="rawtext">The <see cref="sbyte"/> array to examine.</param>
/// <returns>A likelihood score between 0 and 70.</returns>
internal virtual int ASCIIProbability(sbyte[] rawtext)
{
    int score = 70;
    for (int i = 0; i < rawtext.Length; i++)
    {
        // High-bit bytes are not ASCII; ESC is used by ISO 2022.
        if (rawtext[i] < 0 || rawtext[i] == (sbyte) 0x1B)
        {
            score -= 5;
            if (score <= 0)
            {
                // Floor reached: further penalties cannot change the result.
                return 0;
            }
        }
    }
    return score;
}
#endregion
#endregion
#region Initialize_Frequencies.....
/// <summary>
/// 初始化必要的条件
/// </summary>
internal virtual void Initialize_Frequencies()
{
int i;
if(GBFreq[0] == null)
{
for (i = 0; i < 94; i++)
{
GBFreq[i] = new int[94];
}
#region GBFreq[20][35] = 599;
GBFreq[49][26] = 598;
GBFreq[41][38] = 597;
GBFreq[17][26] = 596;
GBFreq[32][42] = 595;
GBFreq[39][42] = 594;
GBFreq[45][49] = 593;
GBFreq[51][57] = 592;
GBFreq[50][47] = 591;
GBFreq[42][90] = 590;
GBFreq[52][65] = 589;
GBFreq[53][47] = 588;
GBFreq[19][82] = 587;
GBFreq[31][19] = 586;
GBFreq[40][46] = 585;
GBFreq[24][89] = 584;
GBFreq[23][85] = 583;
GBFreq[20][28] = 582;
GBFreq[42][20] = 581;
GBFreq[34][38] = 580;
GBFreq[45][9] = 579;
GBFreq[54][50] = 578;
GBFreq[25][44] = 577;
GBFreq[35][66] = 576;
GBFreq[20][55] = 575;
GBFreq[18][85] = 574;
GBFreq[20][31] = 573;
GBFreq[49][17] = 572;
GBFreq[41][16] = 571;
GBFreq[35][73] = 570;
GBFreq[20][34] = 569;
GBFreq[29][44] = 568;
GBFreq[35][38] = 567;
GBFreq[49][9] = 566;
GBFreq[46][33] = 565;
GBFreq[49][51] = 564;
GBFreq[40][89] = 563;
GBFreq[26][64] = 562;
GBFreq[54][51] = 561;
GBFreq[54][36] = 560;
GBFreq[39][4] = 559;
GBFreq[53][13] = 558;
GBFreq[24][92] = 557;
GBFreq[27][49] = 556;
GBFreq[48][6] = 555;
GBFreq[21][51] = 554;
GBFreq[30][40] = 553;
GBFreq[42][92] = 552;
GBFreq[31][78] = 551;
GBFreq[25][82] = 550;
GBFreq[47][0] = 549;
GBFreq[34][19] = 548;
GBFreq[47][35] = 547;
GBFreq[21][63] = 546;
GBFreq[43][75] = 545;
GBFreq[21][87] = 544;
GBFreq[35][59] = 543;
GBFreq[25][34] = 542;
GBFreq[21][27] = 541;
GBFreq[39][26] = 540;
GBFreq[34][26] = 539;
GBFreq[39][52] = 538;
GBFreq[50][57] = 537;
GBFreq[37][79] = 536;
GBFreq[26][24] = 535;
GBFreq[22][1] = 534;
GBFreq[18][40] = 533;
GBFreq[41][33] = 532;
GBFreq[53][26] = 531;
GBFreq[54][86] = 530;
GBFreq[20][16] = 529;
GBFreq[46][74] = 528;
GBFreq[30][19] = 527;
GBFreq[45][35] = 526;
GBFreq[45][61] = 525;
GBFreq[30][9] = 524;
GBFreq[41][53] = 523;
GBFreq[41][13] = 522;
GBFreq[50][34] = 521;
GBFreq[53][86] = 520;
GBFreq[47][47] = 519;
GBFreq[22][28] = 518;
GBFreq[50][53] = 517;
GBFreq[39][70] = 516;
GBFreq[38][15] = 515;
GBFreq[42][88] = 514;
GBFreq[16][29] = 513;
GBFreq[27][90] = 512;
GBFreq[29][12] = 511;
GBFreq[44][22] = 510;
GBFreq[34][69] = 509;
GBFreq[24][10] = 508;
GBFreq[44][11] = 507;
GBFreq[39][92] = 506;
GBFreq[49][48] = 505;
GBFreq[31][46] = 504;
GBFreq[19][50] = 503;
GBFreq[21][14] = 502;
GBFreq[32][28] = 501;
GBFreq[18][3] = 500;
GBFreq[53][9] = 499;
GBFreq[34][80] = 498;
GBFreq[48][88] = 497;
GBFreq[46][53] = 496;
GBFreq[22][53] = 495;
GBFreq[28][10] = 494;
GBFreq[44][65] = 493;
GBFreq[20][10] = 492;
GBFreq[40][76] = 491;
GBFreq[47][8] = 490;
GBFreq[50][74] = 489;
GBFreq[23][62] = 488;
GBFreq[49][65] = 487;
GBFreq[28][87] = 486;
GBFreq[15][48] = 485;
GBFreq[22][7] = 484;
GBFreq[19][42] = 483;
GBFreq[41][20] = 482;
GBFreq[26][55] = 481;
GBFreq[21][93] = 480;
GBFreq[31][76] = 479;
GBFreq[34][31] = 478;
GBFreq[20][66] = 477;
GBFreq[51][33] = 476;
GBFreq[34][86] = 475;
GBFreq[37][67] = 474;
GBFreq[53][53] = 473;
GBFreq[40][88] = 472;
GBFreq[39][10] = 471;
GBFreq[24][3] = 470;
GBFreq[27][25] = 469;
GBFreq[26][15] = 468;
GBFreq[21][88] = 467;
GBFreq[52][62] = 466;
GBFreq[46][81] = 465;
GBFreq[38][72] = 464;
GBFreq[17][30] = 463;
GBFreq[52][92] = 462;
GBFreq[34][90] = 461;
GBFreq[21][7] = 460;
GBFreq[36][13] = 459;
GBFreq[45][41] = 458;
GBFreq[32][5] = 457;
GBFreq[26][89] = 456;
GBFreq[23][87] = 455;
GBFreq[20][39] = 454;
GBFreq[27][23] = 453;
GBFreq[25][59] = 452;
GBFreq[49][20] = 451;
GBFreq[54][77] = 450;
GBFreq[27][67] = 449;
GBFreq[47][33] = 448;
GBFreq[41][17] = 447;
GBFreq[19][81] = 446;
GBFreq[16][66] = 445;
GBFreq[45][26] = 444;
GBFreq[49][81] = 443;
GBFreq[53][55] = 442;
GBFreq[16][26] = 441;
GBFreq[54][62] = 440;
GBFreq[20][70] = 439;
GBFreq[42][35] = 438;
GBFreq[20][57] = 437;
GBFreq[34][36] = 436;
GBFreq[46][63] = 435;
GBFreq[19][45] = 434;
GBFreq[21][10] = 433;
GBFreq[52][93] = 432;
GBFreq[25][2] = 431;
GBFreq[30][57] = 430;
GBFreq[41][24] = 429;
GBFreq[28][43] = 428;
GBFreq[45][86] = 427;
GBFreq[51][56] = 426;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -