⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 latin1prober.cpp

📁 判断一串字符是属于什么字符集的程序
💻 CPP
字号:
/*
    libcharguess  -  Guess the encoding/charset of a string
    Copyright (C) 2003  Stephane Corbe <noubi@users.sourceforge.net>
    Based on Mozilla sources

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "Latin1Prober.h"
#include "types.h"

// Character classes used to index the transition model below.
#define UDF    0        // undefined
#define OTH    1        // other
#define ASC    2        // ascii capital letter
#define ASS    3        // ascii small letter
#define ACV    4        // accent capital vowel
#define ACO    5        // accent capital other
#define ASV    6        // accent small vowel
#define ASO    7        // accent small other
#define CLASS_NUM   8    // total classes

// Maps every byte value (0x00 - 0xFF) to one of the classes above.
static unsigned char Latin1_CharToClass[] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
};

/* Likelihood of seeing class <column> right after class <row>:
   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely
   The table is indexed as [previousClass * CLASS_NUM + currentClass].
*/
static unsigned char Latin1ClassModel[] =
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};

// Puts the prober back into its initial state: detecting, previous
// character class OTH, and every frequency counter cleared.
void  nsLatin1Prober::Reset(void)
{
  mState = eDetecting;
  mLastCharClass = OTH;
  for (int i = 0; i < FREQ_CAT_NUM; i++)
    mFreqCounter[i] = 0;
}

// Copies the "words" of aBuf into a freshly allocated buffer.
//
// A word is a run of bytes that are either ASCII letters (A-Z, a-z) or have
// the high bit set. Every other byte is a delimiter. Each word that is
// followed by a delimiter is written out followed by a single space; a word
// that reaches the end of aBuf is written out without a trailing space.
//
//   aBuf, aLen  input bytes
//   newBuf      receives the PR_MALLOC'ed output buffer (aLen bytes);
//               set to null when the allocation fails
//   newLen      receives the number of bytes written to *newBuf;
//               left untouched when the allocation fails
//
// Returns PR_FALSE if the allocation failed, PR_TRUE otherwise. On success
// the caller owns *newBuf and has to release it.
PRBool nsLatin1Prober::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  //do filtering to reduce load to probers
  char *newptr;
  const char *prevPtr, *curPtr;
  const char *endPtr = aBuf + aLen;
  PRBool isInTag = PR_FALSE;   // tag tracking is disabled below, so this stays false

  // The output can never be longer than the input: every word is copied
  // as-is and the delimiter that ended it is replaced by one space.
  newptr = *newBuf = (char*)PR_MALLOC(aLen);
  if (!newptr)
    return PR_FALSE;

  for (curPtr = prevPtr = aBuf; curPtr < endPtr; curPtr++)
  {
/*
    if (*curPtr == '>')
      isInTag = PR_FALSE;
    else if (*curPtr == '<')
      isInTag = PR_TRUE;
*/
    // Delimiter: high bit clear and not an ASCII letter.
    if (!(*curPtr & 0x80) &&
        (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    {
      if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
                                         // and it is not inside a tag, keep it
      {
        while (prevPtr < curPtr)
          *newptr++ = *prevPtr++;
        prevPtr++;                       // step over the delimiter itself
        *newptr++ = ' ';
      }
      else
        prevPtr = curPtr+1;
    }
  }

  // Keep the last word too when the buffer does not end with a delimiter;
  // without this, trailing text (or a buffer made of a single word) would be
  // silently discarded.
  if (!isInTag)
    while (prevPtr < curPtr)
      *newptr++ = *prevPtr++;

  newLen = newptr - *newBuf;
  return PR_TRUE;
}

// Feeds aLen bytes from aBuf to the prober.
//
// The input is first reduced with FilterWithEnglishLetters(); if that fails
// the raw input is analysed instead. Each byte is mapped to its character
// class, and the likelihood of the (previous class, current class) pair is
// looked up in Latin1ClassModel. An illegal pair (likelihood 0) switches the
// prober to eNotMe and stops the scan; otherwise the matching frequency
// counter is incremented.
//
// Returns the prober state after processing.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *newBuf1 = 0;
  PRUint32 newLen1 = 0;

  if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
    // Filtering failed: analyse the caller's buffer directly (read-only).
    newBuf1 = (char*)aBuf;
    newLen1 = aLen;
  }

  unsigned char charClass;
  unsigned char freq;
  for (PRUint32 i = 0; i < newLen1; i++)
  {
    charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
    freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
    if (freq == 0) {
      mState = eNotMe;
      break;
    }
    mFreqCounter[freq]++;
    mLastCharClass = charClass;
  }

  // Only release the buffer if it is the filtered copy, never the caller's.
  if (newBuf1 != aBuf)
    PR_FREEIF(newBuf1);

  return mState;
}

// Returns the confidence that the data seen so far is Latin-1.
//
// 0.01 once the prober has ruled itself out. Otherwise the share of
// "very likely" transitions, minus 20 times the share of "very unlikely"
// ones, clamped at 0 and then halved. 0 when nothing has been counted yet.
float nsLatin1Prober::GetConfidence(void)
{
  if (mState == eNotMe)
    return 0.01f;

  float confidence;
  PRUint32 total = 0;
  for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
    total += mFreqCounter[i];

  if (!total)
  {
    // No data yet: avoid 0/0, which would yield NaN and slip through the
    // "< 0" clamp below.
    confidence = 0.0f;
  }
  else
  {
    confidence = mFreqCounter[3]*1.0f / total;
    confidence -= mFreqCounter[1]*20.0f/total;
  }

  if (confidence < 0.0f)
    confidence = 0.0f;

  // lower the confidence of latin1 so that other more accurate detector
  // can take priority.
  confidence *= 0.50f;

  return confidence;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -