⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf_functions.cpp

📁 utf8转换工具
💻 CPP
字号:
/*
* Written by Boby Thomas Pazheparampil. (March 2007.)
* Platform independent code.  (I hope so)
* Tested with Windows 2000, XP, Cygwin and Linux Debian.
*/

#include "utf_functions.h"

/*************************************************************************
* @f Fnct			: EatupWhiteSpace
* @p sInput			: String to trim. Taken by value, so the caller's
						string is not modified.
* @r Return			: Copy of sInput without leading and trailing blanks.
						Empty string when sInput is empty or consists only
						of blanks.
* Description       : Trims spaces, tabs and newlines (' ', '\t', '\n')
						from both ends of a string. Other characters, for
						example '\r', are left in place.
* @author			: Boby thomas
**************************************************************************/
string EatupWhiteSpace(string sInput)
{
	static const char acBlanks[] = " \t\n";

	// Positions stay in the string's own size type and are compared with
	// npos; squeezing them into an int can truncate them where size_type
	// is wider than int.
	const string::size_type iStart = sInput.find_first_not_of(acBlanks);
	if(iStart == string::npos) // No non-spaces
		return "";

	// A first non-blank character exists, so a last one exists as well.
	const string::size_type iEnd = sInput.find_last_not_of(acBlanks);

	return sInput.substr(iStart, iEnd - iStart + 1);
}

/*************************************************************************
* @f Fnct			: hexchar2binary
* @p c				: One hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
* @r Return			: Four character string of '0' and '1' holding the
						value of the digit, most significant bit first.
						"0000" for any character that is not a hex digit.
* Description       : Convert a hex character to a binary string.
* @author			: Boby thomas
**************************************************************************/

string hexchar2binary(char c)
{
	static const char * const apcNibble[16] =
	{
		"0000", "0001", "0010", "0011",
		"0100", "0101", "0110", "0111",
		"1000", "1001", "1010", "1011",
		"1100", "1101", "1110", "1111"
	};

	int iValue;
	if(c >= '0' && c <= '9')
		iValue = c - '0';
	else if(c >= 'a' && c <= 'f')
		iValue = c - 'a' + 10;
	else if(c >= 'A' && c <= 'F')	// upper case digits carry the same value
		iValue = c - 'A' + 10;
	else
		return "0000";				// not a hex digit

	return apcNibble[iValue];
}

/*************************************************************************
* @f Fnct			: hex2binary
* @p sAscii			: String of hexadecimal digits. Every character is
						passed to hexchar2binary, which yields "0000" for
						characters it does not recognise.
* @r Return			: Binary string ('0'/'1' characters) without leading
						zeros. "0" when the value is zero or sAscii is empty.
* Description       : Convert stl string of hex values to a binary string.
* @author			: Boby thomas
**************************************************************************/

string hex2binary(string sAscii)
{
	string sBinary;
	sBinary.reserve(sAscii.length() * 4);	// four bits per hex digit

	for(string::size_type iCnt = 0; iCnt < sAscii.length(); iCnt++)
		sBinary += hexchar2binary(sAscii[iCnt]);

	// Strip leading zeros. The string holds only '0' and '1', so the first
	// character that is not '0' is the most significant set bit.
	const string::size_type iFirstOne = sBinary.find_first_not_of("0");
	if(iFirstOne == string::npos)
		return "0";

	// Take everything up to the end; a fixed character count would silently
	// cut off long inputs.
	return sBinary.substr(iFirstOne);
}


/*************************************************************************
* @f Fnct			: binary8bit2hex
* @p sBinary		: String of '0' and '1', most significant bit first.
						Meant for up to 8 bits. Longer strings are not
						rejected, but bits beyond the width of unsigned int
						are discarded.
* @r Return			: Lower case hexadecimal string of the value, padded
						with a leading "0" to at least two digits ("00" for
						an empty string).
* Description       : Convert stl binary string to hex value string.
						Any character other than '0' or '1' prints
						"\nInvalid binary data" on cout and counts as a 0 bit.
* @author			: Boby thomas
**************************************************************************/

string binary8bit2hex(string sBinary)
{
	static const char acDigits[] = "0123456789abcdef";

	// Build the value with integer shifts, most significant bit first.
	// Unsigned arithmetic keeps over-long input well defined.
	unsigned int uiValue = 0;
	for(string::size_type iPos = 0; iPos < sBinary.length(); iPos++)
	{
		uiValue <<= 1;
		switch(sBinary[iPos])
		{
		case '1':
			uiValue |= 1u;
			break;

		case '0':
			break;

		default:
			cout<<"\nInvalid binary data";
		}
	}

	// Emit hex digits from the least significant nibble upwards.
	string sHex;
	do
	{
		sHex.insert(sHex.begin(), acDigits[uiValue & 0xfu]);
		uiValue >>= 4;
	} while(uiValue != 0);

	if(sHex.length() == 1)
		sHex = "0" + sHex;

	return sHex;
}

/*************************************************************************
* @f Fnct			: binary2hex
* @p sBinary		: String of '0' and '1', most significant bit first,
						of any length.
* @r Return			: Hexadecimal string of the value. "00" when sBinary
						contains no '1'.
* Description       : Convert stl binary string to hex value string.
						Everything before the first '1' is dropped, then the
						rest is cut into 8 bit groups starting from the right
						hand end and each group is converted with
						binary8bit2hex.
* @author			: Boby thomas
**************************************************************************/
string binary2hex(string sBinary)
{
	const string::size_type iStart = sBinary.find_first_of("1");
	if(iStart == string::npos)
		return "00";

	// Keep everything from the first '1' to the end; a fixed character
	// count would silently cut off long inputs.
	sBinary = sBinary.substr(iStart);

	string sHex;
	while(sBinary.length() > 8)
	{
		// Peel the lowest 8 bits off the right hand end.
		const string::size_type iCut = sBinary.length() - 8;
		sHex = binary8bit2hex(sBinary.substr(iCut)) + sHex;
		sBinary = sBinary.substr(0, iCut);
	}

	// What is left is the most significant group of 1 to 8 bits.
	sHex = binary8bit2hex(sBinary) + sHex;
	return sHex;
}


/*************************************************************************
* @f Fnct			: convertHex2UTF
* @p sHex			: Value of one character as lower case hex digits.
						Surrounding blanks are trimmed and the string is cut
						at the first character outside "0123456789abcdef".
* @r Return			: Hex string of the UTF byte sequence for that single
						character, or "error" (after printing
						"Too long input..." on cout) when the value needs more
						than six continuation bytes.
* Description     : Convert stl hex character string to corresponding
						UTF character string. Do not misunderstand this function
                  with a stream converter. This function converts only one
                  character.
                  For example
                  "7f" return "7f"
                  "80" return "c280"
                  "fffd" return "efbfbd"
                  The value is not checked against the Unicode range; lead
                  bytes with up to seven marker bits can be produced.

* @author			: Boby thomas
**************************************************************************/

string convertHex2UTF(string sHex)
{
	string sUTFBinary;

	// Number of payload bits the lead byte can still hold. It shrinks by one
	// for every continuation byte that is emitted.
	unsigned int iLen = 6;

	sHex = EatupWhiteSpace(sHex);
	const string::size_type iTmp = sHex.find_first_not_of("0123456789abcdef");
	if(iTmp != string::npos)
		sHex = sHex.substr(0,iTmp);

	//we have a binary array now.
	sHex = hex2binary(sHex);

	if(sHex.length() > 7)
	{
		// More than 7 significant bits: multi byte sequence. Move the lowest
		// six bits into a "10xxxxxx" continuation byte until the remaining
		// bits fit into the lead byte.
		while(iLen < sHex.length() )
		{
			while(sHex.length()<6)
				sHex = "0" + sHex;
			sUTFBinary = "10" + sHex.substr(sHex.length()-6) + sUTFBinary;

			sHex = sHex.substr(0,sHex.length()-6);

			iLen--;
		}

		// iLen is unsigned: a seventh continuation byte decrements it below
		// zero and it wraps to a huge value, which is what this test catches.
		if(iLen > 6)
		{
			cout<<"Too long input...";
			return "error";
		}

		// Lead byte: one '0' separator plus iLen payload bits ...
		while(sHex.length() <= iLen)
			sHex = "0" + sHex;

		// ... preceded by marker '1' bits up to a full byte.
		while(sHex.length() < 8)
			sHex = "1" + sHex;

		sUTFBinary = sHex + sUTFBinary;
	}
	else
		sUTFBinary = sHex;	// up to 7 bits: the byte is the value itself

	return binary2hex(sUTFBinary);
}

/*************************************************************************
* @f Fnct			: findLengthUTF
* @p sUTFFirstByte	: First byte of a UTF sequence as lower case hex
						digits, for example "c2". Surrounding blanks are
						trimmed and the string is cut at the first character
						outside "0123456789abcdef".
* @r Return			: Number of bytes in the sequence that starts with
						this byte: 1 when the top bit is clear, otherwise the
						number of leading '1' bits (2 to 7).
						-1 for a byte that cannot start a sequence: a
						continuation byte (10xxxxxx) or 0xff.
* Description       : Returns the number of bytes in the UTF character.
						Say for example 0xc2  will return 2 since one more byte
						following this will constitute the UTF character.
* @author			: Boby thomas
**************************************************************************/
long findLengthUTF(string sUTFFirstByte)
{
	sUTFFirstByte = EatupWhiteSpace(sUTFFirstByte);
	const string::size_type iEnd = sUTFFirstByte.find_first_not_of("0123456789abcdef");
	if(iEnd != string::npos)
		sUTFFirstByte = sUTFFirstByte.substr(0,iEnd);

	//we have a binary array now.
	sUTFFirstByte = hex2binary(sUTFFirstByte);

	// hex2binary strips leading zeros; restore them up to a full byte.
	while(sUTFFirstByte.length() < 8)
		sUTFFirstByte = "0"+sUTFFirstByte;

	// Count the leading '1' bits of the first eight bits.
	long iLen = 0;
	while(iLen < 8 && sUTFFirstByte[iLen] != '0')
		iLen++;

	if(iLen == 8)	// 0xff: no terminating zero bit
		return -1;

	if(iLen == 1)	// 10xxxxxx is a continuation byte, never a first byte
		return -1;

	if(iLen == 0)	// 0xxxxxxx: single byte character
		return 1;

	return iLen;
}


/*************************************************************************
* @f Fnct			: convertUTF2Hex
* @p sUTF			: Bytes of one UTF character as lower case hex digits.
						Surrounding blanks are trimmed and the string is cut
						at the first character outside "0123456789abcdef".
* @r Return			: Hex value corresponding to the UTF character.
						"error" on invalid character: a first byte of 0xff or
						of the form 10xxxxxx, fewer bytes than the first byte
						announces, or a following byte that is not 10xxxxxx.
* Description       : Returns the hex value corresponding to a UTF character.
						Do not misunderstand this function with a stream converter.
                  This function converts only one UTF-8 character.
                  For example
                  "7f" return "7f"
                  "c280" return "80"
                  "efbfbd" return "fffd"
                  Bytes after the announced sequence length are ignored.

* @author			: Boby thomas
**************************************************************************/
string convertUTF2Hex(string sUTF)
{
	sUTF = EatupWhiteSpace(sUTF);
	const string::size_type iStart = sUTF.find_first_not_of("0123456789abcdef");
	if(iStart != string::npos)
		sUTF = sUTF.substr(0,iStart);

	//we have a binary array now.
	sUTF = hex2binary(sUTF);

	// hex2binary strips leading zeros; restore whole bytes.
	while(sUTF.length()%8 != 0)
		sUTF = "0"+sUTF;

	const string sHeader = sUTF.substr(0,8);

	// The number of leading '1' bits in the first byte is the sequence
	// length; zero of them means a plain single byte character.
	unsigned int iLen = 0;
	while(sHeader[iLen] != '0')
	{
		iLen ++;
		if(iLen == 8)
			return "error";
	}

	// 10xxxxxx is a continuation byte and cannot start a character.
	if(iLen == 1)
		return "error";

	if(sUTF.length() < (iLen*8))
		return "error";

	// Payload of the first byte: everything after the marker bits and the
	// '0' that terminates them.
	string sBinary = sHeader.substr(iLen + 1);

	// Each following byte must be 10xxxxxx and contributes six payload bits.
	// The loop does not run for a single byte character.
	for(unsigned int iByte = 1; iByte < iLen; iByte++)
	{
		const string sOctet = sUTF.substr((iByte*8),8);
		if((sOctet[0] != '1') || (sOctet[1] != '0') )
			return "error";

		sBinary += sOctet.substr(2,6);
	}

	return binary2hex(sBinary);
}


/*************************************************************************
* @f Fnct			: generateUTFFileDetails
* @p sFileName		: Name of the file to examine. The report is written
						to "utfdetails_" + sFileName.
* @r Return			: true - file could be a UTF file.
						(No invalid UTF character in the file)
						false - the input or the report file could not be
						opened, or at least one invalid sequence was found.
* Description       : This function evaluates a file for validity. Returns
						false if there is a single occurrence of an impossible
						character. Writes a file utfdetails_<filename> with all
						the utf character details: one line per character, or
						a line describing the invalid bytes.
* @author			: Boby thomas
**************************************************************************/
bool generateUTFFileDetails(string sFileName)
{
	bool bSuccess = true;
	long lLength = 0;			// bytes still missing from the current sequence
	char HexBuffer[25] = {0};
	string UTFString;			// hex digits of the sequence collected so far

	FILE * fpInput = fopen(sFileName.c_str(),"rb");
	if(fpInput == NULL)
	{
		cout<<"Failed to open file "<<sFileName.c_str()<<"\n";
		return false;
	}

	string sOutput = "utfdetails_" + sFileName;

	FILE * fpOutput = fopen(sOutput.c_str(),"wb");
	if(fpOutput == NULL)
	{
		cout<<"Failed to open output file "<<sOutput.c_str()<<"\n";
		fclose(fpInput);		// do not leak the input stream
		return false;
	}

	fprintf(fpOutput,"=================================================================\n");
	fprintf(fpOutput,"        ASCII       ||       BINARY       ||        UTF         \n");
	fprintf(fpOutput,"=================================================================\n");

	int lChar;
	while((lChar = fgetc(fpInput)) != EOF)
	{
		// fgetc yields an unsigned char value, so "%x" writes at most two
		// digits into HexBuffer.
		sprintf(HexBuffer,"%x",lChar);

		if(lLength == 0)
		{
			// First byte of a new sequence: it announces the length.
			UTFString = "";
			lLength = findLengthUTF(HexBuffer);
		}
		if(lLength == -1)
		{
			lLength = 0;
			bSuccess = false;
			fprintf(fpOutput,"Invalid UTF character. Not a possible first byte. Binary value:%s\n",HexBuffer);
			continue;
		}

		UTFString += HexBuffer;
		lLength--;
		if(lLength != 0)
			continue;			// sequence not complete yet

		// Decode the sequence, then encode the result again: the sequence is
		// accepted only when that reproduces the bytes that were read. The
		// "0" variant covers a single byte below 0x10, which "%x" prints as
		// one digit.
		const string sResult = convertUTF2Hex(UTFString);
		bool bValid = (sResult.compare("error") != 0);
		if(bValid)
		{
			const string sRoundTrip = convertHex2UTF(sResult);
			bValid = (sRoundTrip.compare(UTFString) == 0) ||
					 (sRoundTrip.compare("0"+UTFString) == 0);
		}

		if(bValid)
			fprintf(fpOutput,"%18c  ||%18s  ||%18s  \n",(char)lChar,UTFString.c_str(),sResult.c_str());
		else
		{
			bSuccess = false;	// an invalid sequence makes the file invalid
			fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
		}
	}

	// The input ended in the middle of a multi byte sequence.
	if(lLength > 0)
	{
		bSuccess = false;
		fprintf(fpOutput,"Invalid UTF character. Binary data:%s\n",UTFString.c_str());
	}

	fclose(fpInput);
	fclose(fpOutput);

	return bSuccess;

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -