📄 utf8.cpp

📁 utf8编码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
		ret = ::MultiByteToWideChar(CP_ACP,0,srcData,(int)srcCount,wstr,(int)destCount);
		if(ret == 0) ret = -1;
#else
		ret = ::mbstowcs(wstr, srcData,srcCount);
#endif
			if(ret < 0){
				throw UTF8ConvertError();
			}
			len = UNICODEToUTF8(wstr,ret,destData,destCount);
		}catch(...){
			free(wstr);
			throw;
		}
		free(wstr);
		return len;
	}
	/**
	 * Converts UTF-8 bytes to the platform multi-byte ("ANSI") encoding.
	 *
	 * The input is decoded into a temporary wide-character buffer with
	 * UTF8ToUNICODE and then narrowed with WideCharToMultiByte(CP_ACP) when
	 * built with MSVC, or with wcstombs otherwise.
	 *
	 * @param srcData   UTF-8 input bytes.
	 * @param srcCount  Number of input bytes.
	 * @param destData  Output buffer receiving the multi-byte text.
	 * @param destCount Capacity of destData in bytes; it is also used as the
	 *                  capacity (in wide characters) of the temporary buffer.
	 * @return Number of bytes stored in destData (a terminating NUL that
	 *         wcstombs may append is not counted).
	 * @throws UTF8ConvertError if the temporary buffer cannot be allocated or
	 *         the narrowing conversion fails. Exceptions raised by
	 *         UTF8ToUNICODE propagate unchanged.
	 */
	DNC_DECLARE size_t UTF8ToANSI(custr srcData,size_t srcCount, astr destData,size_t destCount){
		// Use sizeof(wchar_t): a wide character is not 2 bytes on every platform.
		wchar_t *wideBuf = (wchar_t*)malloc((destCount+1)*sizeof(wchar_t));
		if(wideBuf == NULL){
			throw UTF8ConvertError();
		}
		size_t written = 0;
		try{
			const size_t wideLen = UTF8ToUNICODE(srcData,srcCount,wideBuf,destCount);
			wideBuf[wideLen] = 0;
			// With nothing decoded there is nothing to narrow; this also avoids
			// calling WideCharToMultiByte with a zero-length input.
			if(wideLen != 0){
#if defined(_MSC_VER)
				// Bound the output by the caller's capacity, not by a guess
				// derived from the wide length.
				const int ret = ::WideCharToMultiByte(CP_ACP,0,wideBuf,(int)wideLen,destData,(int)destCount, NULL, NULL );
				// WideCharToMultiByte signals failure by returning 0.
				if(ret <= 0){
					throw UTF8ConvertError();
				}
				written = (size_t)ret;
#else
				written = ::wcstombs(destData,wideBuf,destCount);
				// wcstombs signals failure by returning (size_t)-1.
				if(written == (size_t)-1){
					throw UTF8ConvertError();
				}
#endif
			}
		}catch(...){
			// Release the temporary buffer on every error path, then re-raise.
			free(wideBuf);
			throw;
		}
		free(wideBuf);
		return written;
	}
	/**
	 * Decodes UTF-8 bytes into UTF-16 code units.
	 *
	 * Code points that fit in 16 bits are stored as a single unit; code points
	 * up to 0x10FFFF are stored as a leading/trailing surrogate pair.
	 *
	 * Decoding stops early, without an error, when the output buffer is full,
	 * when there is no room left for a complete surrogate pair, or when the
	 * last sequence in the input is cut short. The number of source bytes
	 * consumed is not reported, so callers cannot resume a partial conversion.
	 *
	 * @param srcData   UTF-8 input bytes.
	 * @param srcCount  Number of input bytes.
	 * @param destData  Output buffer for the decoded units.
	 * @param destCount Capacity of destData in wide characters.
	 * @return Number of wide characters written to destData.
	 * @throws UTF8FormatError if a lead byte does not match the expected
	 *         pattern for its length, a continuation byte is not of the form
	 *         10xxxxxx, or the decoded value is greater than 0x10FFFF.
	 */
	DNC_DECLARE size_t UTF8ToUNICODE(custr srcData,size_t srcCount, wstr destData,size_t destCount){
		// Nothing to read, or nowhere to write.
		if (!srcCount || !destCount)
			return 0;

		// Cursors and limits for the input and output buffers.
		custr  srcPtr = srcData;
		custr  srcEnd = srcPtr + srcCount;
		wstr   outPtr = destData;
		wstr   outEnd = outPtr + destCount;

		// Loop until we run out of input or of room for output.
		while ((srcPtr < srcEnd) && (outPtr < outEnd))
		{
			// ASCII fast path: a byte <= 127 is a complete character.
			if (*srcPtr <= 127)
			{
				*outPtr++ = wchar_t(*srcPtr++);
				continue;
			}

			// Number of trailing bytes this lead byte announces.
			const unsigned int trailingBytes = gUTFBytes[*srcPtr];

			//
			//  Stop if the sequence is truncated. The comparison is >= because
			//  the lead byte itself is counted implicitly. No pointer has been
			//  moved yet, so there is nothing to undo.
			//
			if (srcPtr + trailingBytes >= srcEnd)
				break;

			// The encoding cannot be assumed valid: check the lead byte first.
			if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes])
				throw UTF8FormatError(0,trailingBytes,*srcPtr,srcPtr-srcData);

			// Accumulate the raw bytes, six bits per step, checking that every
			// trailing byte has the 10xxxxxx form.
			unsigned long tmpVal = *srcPtr++;
			tmpVal <<= 6;
			for(unsigned int i=1; i<trailingBytes; i++)
			{
				if((*srcPtr & 0xC0) == 0x80)
				{
					tmpVal += *srcPtr++;
					tmpVal <<= 6;
				}else throw UTF8FormatError(i,trailingBytes,*srcPtr,srcPtr-srcData);
			}
			// The last trailing byte is added without a further shift.
			if((*srcPtr & 0xC0) == 0x80){
				tmpVal += *srcPtr++;
			}else throw UTF8FormatError(trailingBytes,trailingBytes,*srcPtr,srcPtr-srcData);

			// The raw bytes were summed with their marker bits still set;
			// presumably this per-length offset removes them (table is defined elsewhere).
			tmpVal -= gUTFOffsets[trailingBytes];

			if (!(tmpVal & 0xFFFF0000))	{
				// Fits in a single 16-bit unit.
				*outPtr++ = wchar_t(tmpVal);
			}else if (tmpVal > 0x10FFFF){
				//
				//  Beyond the Unicode range. Always report it: this function
				//  does not tell the caller how many bytes were consumed, so
				//  stopping quietly here would just truncate the result.
				//
				throw UTF8FormatError(srcPtr-srcData);
			}else{
				//
				//  A surrogate pair needs two output slots. If only one is
				//  left, stop; the character is not emitted.
				//
				if (outPtr + 1 >= outEnd)
					break;

				// Leading surrogate, then trailing surrogate.
				tmpVal -= 0x10000;
				*outPtr++ = wchar_t((tmpVal >> 10) + 0xD800);
				*outPtr++ = wchar_t((tmpVal & 0x3FF) + 0xDC00);
			}
		}

		// Number of wide characters produced.
		return outPtr - destData;
	}


	/**
	 * Encodes wide characters (UTF-16 code units, with surrogate pairs
	 * combined) as UTF-8 bytes.
	 *
	 * Encoding stops early, without an error, when the next character does not
	 * fit completely into the output buffer or when the input ends with a
	 * leading surrogate that has no partner. The number of source characters
	 * consumed is not reported.
	 *
	 * NOTE(review): the unit following a leading surrogate is not checked to be
	 * a trailing surrogate (0xDC00-0xDFFF); an unpaired lead yields a
	 * meaningless code point.
	 *
	 * @param srcData   Wide-character input.
	 * @param srcCount  Number of input wide characters.
	 * @param destData  Output buffer for the UTF-8 bytes.
	 * @param destCount Capacity of destData in bytes.
	 * @return Number of bytes written to destData.
	 */
	DNC_DECLARE size_t UNICODEToUTF8(cwstr srcData,size_t srcCount,ustr destData,size_t destCount){
		// Nothing to read, or nowhere to write.
		if (!srcCount || !destCount)
			return 0;

		// Cursors and limits for the input and output buffers.
		cwstr    srcPtr = srcData;
		cwstr    srcEnd = srcPtr + srcCount;
		ustr     outPtr = destData;
		ustr     outEnd = destData + destCount;

		while (srcPtr < srcEnd)
		{
			// Widen the next unit; a surrogate pair needs more than 16 bits.
			unsigned long curVal = *srcPtr;

			//
			//  A leading surrogate needs its partner. If the partner is not
			//  available, stop here.
			//
			unsigned int srcUsed = 1;
			if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
			{
				if (srcPtr + 1 >= srcEnd)
					break;

				// Combine the pair into one code point.
				curVal = ((curVal - 0xD800) << 10)
						+ ((*(srcPtr + 1) - 0xDC00) + 0x10000);

				// The pair consumes two source units.
				srcUsed++;
			}

			// Number of UTF-8 bytes required for this value.
			unsigned int encodedBytes;
			if (curVal < 0x80)
				encodedBytes = 1;
			else if (curVal < 0x800)
				encodedBytes = 2;
			else if (curVal < 0x10000)
				encodedBytes = 3;
			else if (curVal < 0x200000)
				encodedBytes = 4;
			else if (curVal < 0x4000000)
				encodedBytes = 5;
			else if (curVal <= 0x7FFFFFFF)
				encodedBytes = 6;
			else
			{
				//
				//  The value does not fit in 31 bits (for example a negative
				//  wchar_t where that type is signed). Flag it in debug
				//  builds, then step over the offending unit(s): without the
				//  advance, a build with assertions disabled would spin on
				//  the same character forever.
				//
				assert(0);
				srcPtr += srcUsed;
				continue;
			}

			// If the whole character does not fit, leave it unconsumed.
			if (outPtr + encodedBytes > outEnd)
				break;

			// Committed: consume the source unit(s).
			srcPtr += srcUsed;

			//
			//  Write the bytes last-to-first: jump past the slot, then walk
			//  back. Every case deliberately falls through to the next one.
			//
			outPtr += encodedBytes;
			switch(encodedBytes)
			{
				case 6 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
						curVal >>= 6;
						// fall through
				case 5 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
						curVal >>= 6;
						// fall through
				case 4 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
						curVal >>= 6;
						// fall through
				case 3 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
						curVal >>= 6;
						// fall through
				case 2 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
						curVal >>= 6;
						// fall through
				case 1 : *--outPtr = byte
						(
							curVal | gFirstByteMark[encodedBytes]
						);
			}

			// outPtr is back at the first byte; move past the character.
			outPtr += encodedBytes;
		}

		// Number of bytes produced.
		return (outPtr - destData);
	}

    /**
     * Converts a UTF-8 string to the platform multi-byte ("ANSI") encoding.
     *
     * The result is built from the byte count reported by the buffer-based
     * UTF8ToANSI overload, so it does not depend on a NUL terminator and is
     * not cut short at an embedded NUL byte.
     *
     * @param srcData UTF-8 encoded text.
     * @return The converted text.
     * @throws Whatever the buffer-based UTF8ToANSI overload throws.
     */
    DNC_DECLARE std::string UTF8ToANSI(const std::string &srcData){
        // Empty input: skip the buffer allocation and the conversion call.
        if (srcData.empty())
            return std::string();

        // Same sizing as before: one output byte per input byte, plus one.
        const size_t capacity = srcData.size()+1;
        Array<char> destData(capacity);
        const size_t len = UTF8ToANSI(reinterpret_cast<custr>(srcData.c_str()),srcData.size(),destData,capacity);
        // Construct from (pointer, length): no terminator has to be written
        // into the buffer, so a completely filled buffer cannot be overrun.
        return std::string(destData.data(), len);
    }
    /**
     * Converts a string in the platform multi-byte ("ANSI") encoding to UTF-8.
     *
     * The result is built from the byte count reported by the buffer-based
     * ANSIToUTF8 overload, so it does not depend on a NUL terminator and is
     * not cut short at an embedded NUL byte.
     *
     * @param srcData Text in the platform multi-byte encoding.
     * @return The UTF-8 encoded text.
     * @throws Whatever the buffer-based ANSIToUTF8 overload throws.
     */
    DNC_DECLARE std::string ANSIToUTF8(const std::string &srcData){
        // Empty input: skip the buffer allocation and the conversion call.
        if (srcData.empty())
            return std::string();

        // Same sizing as before: three output bytes per input byte, plus one.
        const size_t capacity = srcData.size()*3+1;
        Array<uchar> destData(capacity);
        const size_t len = ANSIToUTF8(srcData.c_str(),srcData.size(),destData,capacity);
        // Construct from (pointer, length) instead of writing a terminator
        // into the buffer and re-scanning it as a C string.
        return std::string(reinterpret_cast<const char*>(destData.data()), len);
    }
}

#include "utf8.dnc"
上一页 12
💿 文件大小 8 K
👤 上传用户 zcflion
📂 所属分类 VC书籍
📄 代码行数 588 行
💻 语言类型 C++
🏷️ 相关标签

#utf8 #编码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -