📄 unicode.cc
字号:
//-----------------------------------------------------------------------------
// Torque Game Engine
// Copyright (C) GarageGames.com, Inc.
//-----------------------------------------------------------------------------
#include "core/unicode.h"
#include "core/frameAllocator.h"
#include "platform/profiler.h"
#include <stdio.h>
#pragma warning(disable:4068)
//-----------------------------------------------------------------------------
/// Replacement character, U+FFFD (the standard Unicode value for this purpose).
/// NOTE(review): presumably substituted for input that cannot be converted;
/// the code that uses it is not in this part of the file - confirm at the use sites.
#define kReplacementChar 0xFFFD
/// Lookup table for classifying a UTF-8 byte. Index it with (byte >> 1).
/// The entry is the total length, in bytes, of the sequence that the byte
/// starts: 1 for single-byte ASCII, 2 through 6 for multi-byte lead bytes.
/// An entry of 0 marks a byte that cannot start a sequence: a trailing
/// (continuation) byte, or the illegal values 0xFE / 0xFF.
/// The table is unsigned, so illegal entries are 0, not -1.
#pragma mark U8 firstByteLUT
U8 firstByteLUT[128] =
{
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x0F // single byte ascii
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F // single byte ascii
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x2F // single byte ascii
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x3F // single byte ascii
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4F // trailing utf8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x5F // trailing utf8
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x6F // first of 2
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, // 0x7F // first of 3, 4, 5, 6; last entry (0xFE/0xFF) is illegal in utf-8
};
/// Lookup table for classifying a UTF-16 code unit. Index it with (word >> 10).
/// 0 means the unit is not a surrogate, 1 means it is the first unit of a
/// surrogate pair, 2 means it is the second unit of a pair.
/// Only two entries are non-zero: index 0x36 (units 0xD800-0xDBFF) holds 1 and
/// index 0x37 (units 0xDC00-0xDFFF) holds 2.
#pragma mark U8 surrogateLUT
U8 surrogateLUT[64] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x2F
0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, // 0x3F // 0x36 = first of pair, 0x37 = second of pair
};
/// Data-bit masks for UTF-8 code units. Index with the value obtained from
/// firstByteLUT (0 through 6); the result keeps only the payload bits of
/// that code unit.
#pragma mark U8 byteMask8LUT[]
U8 byteMask8LUT[] =
{
   0x3F, // 0: 6 data bits
   0x7F, // 1: 7 data bits
   0x1F, // 2: 5 data bits
   0x0F, // 3: 4 data bits
   0x07, // 4: 3 data bits
   0x03, // 5: 2 data bits
   0x01  // 6: 1 data bit
};
/// Selects the low ten bits of a UTF-16 surrogate, i.e. its data bits.
#pragma mark U8 byteMaskLow10
U16 byteMaskLow10 = 0x3FF; // binary 11 1111 1111
#pragma mark -
//-----------------------------------------------------------------------------
/// Tell whether a code point falls in the UTF-16 surrogate block.
///
/// The surrogate block is U+D800 through U+DFFF inclusive. Both end points
/// are surrogates themselves, so the comparison must be inclusive; this also
/// agrees with surrogateLUT, which classifies 0xD800 and 0xDFFF as surrogates.
///
/// @param codepoint  Code point to test.
/// @return true if @a codepoint is in [0xD800, 0xDFFF].
inline bool isSurrogateRange(U32 codepoint)
{
   return ( 0xd800 <= codepoint && codepoint <= 0xdfff );
}
/// Tell whether a code point lies beyond the Basic Multilingual Plane.
///
/// @param codepoint  Code point to test.
/// @return true if @a codepoint is greater than U+FFFF.
inline bool isAboveBMP(U32 codepoint)
{
   const U32 lastBMPCodepoint = 0xFFFF;
   return lastBMPCodepoint < codepoint;
}
#pragma mark -
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-8 string into a NUL-terminated UTF-16 string.
///
/// One UTF-16 unit is stored per decoded code point. Output stops at the end
/// of the input or when the buffer is full, whichever comes first.
///
/// @param unistring  NUL-terminated UTF-8 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in UTF-16 units, INCLUDING the
///                   slot for the terminator. At most len - 1 units of text
///                   are stored so the terminator never lands at
///                   outbuffer[len], one element past the buffer.
/// @return Number of UTF-16 units stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF8toUTF16(const UTF8 *unistring, UTF16 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF8toUTF16);
   U32 walked, nCodepoints;
   UTF32 middleman;
   nCodepoints = 0;
   // Leave the last slot free for the terminator.
   while(*unistring != 0 && nCodepoints < len - 1)
   {
      // oneUTF8toUTF32 receives &walked; presumably it stores the number of
      // input bytes it consumed (defined outside this block - confirm).
      walked = 1;
      middleman = oneUTF8toUTF32(unistring,&walked);
      outbuffer[nCodepoints] = oneUTF32toUTF16(middleman);
      unistring += walked;
      nCodepoints++;
   }
   // The loop bound guarantees nCodepoints <= len - 1 here.
   outbuffer[nCodepoints] = 0;
   PROFILE_END();
   return nCodepoints;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-8 string into a NUL-terminated UTF-32 string.
///
/// Output stops at the end of the input or when the buffer is full,
/// whichever comes first.
///
/// @param unistring  NUL-terminated UTF-8 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in UTF-32 units, INCLUDING the
///                   slot for the terminator. At most len - 1 code points
///                   are stored so the terminator never lands at
///                   outbuffer[len], one element past the buffer.
/// @return Number of code points stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF8toUTF32(const UTF8 *unistring, UTF32 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF8toUTF32);
   U32 walked, nCodepoints;
   nCodepoints = 0;
   // Leave the last slot free for the terminator.
   while(*unistring != 0 && nCodepoints < len - 1)
   {
      // oneUTF8toUTF32 receives &walked; presumably it stores the number of
      // input bytes it consumed (defined outside this block - confirm).
      walked = 1;
      outbuffer[nCodepoints] = oneUTF8toUTF32(unistring,&walked);
      unistring += walked;
      nCodepoints++;
   }
   // The loop bound guarantees nCodepoints <= len - 1 here.
   outbuffer[nCodepoints] = 0;
   PROFILE_END();
   return nCodepoints;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-16 string into a NUL-terminated UTF-8 string.
///
/// Encoding of a code point only starts while more than three bytes remain
/// free, so a buffer of fewer than four bytes receives just the terminator.
///
/// @param unistring  NUL-terminated UTF-16 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in bytes, INCLUDING the byte
///                   for the terminator.
/// @return Number of UTF-8 bytes stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF16toUTF8( const UTF16 *unistring, UTF8 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF16toUTF8);
   U32 walked, nCodeunits, codeunitLen;
   UTF32 middleman;
   nCodeunits = 0;
   // Same bound as "len - 3", but computed without unsigned wrap-around:
   // for len < 3 the subtraction would wrap to a huge value and disable the
   // bounds check entirely.
   // NOTE(review): the 3-byte headroom assumes oneUTF32toUTF8 stores at most
   // three bytes per call - confirm against its definition.
   const U32 limit = (len > 3) ? len - 3 : 0;
   while( *unistring != 0 && nCodeunits < limit)
   {
      // oneUTF16toUTF32 receives &walked; presumably it stores the number of
      // UTF-16 units it consumed (defined outside this block - confirm).
      walked = 1;
      middleman = oneUTF16toUTF32(unistring,&walked);
      codeunitLen = oneUTF32toUTF8(middleman, &outbuffer[nCodeunits]);
      unistring += walked;
      nCodeunits += codeunitLen;
   }
   // Keep the terminator inside the buffer even if the helper stored more
   // bytes than the headroom above allows for.
   nCodeunits = getMin(nCodeunits, len - 1);
   outbuffer[nCodeunits] = 0;
   PROFILE_END();
   return nCodeunits;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-16 string into a NUL-terminated UTF-32 string.
///
/// Output stops at the end of the input or when the buffer is full,
/// whichever comes first.
///
/// @param unistring  NUL-terminated UTF-16 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in UTF-32 units, INCLUDING the
///                   slot for the terminator. At most len - 1 code points
///                   are stored so the terminator never lands at
///                   outbuffer[len], one element past the buffer.
/// @return Number of code points stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF16toUTF32(const UTF16 *unistring, UTF32 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF16toUTF32);
   U32 walked, nCodepoints;
   nCodepoints = 0;
   // Leave the last slot free for the terminator.
   while( *unistring != 0 && nCodepoints < len - 1 )
   {
      // oneUTF16toUTF32 receives &walked; presumably it stores the number of
      // UTF-16 units it consumed (defined outside this block - confirm).
      walked = 1;
      outbuffer[nCodepoints] = oneUTF16toUTF32(unistring,&walked);
      unistring += walked;
      nCodepoints++;
   }
   // The loop bound guarantees nCodepoints <= len - 1 here.
   outbuffer[nCodepoints] = 0;
   PROFILE_END();
   return nCodepoints;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-32 string into a NUL-terminated UTF-8 string.
///
/// Encoding of a code point only starts while more than three bytes remain
/// free, so a buffer of fewer than four bytes receives just the terminator.
///
/// @param unistring  NUL-terminated UTF-32 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in bytes, INCLUDING the byte
///                   for the terminator.
/// @return Number of UTF-8 bytes stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF32toUTF8( const UTF32 *unistring, UTF8 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF32toUTF8);
   U32 nCodeunits, codeunitLen;
   nCodeunits = 0;
   // Same bound as "len - 3", but computed without unsigned wrap-around:
   // for len < 3 the subtraction would wrap to a huge value and disable the
   // bounds check entirely.
   // NOTE(review): the 3-byte headroom assumes oneUTF32toUTF8 stores at most
   // three bytes per call - confirm against its definition.
   const U32 limit = (len > 3) ? len - 3 : 0;
   while( *unistring != 0 && nCodeunits < limit)
   {
      codeunitLen = oneUTF32toUTF8(*unistring, &outbuffer[nCodeunits]);
      unistring++;
      nCodeunits += codeunitLen;
   }
   // Keep the terminator inside the buffer even if the helper stored more
   // bytes than the headroom above allows for.
   nCodeunits = getMin(nCodeunits, len - 1);
   outbuffer[nCodeunits] = 0;
   PROFILE_END();
   return nCodeunits;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-32 string into a NUL-terminated UTF-16 string.
///
/// One UTF-16 unit is stored per input code point. Output stops at the end
/// of the input or when the buffer is full, whichever comes first.
///
/// @param unistring  NUL-terminated UTF-32 input.
/// @param outbuffer  Destination buffer.
/// @param len        Capacity of @a outbuffer in UTF-16 units, INCLUDING the
///                   slot for the terminator. At most len - 1 units of text
///                   are stored so the terminator never lands at
///                   outbuffer[len], one element past the buffer.
/// @return Number of UTF-16 units stored, not counting the terminator.
///         Returns 0 without touching @a outbuffer when @a len is 0.
const U32 convertUTF32toUTF16(const UTF32 *unistring, UTF16 *outbuffer, U32 len)
{
   // A zero-capacity buffer cannot hold even the terminator.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF32toUTF16);
   U32 nCodepoints = 0;
   // Leave the last slot free for the terminator.
   while(*unistring != 0 && nCodepoints < len - 1)
   {
      outbuffer[nCodepoints] = oneUTF32toUTF16(*unistring);
      unistring++;
      nCodepoints++;
   }
   // The loop bound guarantees nCodepoints <= len - 1 here.
   outbuffer[nCodepoints] = 0;
   PROFILE_END();
   return nCodepoints;
}
#pragma mark -
//-----------------------------------------------------------------------------
// Functions that convert a NUL-terminated Unicode string into a newly
// allocated buffer, which is returned to (and owned by) the caller.
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-8 string into a newly allocated,
/// NUL-terminated UTF-16 string.
///
/// @param unistring  NUL-terminated UTF-8 input.
/// @return Array allocated with new[]; the caller owns it and must release
///         it with delete[].
const UTF16* convertUTF8toUTF16( const UTF8* unistring)
{
   PROFILE_START(convertUTF8toUTF16);
   // allocate plenty of memory: the converter stores one UTF-16 unit per
   // code point, and the input cannot hold more code points than it has
   // bytes. The extra element is for the terminator.
   U32 nCodepoints, len = dStrlen(unistring) + 1;
   FrameTemp<UTF16> buf(len);
   // perform conversion
   nCodepoints = convertUTF8toUTF16( unistring, buf, len);
   // add 1 for the NULL terminator the converter promises it included.
   nCodepoints++;
   // allocate the return buffer, copy over, and return it.
   // new T[n] allocates n elements (new T(n) would allocate a single element
   // initialised to n), and dMemcpy is given a size in bytes, so the unit
   // count is scaled by the element size.
   UTF16 *ret = new UTF16[nCodepoints];
   dMemcpy(ret, buf, nCodepoints * sizeof(UTF16));
   PROFILE_END();
   return ret;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-8 string into a newly allocated,
/// NUL-terminated UTF-32 string.
///
/// @param unistring  NUL-terminated UTF-8 input.
/// @return Array allocated with new[]; the caller owns it and must release
///         it with delete[].
const UTF32* convertUTF8toUTF32( const UTF8* unistring)
{
   PROFILE_START(convertUTF8toUTF32);
   // allocate plenty of memory: the input cannot hold more code points than
   // it has bytes. The extra element is for the terminator.
   U32 nCodepoints, len = dStrlen(unistring) + 1;
   FrameTemp<UTF32> buf(len);
   // perform conversion
   nCodepoints = convertUTF8toUTF32( unistring, buf, len);
   // add 1 for the NULL terminator the converter promises it included.
   nCodepoints++;
   // allocate the return buffer, copy over, and return it.
   // new T[n] allocates n elements (new T(n) would allocate a single element
   // initialised to n), and dMemcpy is given a size in bytes, so the unit
   // count is scaled by the element size.
   UTF32 *ret = new UTF32[nCodepoints];
   dMemcpy(ret, buf, nCodepoints * sizeof(UTF32));
   PROFILE_END();
   return ret;
}
//-----------------------------------------------------------------------------
/// Convert a NUL-terminated UTF-16 string into a newly allocated,
/// NUL-terminated UTF-8 string.
///
/// @param unistring  NUL-terminated UTF-16 input.
/// @return Array allocated with new[]; the caller owns it and must release
///         it with delete[].
const UTF8* convertUTF16toUTF8( const UTF16* unistring)
{
   PROFILE_START(convertUTF16toUTF8);
   // allocate plenty of memory: three bytes per input unit plus one byte for
   // the terminator. The buffer converter only starts encoding a code point
   // while more than three bytes remain free, so without the extra byte the
   // final character would be dropped, and an empty input would get a
   // zero-sized buffer with no room for the terminator.
   U32 nCodeunits, len = dStrlen(unistring) * 3 + 1;
   FrameTemp<UTF8> buf(len);
   // perform conversion
   nCodeunits = convertUTF16toUTF8( unistring, buf, len);
   // add 1 for the NULL terminator the converter promises it included.
   nCodeunits++;
   // allocate the return buffer, copy over, and return it.
   // new T[n] allocates n elements (new T(n) would allocate a single element
   // initialised to n), and dMemcpy is given a size in bytes.
   UTF8 *ret = new UTF8[nCodeunits];
   dMemcpy(ret, buf, nCodeunits * sizeof(UTF8));
   PROFILE_END();
   return ret;
}
//-----------------------------------------------------------------------------
const UTF32* convertUTF16toUTF32(const UTF16* unistring)
{
PROFILE_START(convertUTF16toUTF32);
// allocate plenty of memory.
U32 nCodepoints, len = dStrlen(unistring);
FrameTemp<UTF32> buf(len);
// perform conversion
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -