📄 unicode.cc
字号:
nCodepoints = convertUTF16toUTF32( unistring, buf, len);
// add 1 for the NULL terminator the converter promises it included.
nCodepoints++;
// allocate the return buffer, copy over, and return it.
UTF32 *ret = new UTF32(nCodepoints);
dMemcpy(ret, buf, nCodepoints);
PROFILE_END();
return ret;
}
//-----------------------------------------------------------------------------
// Converts a NUL-terminated UTF-32 string to UTF-8 and hands the result back in
// a freshly allocated buffer.
//
// unistring : NUL-terminated UTF-32 input; its length is measured with dStrlen().
// returns   : array allocated with new[] holding the converted code units followed
//             by the terminator the buffer-based converter is documented (per the
//             comment below) to write. The caller owns the array and should
//             release it with delete[].
const UTF8* convertUTF32toUTF8( const UTF32* unistring)
{
   PROFILE_START(convertUTF32toUTF8);

   // allocate plenty of memory: 3 code units per codepoint, plus one slot for
   // the terminator so that even an empty input leaves room for it.
   const U32 len = dStrlen(unistring) * 3 + 1;
   FrameTemp<UTF8> buf(len);

   // perform conversion, then add 1 for the NULL terminator the converter
   // promises it included.
   const U32 nCodeunits = convertUTF32toUTF8( unistring, buf, len) + 1;

   // allocate the return buffer as an ARRAY of nCodeunits elements.
   // (`new UTF8(n)` would create a single element initialised to n, and the
   // copy below would then write past the end of it.)
   UTF8 *ret = new UTF8[nCodeunits];
   dMemcpy(ret, buf, nCodeunits * sizeof(UTF8));

   PROFILE_END();
   return ret;
}
//-----------------------------------------------------------------------------
// Converts a NUL-terminated UTF-32 string to UTF-16 and hands the result back in
// a freshly allocated buffer.
//
// unistring : NUL-terminated UTF-32 input; its length is measured with dStrlen().
// returns   : array allocated with new[] holding the converted code units followed
//             by the terminator the buffer-based converter is documented (per the
//             comment below) to write. The caller owns the array and should
//             release it with delete[].
const UTF16* convertUTF32toUTF16(const UTF32* unistring)
{
   PROFILE_START(convertUTF32toUTF16);

   // allocate plenty of memory: one code unit per input codepoint, plus one
   // slot for the terminator (without it a full-length result has nowhere to
   // put the terminator, and an empty input gets a zero-sized buffer).
   const U32 len = dStrlen(unistring) + 1;
   FrameTemp<UTF16> buf(len);

   // perform conversion, then add 1 for the NULL terminator the converter
   // promises it included.
   const U32 nCodeunits = convertUTF32toUTF16( unistring, buf, len) + 1;

   // allocate the return buffer as an ARRAY of nCodeunits elements.
   // (`new UTF16(n)` would create a single element initialised to n.)
   UTF16 *ret = new UTF16[nCodeunits];

   // dMemcpy is given a size in bytes here, so scale the code-unit count by the
   // element size; copying only nCodeunits bytes would drop half the string.
   dMemcpy(ret, buf, nCodeunits * sizeof(UTF16));

   PROFILE_END();
   return ret;
}
#pragma mark -
//-----------------------------------------------------------------------------
// Functions that convert one Unicode codepoint at a time
//-----------------------------------------------------------------------------
// Decodes the single UTF-8 encoded codepoint that starts at codepoint[0].
//
// codepoint   : pointer to the first code unit (byte) of the sequence.
// unitsWalked : optional out-parameter; when non-NULL it receives the number of
//               code units consumed (1 whenever the sequence was rejected).
// returns     : the decoded codepoint, or kReplacementChar when the sequence is
//               malformed, falls in the surrogate range, or lies above the BMP.
const UTF32 oneUTF8toUTF32( const UTF8* codepoint, U32 *unitsWalked)
{
   PROFILE_START(oneUTF8toUTF32);

   // codepoints 6 codeunits long are read, but do not convert correctly,
   // and are filtered out anyway.
   UTF32 decoded = 0;

   // The lookup table is indexed by the top 7 bits of the leading byte and
   // yields the number of code units the sequence should occupy.
   const unsigned char leadBits = codepoint[0];
   U32 unitCount = firstByteLUT[leadBits >> 1];

   if(unitCount == 0)
   {
      // Bad start: got a medial or an illegal codeunit.
      // Emit the replacement and claim a single unit was parsed, so a run of
      // bad bytes yields a run of replacements instead of eating the next char.
      decoded = kReplacementChar;
      unitCount = 1;
   }
   else
   {
      // Payload bits of the leading code unit.
      decoded |= byteMask8LUT[unitCount] & codepoint[0];

      // Each trailing code unit contributes its low 6 bits.
      for(U32 idx = 1; idx < unitCount; idx++)
      {
         const U8 trailing = codepoint[idx];
         if( firstByteLUT[trailing >> 1] != 0 )
         {
            // Did not get a medial where one was required: same recovery as
            // for a bad leading byte.
            decoded = kReplacementChar;
            unitCount = 1;
            break;
         }
         decoded = (decoded << 6) | (trailing & 0x3f);
      }
   }

   if(unitsWalked != NULL)
      *unitsWalked = unitCount;

   // codepoints in the surrogate range are illegal, and should be replaced.
   if(isSurrogateRange(decoded))
      decoded = kReplacementChar;

   // codepoints outside the Basic Multilingual Plane were read correctly (so the
   // byte stream stays in sync) but are replaced here because the UTF16 string
   // classes do not want to deal with them.
   if(isAboveBMP(decoded))
      decoded = kReplacementChar;

   PROFILE_END();
   return decoded;
}
//-----------------------------------------------------------------------------
// Decodes the single UTF-16 encoded codepoint that starts at codepoint[0].
//
// codepoint   : pointer to the first code unit of the sequence. codepoint[1] is
//               read only when codepoint[0] is classified as a leading surrogate.
// unitsWalked : optional out-parameter; when non-NULL it receives the number of
//               code units consumed (2 for a well-formed surrogate pair, else 1).
// returns     : the decoded codepoint, or kReplacementChar for an unpaired
//               surrogate, a value in the surrogate range, or anything above the BMP.
const UTF32 oneUTF16toUTF32(const UTF16* codepoint, U32 *unitsWalked)
{
   PROFILE_START(oneUTF16toUTF32);

   // Start from the error result so every path, including an unexpected table
   // value, leaves both variables initialised.
   UTF32 ret = kReplacementChar;
   U32 unitCount = 1;

   const UTF16 codeunit1 = codepoint[0];

   // The table is indexed by the top 6 bits of the code unit:
   // 0 = ordinary unit, 1 = leading surrogate, 2 = trailing surrogate.
   const U8 expectedType = surrogateLUT[codeunit1 >> 10];
   switch(expectedType)
   {
      case 0: // simple
         ret = codeunit1;
         break;

      case 1: // 2 surrogates
      {
         const UTF16 codeunit2 = codepoint[1];
         if( surrogateLUT[codeunit2 >> 10] == 2)
         {
            // A surrogate pair carries 20 bits of (codepoint - 0x10000), so the
            // offset must be added back. Without it, plane-1 characters decode
            // to bogus BMP values (U+10000 would become 0) and slip past the
            // above-BMP filter below.
            const UTF32 high = static_cast<UTF32>(codeunit1 & byteMaskLow10);
            const UTF32 low  = static_cast<UTF32>(codeunit2 & byteMaskLow10);
            ret = 0x10000 + ((high << 10) | low);
            unitCount = 2;
         }
         // else: no trailing surrogate where one was expected; keep the
         // replacement character and consume a single unit.
         break;
      }

      default: // 2 (or anything unexpected): error
         // found a trailing surrogate where we expected a codepoint or
         // leading surrogate. Keep the replacement.
         break;
   }

   if(unitsWalked != NULL)
      *unitsWalked = unitCount;

   // codepoints in the surrogate range are illegal, and should be replaced.
   if(isSurrogateRange(ret))
      ret = kReplacementChar;

   // codepoints outside the Basic Multilingual Plane add complexity to our UTF16 string classes,
   // we've read them correctly so they wont foul the byte stream,
   // but we kill them here to make sure they wont foul anything else
   // NOTE: these are perfectly legal codepoints, we just dont want to deal with them.
   if(isAboveBMP(ret))
      ret = kReplacementChar;

   PROFILE_END();
   return ret;
}
//-----------------------------------------------------------------------------
// Encodes one UTF-32 codepoint as a single UTF-16 code unit.
//
// codepoint : the codepoint to encode.
// returns   : the codepoint narrowed to UTF16, or kReplacementChar when it is
//             beyond U+10FFFF, in the surrogate range, or above the BMP.
const UTF16 oneUTF32toUTF16(const UTF32 codepoint)
{
   // found a codepoint outside the codeable UTF-16 range!
   // or, found an illegal codepoint!
   // U+10FFFF itself is the last legal codepoint, so the bound is exclusive.
   if(codepoint > 0x10FFFF || isSurrogateRange(codepoint))
      return kReplacementChar;

   // these are legal, we just dont want to deal with them.
   if(isAboveBMP(codepoint))
      return kReplacementChar;

   return (UTF16)codepoint;
}
//-----------------------------------------------------------------------------
// Encodes one UTF-32 codepoint as UTF-8 into the caller's buffer.
//
// codepoint            : the codepoint to encode. Surrogate-range and above-BMP
//                        values are encoded as kReplacementChar instead.
// threeByteCodeunitBuf : output buffer; between 1 and 3 code units are written
//                        starting at index 0. No terminator is written here.
// returns              : the number of code units written.
const U32 oneUTF32toUTF8(const UTF32 codepoint, UTF8 *threeByteCodeunitBuf)
{
   PROFILE_START(oneUTF32toUTF8);

   U32 bytecount = 0;
   U32 working = codepoint;

   //-----------------
   if(isSurrogateRange(working))  // found an illegal codepoint!
      working = kReplacementChar;

   if(isAboveBMP(working))        // these are legal, we just dont want to deal with them.
      working = kReplacementChar;

   //-----------------
   if( working < (1 << 7))        // codeable in 7 bits
      bytecount = 1;
   else if( working < (1 << 11))  // codeable in 11 bits
      bytecount = 2;
   else if( working < (1 << 16))  // codeable in 16 bits
      bytecount = 3;

   AssertISV( bytecount > 0, "Error converting to UTF-8 in oneUTF32toUTF8(). isAboveBMP() should have caught this!");

   //-----------------
   // The marker bits are derived from the payload mask: invert it and shift up
   // one. The arithmetic is done on an unsigned value so that no negative int
   // is ever left-shifted; the result is then narrowed back to 8 bits.
   U8 mask = byteMask8LUT[0];                                      // 0011 1111
   U8 marker = static_cast<U8>((~static_cast<U32>(mask)) << 1);    // 1000 0000

   // Process the low order bytes, shifting the codepoint down 6 each pass.
   for( int i = bytecount-1; i > 0; i--)
   {
      threeByteCodeunitBuf[i] = marker | (working & mask);
      working >>= 6;
   }

   // Process the 1st byte. filter based on the # of expected bytes.
   mask = byteMask8LUT[bytecount];
   marker = static_cast<U8>((~static_cast<U32>(mask)) << 1);
   threeByteCodeunitBuf[0] = marker | (working & mask);

   PROFILE_END();
   return bytecount;
}
//-----------------------------------------------------------------------------
// Counts the code units in a zero-terminated UTF-16 string.
//
// unistring : string to measure; it is read until a code unit equal to 0 is found.
// returns   : number of code units before the terminator.
const U32 dStrlen(const UTF16 *unistring)
{
   U32 count = 0;
   // Compare against the integer 0 rather than the pointer constant NULL:
   // the elements are code units, not pointers.
   while(unistring[count] != 0)
      count++;
   return count;
}
//-----------------------------------------------------------------------------
// Counts the code units in a zero-terminated UTF-32 string.
//
// unistring : string to measure; it is read until a code unit equal to 0 is found.
// returns   : number of code units before the terminator.
const U32 dStrlen(const UTF32 *unistring)
{
   U32 count = 0;
   // Compare against the integer 0 rather than the pointer constant NULL:
   // the elements are code units, not pointers.
   while(unistring[count] != 0)
      count++;
   return count;
}
/* alternate utf-8 decode impl for speed, no error checking,
left here for your amusement:
U32 codeunit = codepoint + expectedByteCount - 1;
U32 i = 0;
switch(expectedByteCount)
{
case 6: ret |= ( *(codeunit--) & 0x3f ); i++;
case 5: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
case 4: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
case 3: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
case 2: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
case 1: ret |= *(codeunit) & byteMask8LUT[expectedByteCount] << (6 * i);
}
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -