📄 clasifyc.cpp
字号:
0x0C67, // TELUGU DIGIT ONE
0x0C68, // TELUGU DIGIT TWO
0x0C69, // TELUGU DIGIT THREE
0x0C6A, // TELUGU DIGIT FOUR
0x0C6B, // TELUGU DIGIT FIVE
0x0C6C, // TELUGU DIGIT SIX
0x0C6D, // TELUGU DIGIT SEVEN
0x0C6E, // TELUGU DIGIT EIGHT
0x0C6F, // TELUGU DIGIT NINE
0x0CE6, // KANNADA DIGIT ZERO
0x0CE7, // KANNADA DIGIT ONE
0x0CE8, // KANNADA DIGIT TWO
0x0CE9, // KANNADA DIGIT THREE
0x0CEA, // KANNADA DIGIT FOUR
0x0CEB, // KANNADA DIGIT FIVE
0x0CEC, // KANNADA DIGIT SIX
0x0CED, // KANNADA DIGIT SEVEN
0x0CEE, // KANNADA DIGIT EIGHT
0x0CEF, // KANNADA DIGIT NINE
0x0D66, // MALAYALAM DIGIT ZERO
0x0D67, // MALAYALAM DIGIT ONE
0x0D68, // MALAYALAM DIGIT TWO
0x0D69, // MALAYALAM DIGIT THREE
0x0D6A, // MALAYALAM DIGIT FOUR
0x0D6B, // MALAYALAM DIGIT FIVE
0x0D6C, // MALAYALAM DIGIT SIX
0x0D6D, // MALAYALAM DIGIT SEVEN
0x0D6E, // MALAYALAM DIGIT EIGHT
0x0D6F, // MALAYALAM DIGIT NINE
0x0E50, // THAI DIGIT ZERO
0x0E51, // THAI DIGIT ONE
0x0E52, // THAI DIGIT TWO
0x0E53, // THAI DIGIT THREE
0x0E54, // THAI DIGIT FOUR
0x0E55, // THAI DIGIT FIVE
0x0E56, // THAI DIGIT SIX
0x0E57, // THAI DIGIT SEVEN
0x0E58, // THAI DIGIT EIGHT
0x0E59, // THAI DIGIT NINE
0x0ED0, // LAO DIGIT ZERO
0x0ED1, // LAO DIGIT ONE
0x0ED2, // LAO DIGIT TWO
0x0ED3, // LAO DIGIT THREE
0x0ED4, // LAO DIGIT FOUR
0x0ED5, // LAO DIGIT FIVE
0x0ED6, // LAO DIGIT SIX
0x0ED7, // LAO DIGIT SEVEN
0x0ED8, // LAO DIGIT EIGHT
0x0ED9, // LAO DIGIT NINE
0xFF10, // FULLWIDTH DIGIT ZERO
0xFF11, // FULLWIDTH DIGIT ONE
0xFF12, // FULLWIDTH DIGIT TWO
0xFF13, // FULLWIDTH DIGIT THREE
0xFF14, // FULLWIDTH DIGIT FOUR
0xFF15, // FULLWIDTH DIGIT FIVE
0xFF16, // FULLWIDTH DIGIT SIX
0xFF17, // FULLWIDTH DIGIT SEVEN
0xFF18, // FULLWIDTH DIGIT EIGHT
0xFF19, // FULLWIDTH DIGIT NINE
0x3007, // IDEOGRAPHIC NUMBER ZERO
0x3021, // HANGZHOU NUMERAL ONE
0x3022, // HANGZHOU NUMERAL TWO
0x3023, // HANGZHOU NUMERAL THREE
0x3024, // HANGZHOU NUMERAL FOUR
0x3025, // HANGZHOU NUMERAL FIVE
0x3026, // HANGZHOU NUMERAL SIX
0x3027, // HANGZHOU NUMERAL SEVEN
0x3028, // HANGZHOU NUMERAL EIGHT
0x3029, // HANGZHOU NUMERAL NINE
0
};
// Kinsoku category 13: unit symbol group.
// The table holds only its terminator; BatchKinsokuClassify assigns this
// category from the C3_SYMBOL bit reported by GetStringTypeEx instead.
const WCHAR set13[] = { 0 };
// Kinsoku category 14: Roman inter-word spacing characters.
// Zero-terminated, like the other category tables.
const WCHAR set14[] = {
	0x0009, 0x0020,			// TAB, SPACE
	0x2002, 0x2003,			// EN SPACE, EM SPACE
	0x2004, 0x2005, 0x2006,	// THREE-PER-EM, FOUR-PER-EM, SIX-PER-EM SPACE
	0x2007, 0x2008,			// FIGURE SPACE, PUNCTUATION SPACE
	0x2009, 0x200A,			// THIN SPACE, HAIR SPACE
	0x200B,					// ZERO WIDTH SPACE
	WCH_EMBEDDING,			// OBJECT EMBEDDING (0xFFFC)
	0						// end of table
};
// Kinsoku category 15: Roman characters.
// The table holds only its terminator; BatchKinsokuClassify uses this
// category as the fallback for characters that match nothing else after
// consulting GetStringTypeEx.
const WCHAR set15[] = { 0 };
// All Kinsoku category tables in category order: entry N is the
// zero-terminated character list for category N. InitKinsokuClassify
// iterates over this array to build the lookup structure.
const LPCWSTR charCategories[] = {
	set0,	set1,	set2,	set3,
	set4,	set5,	set6,	set7,
	set8,	set9,	set10,	set11,
	set12,	set13,	set14,	set15
};
static const INT classifyChunkSize = 64;
static const INT indexSize = 65536 / classifyChunkSize;
static const INT classifyBitMapSize = indexSize / 8;
static const INT totalKinsokuCategories = 16;
static const INT bitmapShift = 6; // 16 - log(indexSize)/log(2)
typedef struct {
CHAR classifications[classifyChunkSize]; // must be unsigned bytes!
} ClassifyChunk;
static ClassifyChunk *classifyData; // Chunk array, sparse chrs
static BYTE *classifyIndex; // Indexes into chunk array
/*
 *	BOOL InitKinsokuClassify()
 *
 *	@func
 *		Map the static character tables into a compact array for
 *		quick lookup of a character's Kinsoku classification.
 *
 *	@comm
 *		Kinsoku classification is necessary for word breaking and
 *		may be necessary for proportional line layout, Kinsoku style.
 *
 *	@devnote
 *		We break the entire Unicode range into chunks of characters.
 *		Not all of the chunks will have data in them. We do not
 *		maintain information on empty chunks, therefore we create
 *		a compact, contiguous array of chunks for only the chunks
 *		that do contain information. We prepend 1 empty chunk to the
 *		beginning of this array, where all of the empty chunks map to;
 *		this prevents a conditional test on NULL data.
 *
 *		Every slot of every chunk is first filled with -1, then the
 *		zero-based category index (the position of the table in
 *		charCategories) is stored for each listed character. A lookup
 *		therefore yields -1 for any character not in the tables, and
 *		the client needs to process such a character further.
 *
 *		Chunk numbers are stored in a BYTE index, so the number of
 *		chunks (including the empty chunk 0) must not exceed 256.
 *
 *	@rdesc
 *		Returns TRUE if we successfully created the lookup table,
 *		FALSE if either allocation failed.
 */
BOOL InitKinsokuClassify()
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "InitKinsokuClassify");

	WORD	bitMapKey;								// For calcing total chunks
	BYTE	bitData;								// For calcing total chunks
	WCHAR	ch;
	LPCWSTR pWChar;									// Looping over char sets.
	INT		i, j, count;							// Loop support.
	BYTE	classifyBitMap[classifyBitMapSize],		// Temp bitmap.
			*pIndex;								// Index into chunk array.

	// See how many chunks we'll need. We loop over all of the special
	// characters and set one bit per chunk that contains at least one.
	ZeroMemory(classifyBitMap, sizeof(classifyBitMap));
	for (i = 0; i < totalKinsokuCategories; i++ )
	{
		pWChar = charCategories[i];
		while ( (ch = *pWChar++) != 0 )				// Tables are zero-terminated.
		{
			bitMapKey = ch >> bitmapShift;
			classifyBitMap[bitMapKey >> 3] |= 1 << (bitMapKey & 7);
		}
	}

	// Now that we know how many chunks we'll need, allocate the memory.
	// The extra 1 is the shared empty chunk at position 0.
	// NOTE(review): CountMatchingBits is presumably a population count of the
	// bits set in both arguments; passing the same bitmap twice then counts
	// its set bits -- confirm against the helper.
	count = 1 + CountMatchingBits((DWORD *)classifyBitMap, (DWORD *)classifyBitMap, sizeof(classifyBitMap)/sizeof(DWORD));

	// Chunk numbers 1..count-1 are written into BYTE index entries below;
	// anything above 255 would be silently truncated.
	Assert( count <= 256 );

	classifyData = (ClassifyChunk *) PvAlloc( sizeof(ClassifyChunk) * count, GMEM_ZEROINIT);
	classifyIndex = (BYTE *) PvAlloc( sizeof(BYTE) * indexSize, GMEM_ZEROINIT);

	// We failed if we did not get the memory.
	if ( !classifyData || !classifyIndex )
		return FALSE;								// FAILED.

	// Set Default missing value.
	// NOTE - We are actually using fumemset instead of FillMemory api.
	// In fumemset, the 2nd and 3rd params are different than those of FillMemory()
	FillMemory( classifyData, -1, sizeof(ClassifyChunk) * count );

	// Init the pointers to the chunks, which are really just indexes into
	// a contiguous block of memory -- a one-based array of chunks. Index
	// entries for chunks without data keep the value 0 they were allocated
	// with and so refer to the empty chunk.
	pIndex = classifyIndex;
	count = 1;										// 1 based array.
	for (i = 0; i < classifyBitMapSize; i++ )		// Loop over all bytes.
	{
		bitData = classifyBitMap[i];				// Get the bitmap data.
		for (j = 0; j < 8; j++, bitData >>= 1, pIndex++)	// For each bit in the byte
		{
			if(bitData & 1)
				*pIndex = count++;					// We used a chunk.
		}
	}

	// Store the classification of each character.
	// Note: the stored value is the zero-based category index; slots that
	// are never written keep the -1 fill value, meaning "not classified".
	for (i = 0; i < totalKinsokuCategories; i++ )
	{
		pWChar = charCategories[i];					// Loop over all chars in
		while ( (ch = *pWChar++) != 0 )				// category.
		{
			bitMapKey = ch >> bitmapShift;
			Assert( classifyIndex[bitMapKey] > 0 );
			Assert( classifyIndex[bitMapKey] < count );

			classifyData[classifyIndex[bitMapKey]].
				classifications[ ch & ( classifyChunkSize-1 )] = i;
		}
	}
	return TRUE;									// Successfully created.
}
/*
 *	UninitKinsokuClassify()
 *
 *	@func
 *		Release the lookup tables allocated by InitKinsokuClassify.
 *
 *	@devnote
 *		The globals are reset to NULL after being freed so that they
 *		never hold dangling pointers; a repeated call then passes NULL
 *		to FreePv rather than an already-freed block.
 */
void UninitKinsokuClassify()
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "UninitKinsokuClassify");

	FreePv(classifyData);
	classifyData = NULL;

	FreePv(classifyIndex);
	classifyIndex = NULL;
}
/*
 *	KinsokuClassify(ch)
 *
 *	@func
 *		Look up the Kinsoku classification recorded for a character
 *		by InitKinsokuClassify.
 *
 *	@comm
 *		The high-order bits of ch select an entry of classifyIndex,
 *		which names a chunk in classifyData. The low-order bits of ch
 *		select the slot for that character inside the chunk, next to
 *		the slots of its neighbouring characters.
 *
 *	@devnote
 *		The tables are built so that every index entry names a valid
 *		chunk; the lookup is therefore two array accesses with no
 *		conditionals.
 *
 *		The routine is inline to avoid call overhead, and static
 *		because it only knows about characters from the tables; i.e.,
 *		it does NOT classify all Unicode characters.
 *
 *	@rdesc
 *		The value stored for ch: the category index written by
 *		InitKinsokuClassify, or the fill value it left in place for a
 *		character that is in none of the tables (BatchKinsokuClassify
 *		treats a negative result as "not classified").
 */
static inline INT
KinsokuClassify(
	WCHAR ch )	// @parm char to classify.
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "KinsokuClassify");

	const INT iChunk = classifyIndex[ ch >> bitmapShift ];		// which chunk
	const INT iSlot  = ch & ( classifyChunkSize - 1 );			// where in the chunk

	return classifyData[ iChunk ].classifications[ iSlot ];
}
/*
* BatchKinsokuClassify (pch, cch, outType3, kinsokuClassifications)
*
* @func
* Kinsoku classify each character of the given string.
*
* @comm
* The Kinsoku classifications are passed to the CanBreak() routine. We
* do process in batch to save on overhead.
*
* If the character is not in the Kinsoku classification tables then
* GetStringTypeEx is used to classify any remaining character.
*
* @rdesc
* Result in out param kinsokuClassifications.
* outType3 if caller wants the result to GetStringTypeEx
*/
void BatchKinsokuClassify (
const WCHAR *pch, // @parm char string.
INT cch, // @parm number of chars in string.
WORD *outType3, // @parm if caller wants result of GetStringTypeEx
INT * kinsokuClassifications ) // @parm Result of the classifications.
{
TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "BatchKinsokuClassify");
INT iCategory;
WORD wRes[MAX_CLASSIFY_CHARS], *pcType3, cType3;
Assert( cch < MAX_CLASSIFY_CHARS );
Assert( pch );
Assert( kinsokuClassifications );
pcType3 = ( NULL != outType3 ) ? outType3 : wRes;
if (FALSE == W32->GetStringTypeEx(0, CT_CTYPE3, pch, cch, pcType3)) // In batch...
{
AssertSz(0, "W32->GetStringTypeEx failed.");
return;
}
while ( cch-- ) // For all ch...
{
WCHAR ch = *pch++;
if ( IsKorean( ch ) )
iCategory = 11;
else
{
iCategory = KinsokuClassify(ch);
if ( iCategory < 0) // If not classified
{ // then it is one of..
cType3 = *pcType3;
if ( cType3 & C3_SYMBOL )
iCategory = 13; // symbol chars,
else if ( cType3 & (C3_KATAKANA | C3_HIRAGANA | C3_IDEOGRAPH) )
iCategory = 11; // ideographic chars,
else
{
iCategory = 15; // all other chars.
}
}
}
pcType3++;
*kinsokuClassifications++ = iCategory;
}
}
/*
 *	CanBreak(class1, class2)
 *
 *	@func
 *		Look into the truth table to see if two consecutive characters
 *		can have a line break between them.
 *
 *	@comm
 *		This determines whether two successive characters can break a line.
 *		The matrix is taken from JIS X4051. It has one row per Kinsoku
 *		classification of the first character and one bit per Kinsoku
 *		classification of the following character.
 *
 *	@devnote
 *		The table has 16 rows, indexed 0..15 by class1; bit class2 (0..15)
 *		of the selected row is the answer. A class outside 0..15 would
 *		index past the table or shift by an invalid amount, so such input
 *		is rejected and reported as "cannot break".
 *
 *	@rdesc
 *		Returns TRUE if the characters can be broken across a line.
 */
BOOL CanBreak(
	INT class1,		//@parm	Kinsoku classification of character #1
	INT class2 )	//@parm	Kinsoku classification of following character.
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "CanBreak");

	static const WORD br[16] = {//   fedc ba98 7654 3210
		0x0000,					// 0 0000 0000 0000 0000
		0x0000,					// 1 0000 0000 0000 0000
		0xfd82,					// 2 1111 1101 1000 0010
		0xfd82,					// 3 1111 1101 1000 0010
		0xfd82,					// 4 1111 1101 1000 0010
		0xfd82,					// 5 1111 1101 1000 0010
		0x6d82,					// 6 0110 1101 1000 0010
		0xfd02,					// 7 1111 1101 0000 0010
		0x0000,					// 8 0000 0000 0000 0000
		0xfd82,					// 9 1111 1101 1000 0010
		0xfd83,					// a 1111 1101 1000 0011
		0xfd82,					// b 1111 1101 1000 0010
		0x6d82,					// c 0110 1101 1000 0010
		0x5d82,					// d 0101 1101 1000 0010
		0xfd83,					// e 1111 1101 1000 0011
		0x4d82,					// f 0100 1101 1000 0010
	};
	const INT cClasses = sizeof(br) / sizeof(br[0]);

	// Guard the array index and the shift count against unclassified or
	// otherwise invalid input.
	if ( class1 < 0 || class1 >= cClasses || class2 < 0 || class2 >= cClasses )
		return 0;				// cannot break

	return (br[class1] >> class2) & 1;
}
/*
 *	IsURLDelimiter(ch)
 *
 *	@func
 *		Decide whether a character delimits a URL. Delimiters are the
 *		punctuation characters of Kinsoku sets 0, 1, 2, 4, 5, and 6,
 *		plus < and >, which we treat as brackets rather than "less" or
 *		"greater" signs. The exception is "/" (in set 6): it is part of
 *		a URL, not a delimiter.
 *
 *		Any character above 255 is reported as a delimiter without
 *		consulting the tables.
 *
 *	@comm This function is used in URL detection
 *
 *	@rdesc
 *		Returns TRUE if the character is a URL delimiter.
 */
BOOL IsURLDelimiter(
	WCHAR ch)
{
	if (ch > 255)
		return TRUE;

	const INT iset = KinsokuClassify(ch);

	// Sets 0..2 always delimit.
	if (IN_RANGE(0, iset, 2))
		return TRUE;

	// Sets 4..6 delimit too, except for the slash.
	if (IN_RANGE(4, iset, 6) && ch != L'/')
		return TRUE;

	// Angle brackets delimit regardless of their set.
	return (ch == L'<') || (ch == L'>');
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -