⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 clasifyc.cpp

📁 Windows CE 6.0 Word Application 源码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
	0x0C67, // TELUGU DIGIT ONE
	0x0C68, // TELUGU DIGIT TWO
	0x0C69, // TELUGU DIGIT THREE
	0x0C6A, // TELUGU DIGIT FOUR
	0x0C6B, // TELUGU DIGIT FIVE
	0x0C6C, // TELUGU DIGIT SIX
	0x0C6D, // TELUGU DIGIT SEVEN
	0x0C6E, // TELUGU DIGIT EIGHT
	0x0C6F, // TELUGU DIGIT NINE
	0x0CE6, // KANNADA DIGIT ZERO
	0x0CE7, // KANNADA DIGIT ONE
	0x0CE8, // KANNADA DIGIT TWO
	0x0CE9, // KANNADA DIGIT THREE
	0x0CEA, // KANNADA DIGIT FOUR
	0x0CEB, // KANNADA DIGIT FIVE
	0x0CEC, // KANNADA DIGIT SIX
	0x0CED, // KANNADA DIGIT SEVEN
	0x0CEE, // KANNADA DIGIT EIGHT
	0x0CEF, // KANNADA DIGIT NINE
	0x0D66, // MALAYALAM DIGIT ZERO
	0x0D67, // MALAYALAM DIGIT ONE
	0x0D68, // MALAYALAM DIGIT TWO
	0x0D69, // MALAYALAM DIGIT THREE
	0x0D6A, // MALAYALAM DIGIT FOUR
	0x0D6B, // MALAYALAM DIGIT FIVE
	0x0D6C, // MALAYALAM DIGIT SIX
	0x0D6D, // MALAYALAM DIGIT SEVEN
	0x0D6E, // MALAYALAM DIGIT EIGHT
	0x0D6F, // MALAYALAM DIGIT NINE
	0x0E50, // THAI DIGIT ZERO
	0x0E51, // THAI DIGIT ONE
	0x0E52, // THAI DIGIT TWO
	0x0E53, // THAI DIGIT THREE
	0x0E54, // THAI DIGIT FOUR
	0x0E55, // THAI DIGIT FIVE
	0x0E56, // THAI DIGIT SIX
	0x0E57, // THAI DIGIT SEVEN
	0x0E58, // THAI DIGIT EIGHT
	0x0E59, // THAI DIGIT NINE
	0x0ED0, // LAO DIGIT ZERO
	0x0ED1, // LAO DIGIT ONE
	0x0ED2, // LAO DIGIT TWO
	0x0ED3, // LAO DIGIT THREE
	0x0ED4, // LAO DIGIT FOUR
	0x0ED5, // LAO DIGIT FIVE
	0x0ED6, // LAO DIGIT SIX
	0x0ED7, // LAO DIGIT SEVEN
	0x0ED8, // LAO DIGIT EIGHT
	0x0ED9, // LAO DIGIT NINE
	0xFF10, // FULLWIDTH DIGIT ZERO
	0xFF11, // FULLWIDTH DIGIT ONE
	0xFF12, // FULLWIDTH DIGIT TWO
	0xFF13, // FULLWIDTH DIGIT THREE
	0xFF14, // FULLWIDTH DIGIT FOUR
	0xFF15, // FULLWIDTH DIGIT FIVE
	0xFF16, // FULLWIDTH DIGIT SIX
	0xFF17, // FULLWIDTH DIGIT SEVEN
	0xFF18, // FULLWIDTH DIGIT EIGHT
	0xFF19, // FULLWIDTH DIGIT NINE

	0x3007, // IDEOGRAPHIC NUMBER ZERO
	0x3021, // HANGZHOU NUMERAL ONE
	0x3022, // HANGZHOU NUMERAL TWO
	0x3023, // HANGZHOU NUMERAL THREE
	0x3024, // HANGZHOU NUMERAL FOUR
	0x3025, // HANGZHOU NUMERAL FIVE
	0x3026, // HANGZHOU NUMERAL SIX
	0x3027, // HANGZHOU NUMERAL SEVEN
	0x3028, // HANGZHOU NUMERAL EIGHT
	0x3029, // HANGZHOU NUMERAL NINE
	0
};

// Characters included in unit symbol group
const WCHAR set13[] = {
	0		//we use GetStringTypeEx
};

//Roman inter-word space
const WCHAR set14[] = {
	0x0009,	// TAB
	0x0020, // SPACE
	0x2002, // EN SPACE
	0x2003, // EM SPACE
	0x2004, // THREE-PER-EM SPACE
	0x2005, // FOUR-PER-EM SPACE
	0x2006, // SIX-PER-EM SPACE
	0x2007, // FIGURE SPACE
	0x2008, // PUNCTUATION SPACE
	0x2009, // THIN SPACE
	0x200A, // HAIR SPACE
	0x200B,  // ZERO WIDTH SPACE
	WCH_EMBEDDING, // OBJECT EMBEDDING (0xFFFC)
	0
};

// Roman characters
const WCHAR set15[] = {
	0		//we use GetStringTypeEx
};

// So we can easily loop over all Kinsoku categories.
const LPCWSTR charCategories[] = {
	set0,
	set1,
	set2,
	set3,
	set4,
	set5,
	set6,
	set7,
	set8,
	set9,
	set10,
	set11,
	set12,
	set13,
	set14,
	set15
};

static const INT classifyChunkSize = 64;
static const INT indexSize = 65536 / classifyChunkSize;
static const INT classifyBitMapSize = indexSize / 8;
static const INT totalKinsokuCategories = 16;
static const INT bitmapShift = 6; // 16 - log(indexSize)/log(2)

typedef struct {
	CHAR classifications[classifyChunkSize];		// must be unsigned bytes!
} ClassifyChunk;

static ClassifyChunk *classifyData;					// Chunk array, sparse chrs
static BYTE *classifyIndex;							// Indexes into chunk array


/*
 *	BOOL InitKinsokuClassify()
 *
 *	@func
 *		Map the static character tables into a compact array for
 *		quick lookup of the characters Kinsoku classification.
 *
 *	@comm
 *		Kinsoku classification is necessary for word breaking and
 *		may be neccessary for proportional line layout, Kinsoku style.
 *
 *	@devnote
 *		We break the entire Unicode range in to chunks of characters.
 *		Not all of the chunks will have data in them. We do not
 *		maintain information on empty chunks, therefore we create
 *		a compact, contiguous array of chunks for only the chunks
 *		that do contain information. We prepend 1 empty chunk to the
 *		beginning of this array, where all of the empty chunks map to,
 *		this prevents a contiontional test on NULL data. The lookup
 *		will return 0 for any character not in the tables, so the client
 *		will then need to process the character further in such cases.
 *
 *	@rdesc
 *		return TRUE if we successfully created the lookup table.
 */
BOOL InitKinsokuClassify()
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "InitKinsokuClassify");

	WORD	bitMapKey;								// For calcing total chunks
	BYTE	bitData;								// For calcing total chunks
	WCHAR	ch;
	LPCWSTR pWChar;									// Looping over char sets.
	INT		i, j, count;							// Loop support.
	BYTE	classifyBitMap[classifyBitMapSize],		// Temp bitmap.
			*pIndex;								// Index into chunk array.

	// See how many chunks we'll need. We loop over all of the special
	//  characters
	ZeroMemory(classifyBitMap, sizeof(classifyBitMap));
	for (i = 0; i < totalKinsokuCategories; i++ )
	{
		pWChar = charCategories[i];
		while ( ch = *pWChar++ )
		{
			bitMapKey = ch >> bitmapShift;
			classifyBitMap[bitMapKey >> 3] |= 1 << (bitMapKey & 7);
		}
	}

	// Now that we know how many chunks we'll need, allocate the memory.
	count = 1 + CountMatchingBits((DWORD *)classifyBitMap, (DWORD *)classifyBitMap, sizeof(classifyBitMap)/sizeof(DWORD));
	classifyData = (ClassifyChunk *) PvAlloc( sizeof(ClassifyChunk) * count, GMEM_ZEROINIT);
	classifyIndex = (BYTE *) PvAlloc( sizeof(BYTE) * indexSize, GMEM_ZEROINIT);

	// We failed if we did not get the memory.
	if ( !classifyData || !classifyIndex )
		return FALSE;								// FAILED.

	// Set Default missing value.
	// NOTE - We are actually using fumemset instead of FillMemory api.
	// In fumemset, the 2nd and 3rd params are different than those of FillMemory()
	FillMemory( classifyData, -1, sizeof(ClassifyChunk) * count );  

	// Init the pointers to the chunks, which are really just indexes into
	//  a contiguous block of memory -- an one-based array of chunks.
	pIndex = classifyIndex;
	count = 1;										// 1 based array.
	for (i = 0; i < sizeof(classifyBitMap); i++ )	// Loop over all bytes.
	{												// Get the bitmap data.
		bitData = classifyBitMap[i];				// For each bit in the byte
		for (j = 0; j < 8; j++, bitData >>= 1, pIndex++)
		{
			if(bitData & 1)			
				*pIndex = count++;					// We used a chunk.
		}
	}
	
	// Store the classifications of each character.
	// Note: classifications are 1 based, a zero value
	//  means the category was not set.
	for (i = 0; i < totalKinsokuCategories; i++ )
	{
		pWChar = charCategories[i];					// Loop over all chars in
		while ( ch = *pWChar++ )					//  category.
		{
			bitMapKey = ch >> bitmapShift;
			Assert( classifyIndex[bitMapKey] > 0 );
			Assert( classifyIndex[bitMapKey] < count );

			classifyData[classifyIndex[bitMapKey]].
				classifications[ ch & ( classifyChunkSize-1 )] = i;
		}
	}
	return TRUE;									// Successfully created.
}

void UninitKinsokuClassify()
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "UninitKinsokuClassify");

	FreePv(classifyData);
	FreePv(classifyIndex);
}

/*
 *	KinsokuClassify(ch)
 *
 *	@func
 *		Kinsoku classify the character iff it was a given from
 *		one of the classification tables.
 *
 *	@comm
 *		Hi order bits of ch are used to get an index value used to index
 *		into an array of chunks. Each chunk contains the classifications
 *		for that character as well as some number of characters adjacent
 *		to that character. The low order bits are used to index into
 *		the chunk of adjacent characters.
 *
 *	@devnote
 *		Because of the way we constructed the array, all that we need to
 *		do is look up the data; no conditionals necessary.
 *
 *		The routine is inline to avoid the call overhead. It is static
 *		because it only returns characters from the tables; i.e., this 
 *		routine does NOT classify all Unicode characters.
 *
 *	@rdesc
 *		Returns the classification.
 */
static inline INT
KinsokuClassify(
	WCHAR ch )	// @parm char to classify.
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "KinsokuClassify");

	return classifyData[ classifyIndex[ ch >> bitmapShift ] ].
			classifications[ ch & ( classifyChunkSize-1 )];
}

/*
 *	BatchKinsokuClassify (pch, cch, outType3, kinsokuClassifications)
 *
 *	@func
 *		Kinsoku classify each character of the given string.
 *
 *	@comm
 *		The Kinsoku classifications are passed to the CanBreak() routine. We
 *		do process in batch to save on overhead.
 *
 *		If the character is not in the Kinsoku classification tables then
 *		GetStringTypeEx is used to classify any remaining character.
 *
 *	@rdesc
 *		Result in out param kinsokuClassifications.
 *		outType3 if caller wants the result to GetStringTypeEx
 */
void BatchKinsokuClassify (
	const WCHAR *pch,	// @parm char string.
	INT	  cch,			// @parm number of chars in string.
	WORD *outType3,		// @parm if caller wants result of GetStringTypeEx
	INT * kinsokuClassifications )	// @parm Result of the classifications.
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "BatchKinsokuClassify");

	INT			iCategory;

	WORD		wRes[MAX_CLASSIFY_CHARS], *pcType3, cType3;

	Assert( cch < MAX_CLASSIFY_CHARS );
	Assert( pch );
	Assert( kinsokuClassifications );

	pcType3 = ( NULL != outType3 ) ? outType3 : wRes;
	if (FALSE == W32->GetStringTypeEx(0, CT_CTYPE3, pch, cch, pcType3)) // In batch...
	{
	    AssertSz(0, "W32->GetStringTypeEx failed.");
	    return;
	}    

	while ( cch-- )										// For all ch...
	{
		WCHAR	ch = *pch++; 
		if ( IsKorean( ch ) )
			iCategory = 11;									
		else
		{
			iCategory = KinsokuClassify(ch);
			if ( iCategory < 0)								// If not classified
			{												//  then it is one of..
				cType3 = *pcType3;
				if ( cType3 & C3_SYMBOL )
					iCategory = 13;							//  symbol chars,
				else if ( cType3 & (C3_KATAKANA | C3_HIRAGANA | C3_IDEOGRAPH) )
					iCategory = 11;							//  ideographic chars,
				else 
				{
					iCategory = 15;							//  all other chars.
				}
			}
		}
		pcType3++;
		*kinsokuClassifications++ = iCategory;
	}
}

/*
 *	CanBreak(class1, class2)
 *
 *	@func
 *		Look into the truth table to see if two consecutive charcters
 *		can have a line break between them.
 *
 *	@comm
 *		This determines whether two successive characters can break a line.
 *		The matrix is taken from JIS X4051 and is based on categorizing
 *		characters into 15 classifications.
 *
 *	@devnote
 *		The table is 1 based.
 *
 *	@rdesc
 *		Returns TRUE if the characters can be broken across a line.
 */
BOOL CanBreak(
	INT class1,		//@parm	Kinsoku classification of character #1
	INT class2 )	//@parm	Kinsoku classification of following character.
{
	TRACEBEGIN(TRCSUBSYSFE, TRCSCOPEINTERN, "CanBreak");

	static const WORD br[16] = {//   fedc ba98 7654 3210
		0x0000,					// 0 0000 0000 0000 0000
		0x0000,					// 1 0000 0000 0000 0000
		0xfd82,					// 2 1111 1101 1000 0010
		0xfd82,					// 3 1111 1101 1000 0010
		0xfd82,					// 4 1111 1101 1000 0010
		0xfd82,					// 5 1111 1101 1000 0010
		0x6d82,					// 6 0110 1101 1000 0010
		0xfd02,					// 7 1111 1101 0000 0010
		0x0000,					// 8 0000 0000 0000 0000
		0xfd82,					// 9 1111 1101 1000 0010
		0xfd83,					// a 1111 1101 1000 0011
		0xfd82,					// b 1111 1101 1000 0010
		0x6d82,					// c 0110 1101 1000 0010
		0x5d82,					// d 0101 1101 1000 0010
		0xfd83,					// e 1111 1101 1000 0011
		0x4d82,					// f 0100 1101 1000 0010
	};
	return (br[class1] >> class2) & 1;
}

/*
 *	IsURLDelimiter(ch)
 *
 *	@func
 *		Punctuation characters are those of sets 0, 1, 2, 4, 5, and 6,
 *		and < or > which we consider to be brackets, not "less" or
 *      "greater" signs. On the other hand; "/" (in set 6) should not be
 *		a delimiter, but rather a part of the URL.
 *
 *	@comm This function is used in URL detection
 *
 *	@rdesc
 *		Returns TRUE if the character is a punctuation mark.
 */
BOOL IsURLDelimiter(
	WCHAR ch)
{
	if (ch > 255)
		return TRUE;

	INT iset = KinsokuClassify(ch);
 
	return IN_RANGE(0, iset, 2) || IN_RANGE(4, iset, 6) && ch != L'/' ||
		   ch == L'<' || ch == L'>';
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -