📄 jclunicode.pas
字号:
// Character category data is quite a large block since every defined character in Unicode is assigned at least
// one category. Because of this we cannot use a sparse matrix to provide quick access as implemented for
// e.g. composition data.
// The approach used here is based on the fact that an application seldom uses all characters defined in Unicode
// simultaneously. In fact the opposite is true. Most applications will use either Western European or Arabic or
// Far East character data, but very rarely all of them together. Based on this fact is the implementation of virtual
// memory using the system's paging file (aka file mapping) to load into virtual memory only what is currently used.
// The implementation is not yet finished and still needs a lot of improvement.
type
  // start and stop of a range of code points; both bounds are inclusive
  // (see the "for K := Start to Stop" loop in LoadCharacterCategories)
  TRange = record
    Start,
    Stop: Cardinal;
  end;
  TRangeArray = array of TRange;
  TCategoriesArray = array of TCharacterCategories;

var
  // True once LoadCharacterCategories has run; guards lazy loading
  CategoriesLoaded: Boolean;
  // Character categories, organized as a sparse two-stage matrix: the first
  // stage is indexed by the high byte of a (BMP) code point, the second stage
  // array (allocated on demand, 256 entries) by the low byte.
  Categories: array [Byte] of TCategoriesArray;
procedure LoadCharacterCategories;
// Loads the character category data (as saved by the Unicode database extractor, see also
// the comments about JclUnicode.res above) from the 'CATEGORIES'/'UNICODEDATA' resource
// linked into this module and fills the two-stage Categories matrix.
// The stream layout per record is: 1 byte category tag, 4 bytes range count, then that
// many TRange records; records repeat until the stream is exhausted.
// The LoadInProgress critical section serializes loading against other initializers.
var
  Size: Integer;
  Stream: TResourceStream;
  Category: TCharacterCategory;
  Buffer: TRangeArray;
  First,
  Second: Byte;
  J, K: Integer;
begin
  // Data already loaded?
  if not CategoriesLoaded then
  begin
    // make sure no other code is currently modifying the global data area
    LoadInProgress.Enter;
    try
      // set the flag before actually loading so re-entrant calls do not retry
      CategoriesLoaded := True;
      Stream := TResourceStream.Create(HInstance, 'CATEGORIES', 'UNICODEDATA');
      try
        while Stream.Position < Stream.Size do
        begin
          // a) read which category is current in the stream
          // NOTE(review): only one byte is read into Category — this assumes the
          // enum's ordinal fits into (and is stored in) the low byte; confirm the
          // extractor writes single-byte tags.
          Stream.ReadBuffer(Category, 1);
          // b) read the size of the ranges and the ranges themselves
          Stream.ReadBuffer(Size, 4);
          if Size > 0 then
          begin
            SetLength(Buffer, Size);
            Stream.ReadBuffer(Buffer[0], Size * SizeOf(TRange));
            // c) go through every range and add the current category to each code point
            for J := 0 to Size - 1 do
              for K := Buffer[J].Start to Buffer[J].Stop do
              begin
                // only the BMP ($0000..$FFFF) is tabulated; ignore the rest of a
                // range once it leaves the BMP
                if K > $FFFF then
                  Break;
                First := (K shr 8) and $FF;
                Second := K and $FF;
                // add second stage array if not yet done
                if Categories[First] = nil then
                  SetLength(Categories[First], 256);
                Include(Categories[First, Second], Category);
              end;
          end;
        end;
      finally
        Stream.Free;
      end;
    finally
      LoadInProgress.Leave;
    end;
  end;
end;
function CategoryLookup(Code: Cardinal; Cats: TCharacterCategories): Boolean; overload;
// Returns True if Code belongs to at least one of the categories given in Cats.
// Triggers on-demand loading of the category tables on first use.
var
  HighByte,
  LowByte: Byte;
begin
  // make sure the category tables are available
  if not CategoriesLoaded then
    LoadCharacterCategories;
  HighByte := (Code shr 8) and $FF;
  LowByte := Code and $FF;
  Result := False;
  // a nil first stage entry means no character in this 256-code block carries
  // any category at all
  if Categories[HighByte] <> nil then
    Result := Categories[HighByte, LowByte] * Cats <> [];
end;
//----------------- support for case mapping -------------------------------------------------------
type
  // the four mappings of one code point: case fold, lower, title and upper, in this order
  TCase = array [0..3] of TUCS4Array;
  TCaseArray = array of TCase;

var
  // An array for all case mappings (including 1-to-many casing if saved by the
  // extraction program). The organization is a sparse, two-stage matrix indexed by
  // the high and low byte of a (BMP) code point.
  // SingletonMapping is a shared one-element buffer used by CaseLookup to quickly
  // return a single default mapping.
  CaseDataLoaded: Boolean;
  CaseMapping: array [Byte] of TCaseArray;
  SingletonMapping: TUCS4Array;
procedure LoadCaseMappingData;
// Loads the case mapping table from the 'CASE'/'UNICODEDATA' resource into the
// two-stage CaseMapping matrix. The stream starts with a 4-byte entry count; each
// entry is a 4-byte code point followed by four length-prefixed UCS4 arrays in the
// order: fold, lower, title, upper (a length of 0 means "no such mapping").
// Also allocates the shared one-element SingletonMapping buffer used by CaseLookup.
var
  Stream: TResourceStream;
  I, Code,
  Size: Cardinal;
  First,
  Second: Byte;
begin
  if not CaseDataLoaded then
  begin
    // make sure no other code is currently modifying the global data area
    LoadInProgress.Enter;
    try
      SetLength(SingletonMapping, 1);
      // set the flag before loading so re-entrant calls do not retry
      CaseDataLoaded := True;
      Stream := TResourceStream.Create(HInstance, 'CASE', 'UNICODEDATA');
      try
        // the first entry in the stream is the number of entries in the case mapping table
        Stream.ReadBuffer(Size, 4);
        // Note: Size is reused below for the per-entry array lengths. This is safe
        // because Delphi evaluates a for-loop's upper bound only once, on entry.
        for I := 0 to Size - 1 do
        begin
          // a) read actual code point
          Stream.ReadBuffer(Code, 4);
          // only BMP characters are expected to carry case mappings
          Assert(Code < $10000, LoadResString(@RsCasedUnicodeChar));
          // if there is no high byte entry in the first stage table then create one
          First := (Code shr 8) and $FF;
          Second := Code and $FF;
          if CaseMapping[First] = nil then
            SetLength(CaseMapping[First], 256);
          // b) read fold case array
          Stream.ReadBuffer(Size, 4);
          if Size > 0 then
          begin
            SetLength(CaseMapping[First, Second, 0], Size);
            Stream.ReadBuffer(CaseMapping[First, Second, 0, 0], Size * SizeOf(UCS4));
          end;
          // c) read lower case array
          Stream.ReadBuffer(Size, 4);
          if Size > 0 then
          begin
            SetLength(CaseMapping[First, Second, 1], Size);
            Stream.ReadBuffer(CaseMapping[First, Second, 1, 0], Size * SizeOf(UCS4));
          end;
          // d) read title case array
          Stream.ReadBuffer(Size, 4);
          if Size > 0 then
          begin
            SetLength(CaseMapping[First, Second, 2], Size);
            Stream.ReadBuffer(CaseMapping[First, Second, 2, 0], Size * SizeOf(UCS4));
          end;
          // e) read upper case array
          Stream.ReadBuffer(Size, 4);
          if Size > 0 then
          begin
            SetLength(CaseMapping[First, Second, 3], Size);
            Stream.ReadBuffer(CaseMapping[First, Second, 3, 0], Size * SizeOf(UCS4));
          end;
        end;
      finally
        Stream.Free;
      end;
    finally
      LoadInProgress.Leave;
    end;
  end;
end;
function CaseLookup(Code: Cardinal; CaseType: Cardinal): TUCS4Array;
// Performs a lookup of the given code and returns its case mapping if found.
// CaseType must be 0 for case folding, 1 for lower case, 2 for title case and
// 3 for upper case, respectively.
// If Code could not be found (or there is no such mapping) then the result is a
// mapping of length 1 holding the code itself. Otherwise an array of code points
// is returned which represents the mapping.
var
  First,
  Second: Byte;
begin
  // load case mapping data if not already done
  if not CaseDataLoaded then
    LoadCaseMappingData;
  First := (Code shr 8) and $FF;
  Second := Code and $FF;
  // Check the first stage table whether there is a mapping for the particular block
  // and (if so) whether there is a mapping for this particular code point.
  // Only BMP code points are stored (see the Assert in LoadCaseMappingData), so a
  // code >= $10000 must not be keyed by its low 16 bits — it would hit the bucket
  // of an unrelated BMP character; treat such codes as unmapped instead.
  if (Code < $10000) and (CaseMapping[First] <> nil) and
    (CaseMapping[First, Second, CaseType] <> nil) then
    Result := CaseMapping[First, Second, CaseType]
  else
  begin
    // Build a fresh single-element result instead of handing out the shared
    // SingletonMapping buffer. Dynamic arrays are reference types: returning the
    // shared buffer would alias every caller to the same storage, and the next
    // lookup would silently overwrite a result a previous caller still holds.
    SetLength(Result, 1);
    Result[0] := Code;
  end;
end;
function UnicodeCaseFold(Code: UCS4): TUCS4Array;
// Returns the case fold mapping defined for the given code point. For code points
// without a fold mapping, CaseLookup yields the code point itself unchanged.
begin
  Result := CaseLookup(Code, 0); // 0 = fold mapping (see CaseLookup)
end;
function UnicodeToUpper(Code: UCS4): TUCS4Array;
// Returns the (possibly 1-to-many) upper case mapping of the given code point;
// code points without an upper case mapping are returned unchanged.
begin
  Result := CaseLookup(Code, 3); // 3 = upper case mapping (see CaseLookup)
end;
function UnicodeToLower(Code: UCS4): TUCS4Array;
// Returns the (possibly 1-to-many) lower case mapping of the given code point;
// code points without a lower case mapping are returned unchanged.
begin
  Result := CaseLookup(Code, 1); // 1 = lower case mapping (see CaseLookup)
end;
function UnicodeToTitle(Code: UCS4): TUCS4Array;
// Returns the (possibly 1-to-many) title case mapping of the given code point;
// code points without a title case mapping are returned unchanged.
begin
  Result := CaseLookup(Code, 2); // 2 = title case mapping (see CaseLookup)
end;
//----------------- support for decomposition ------------------------------------------------------
const
  // constants for hangul composition and hangul-to-jamo decomposition
  // (see The Unicode Standard, "Conjoining Jamo Behavior")
  SBase = $AC00;  // hangul syllables start code point
  LBase = $1100;  // first leading consonant jamo
  VBase = $1161;  // first vowel jamo
  TBase = $11A7;  // first trailing consonant jamo
  LCount = 19;    // number of leading consonants
  VCount = 21;    // number of vowels
  TCount = 28;    // number of trailing consonants, including "no trailing consonant"
  NCount = VCount * TCount; // 588
  SCount = LCount * NCount; // 11172

type
  TDecompositions = array of TUCS4Array;
  TDecompositionsArray = array [Byte] of TDecompositions;

var
  // Lists of decompositions, organized (again) as sparse two-stage matrices
  // indexed by the high and low byte of a (BMP) code point.
  // Note: there are two tables, one for canonical decompositions and the other one
  // for compatibility decompositions.
  DecompositionsLoaded: Boolean;
  CanonicalDecompositions,
  CompatibleDecompositions: TDecompositionsArray;
procedure LoadDecompositionData;
// Loads decomposition data from the 'DECOMPOSITION'/'UNICODEDATA' resource into the
// CanonicalDecompositions and CompatibleDecompositions matrices. The stream starts
// with a 4-byte entry count; each entry is a 4-byte code point (with bit 30 set for
// compatibility decompositions) followed by a length-prefixed UCS4 array.
var
  Stream: TResourceStream;
  I, Code,
  Size: Cardinal;
  First,
  Second: Byte;
begin
  if not DecompositionsLoaded then
  begin
    // make sure no other code is currently modifying the global data area
    LoadInProgress.Enter;
    try
      // set the flag before loading so re-entrant calls do not retry
      DecompositionsLoaded := True;
      Stream := TResourceStream.Create(HInstance, 'DECOMPOSITION', 'UNICODEDATA');
      try
        // determine how many decomposition entries we have
        Stream.ReadBuffer(Size, 4);
        // Note: Size is reused below for the per-entry array lengths. This is safe
        // because Delphi evaluates a for-loop's upper bound only once, on entry.
        for I := 0 to Size - 1 do
        begin
          Stream.ReadBuffer(Code, 4);
          // apart from the flag in bit 30, only BMP code points are expected
          Assert((Code and not $40000000) < $10000, LoadResString(@RsDecomposedUnicodeChar));
          // if there is no high byte entry in the first stage table then create one
          First := (Code shr 8) and $FF;
          Second := Code and $FF;
          // insert into the correct table depending on bit 30
          // (if set then it is a compatibility decomposition)
          if Code and $40000000 <> 0 then
          begin
            if CompatibleDecompositions[First] = nil then
              SetLength(CompatibleDecompositions[First], 256);
            Stream.ReadBuffer(Size, 4);
            if Size > 0 then
            begin
              SetLength(CompatibleDecompositions[First, Second], Size);
              Stream.ReadBuffer(CompatibleDecompositions[First, Second, 0], Size * SizeOf(UCS4));
            end;
          end
          else
          begin
            if CanonicalDecompositions[First] = nil then
              SetLength(CanonicalDecompositions[First], 256);
            Stream.ReadBuffer(Size, 4);
            if Size > 0 then
            begin
              SetLength(CanonicalDecompositions[First, Second], Size);
              Stream.ReadBuffer(CanonicalDecompositions[First, Second, 0], Size * SizeOf(UCS4));
            end;
          end;
        end;
      finally
        Stream.Free;
      end;
    finally
      LoadInProgress.Leave;
    end;
  end;
end;
function UnicodeDecomposeHangul(Code: UCS4): TUCS4Array;
// Algorithmically decomposes a precomposed hangul syllable into its conjoining
// jamo: a leading consonant, a vowel and (when present) a trailing consonant.
// Code is expected to lie in the hangul syllable range starting at SBase; the
// caller is responsible for checking this.
var
  SIndex,
  TIndex: Integer;
begin
  SIndex := Code - SBase;
  TIndex := SIndex mod TCount;
  // a trailing index of 0 means the syllable has no trailing consonant,
  // so only two jamo (L, V) result instead of three (L, V, T)
  if TIndex = 0 then
    SetLength(Result, 2)
  else
    SetLength(Result, 3);
  Result[0] := LBase + SIndex div NCount;
  Result[1] := VBase + (SIndex mod NCount) div TCount;
  if TIndex <> 0 then
    Result[2] := TBase + TIndex;
end;
function UnicodeDecompose(Code: UCS4; Compatible: Boolean): TUCS4Array;
var
First,
Second: Byte;
begin
// load decomposition data if not already done
if not DecompositionsLoaded then
LoadDecompositionData;
Result := nil;
// if the code is hangul then decomposition is algorithmically
if UnicodeIsHangul(Code) then
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -