📄 unicode.~pas
字号:
// utility functions
function CodePageFromLocale(Language: LCID): Integer;
function KeyboardCodePage: Word;
function KeyUnicode(C: Char): WideChar;
function CodeBlockFromChar(const C: WideChar): Cardinal;
function CodePageToWideString(A: AnsiString; CodePage: Word): WideString;
// WideString Conversion routines
function WideStringToUTF8(S: WideString): AnsiString;
function UTF8ToWideString(S: AnsiString): WideString;
//----------------------------------------------------------------------------------------------------------------------
implementation
// ~67K Unicode data for case mapping, decomposition, numbers etc.
// This data is loaded on demand which means only those parts will be put in memory which are needed
// by one of the lookup functions.
{$R Unicode.res}
uses
Consts, SyncObjs, SysUtils;
resourcestring
SUREBaseString = 'Error in regular expression: %s' + #13;
SUREUnexpectedEOS = 'Unexpected end of pattern.';
SURECharacterClassOpen = 'Character class not closed, '']'' is missing.';
SUREUnbalancedGroup = 'Unbalanced group expression, '')'' is missing.';
SUREInvalidCharProperty = 'A character property is invalid';
SUREInvalidRepeatRange = 'Invalid repeation range.';
SURERepeatRangeOpen = 'Repeation range not closed, ''}'' is missing.';
SUREExpressionEmpty = 'Expression is empty.';
type
TCompareFunc = function (W1, W2: WideString; Locale: LCID): Integer;
var
WideCompareText: TCompareFunc;
//----------------- Loader routines for resource data ------------------------------------------------------------------
const
// Values that can appear in the Mask1 parameter of the IsProperty function.
UC_MN = $00000001; // Mark, Non-Spacing
UC_MC = $00000002; // Mark, Spacing Combining
UC_ME = $00000004; // Mark, Enclosing
UC_ND = $00000008; // Number, Decimal Digit
UC_NL = $00000010; // Number, Letter
UC_NO = $00000020; // Number, Other
UC_ZS = $00000040; // Separator, Space
UC_ZL = $00000080; // Separator, Line
UC_ZP = $00000100; // Separator, Paragraph
UC_CC = $00000200; // Other, Control
UC_CF = $00000400; // Other, Format
UC_OS = $00000800; // Other, Surrogate
UC_CO = $00001000; // Other, private use
UC_CN = $00002000; // Other, not assigned
UC_LU = $00004000; // Letter, Uppercase
UC_LL = $00008000; // Letter, Lowercase
UC_LT = $00010000; // Letter, Titlecase
UC_LM = $00020000; // Letter, Modifier
UC_LO = $00040000; // Letter, Other
UC_PC = $00080000; // Punctuation, Connector
UC_PD = $00100000; // Punctuation, Dash
UC_PS = $00200000; // Punctuation, Open
UC_PE = $00400000; // Punctuation, Close
UC_PO = $00800000; // Punctuation, Other
UC_SM = $01000000; // Symbol, Math
UC_SC = $02000000; // Symbol, Currency
UC_SK = $04000000; // Symbol, Modifier
UC_SO = $08000000; // Symbol, Other
UC_L = $10000000; // Left-To-Right
UC_R = $20000000; // Right-To-Left
UC_EN = $40000000; // European Number
UC_ES = $80000000; // European Number Separator
// Values that can appear in the Mask2 parameter of the IsProperty function
UC_ET = $00000001; // European Number Terminator
UC_AN = $00000002; // Arabic Number
UC_CS = $00000004; // Common Number Separator
UC_B = $00000008; // Block Separator
UC_S = $00000010; // Segment (unit) Separator (this includes tab and vertical tab)
UC_WS = $00000020; // Whitespace
UC_ON = $00000040; // Other Neutrals
// Implementation specific character properties.
UC_CM = $00000080; // Composite
UC_NB = $00000100; // Non-Breaking
UC_SY = $00000200; // Symmetric
UC_HD = $00000400; // Hex Digit
UC_QM = $00000800; // Quote Mark
UC_MR = $00001000; // Mirroring
UC_SS = $00002000; // Space, other
UC_CP = $00004000; // Defined
// Added for UnicodeData-2.1.3.
UC_PI = $00008000; // Punctuation, Initial
UC_PF = $00010000; // Punctuation, Final
type
TUHeader = record
BOM: WideChar;
Count: Word;
case Boolean of
True:
(Bytes: Cardinal);
False:
(Len: array[0..1] of Word);
end;
TWordArray = array of Word;
TCardinalArray = array of Cardinal;
var
// As the global data can be accessed by several threads it should be guarded
// while the data is loaded.
LoadInProgress: TCriticalSection;
//----------------- internal support routines --------------------------------------------------------------------------
function SwapCardinal(C: Cardinal): Cardinal;
// swaps all bytes in C from MSB to LSB order
// EAX contains both parameter as well as result
asm
BSWAP EAX
end;
//----------------- support for character properties -------------------------------------------------------------------
var
PropertyOffsets: TWordArray;
PropertyRanges: TCardinalArray;
procedure LoadUnicodeTypeData;
// loads the character property data (as saved by the Unicode database extractor into the ctype.dat file)
var
I, Size: Integer;
Header: TUHeader;
Stream: TResourceStream;
begin
// make sure no other code is currently modifying the global data area
if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create;
LoadInProgress.Enter;
// Data already loaded?
if PropertyOffsets = nil then
begin
Stream := TResourceStream.Create(HInstance, 'TYPE', 'UNICODE');
Stream.Read(Header, SizeOf(Header));
if Header.BOM = BOM_MSB_FIRST then
begin
Header.Count := Swap(Header.Count);
Header.Bytes := SwapCardinal(Header.Bytes);
end;
// Calculate the offset into the storage for the ranges. The offsets
// array is on a 4-byte boundary and one larger than the value provided in
// the header count field. This means the offset to the ranges must be
// calculated after aligning the count to a 4-byte boundary.
Size := (Header.Count + 1) * SizeOf(Word);
if (Size and 3) <> 0 then Inc(Size, 4 - (Size and 3));
// fill offsets array
SetLength(PropertyOffsets, Size div SizeOf(Word));
Stream.Read(PropertyOffsets[0], Size);
// Do an endian swap if necessary. Don't forget there is an extra node on the end with the final index.
if Header.BOM = BOM_MSB_FIRST then
begin
for I := 0 to Header.Count do
PropertyOffsets[I] := Swap(PropertyOffsets[I]);
end;
// Load the ranges. The number of elements is in the last array position of the offsets.
SetLength(PropertyRanges, PropertyOffsets[Header.Count]);
Stream.Read(PropertyRanges[0], PropertyOffsets[Header.Count] * SizeOf(Cardinal));
// Do an endian swap if necessary.
if Header.BOM = BOM_MSB_FIRST then
begin
for I := 0 to PropertyOffsets[Header.Count] - 1 do
PropertyRanges[I] := SwapCardinal(PropertyRanges[I]);
end;
Stream.Free;
end;
LoadInProgress.Leave;
end;
//----------------------------------------------------------------------------------------------------------------------
function PropertyLookup(Code, N: Cardinal): Boolean;
var
L, R, M: Integer;
begin
// load property data if not already done
if PropertyOffsets = nil then LoadUnicodeTypeData;
Result := False;
// There is an extra node on the end of the offsets to allow this routine
// to work right. If the index is 0xffff, then there are no nodes for the property.
L := PropertyOffsets[N];
if L <> $FFFF then
begin
// Locate the next offset that is not 0xffff. The sentinel at the end of
// the array is the max index value.
M := 1;
while ((Integer(N) + M) < High(PropertyOffsets)) and (PropertyOffsets[Integer(N) + M] = $FFFF) do Inc(M);
R := PropertyOffsets[Integer(N) + M] - 1;
while L <= R do
begin
// Determine a "mid" point and adjust to make sure the mid point is at
// the beginning of a range pair.
M := (L + R) shr 1;
Dec(M, M and 1);
if Code > PropertyRanges[M + 1] then L := M + 2
else
if Code < PropertyRanges[M] then R := M - 2
else
if (Code >= PropertyRanges[M]) and (Code <= PropertyRanges[M + 1]) then
begin
Result := True;
Break;
end;
end;
end;
end;
//----------------------------------------------------------------------------------------------------------------------
function IsProperty(Code, Mask1, Mask2: Cardinal): Boolean;
var
I: Cardinal;
Mask: Cardinal;
begin
Result := False;
if Mask1 <> 0 then
begin
Mask := 1;
for I := 0 to 31 do
begin
if ((Mask1 and Mask) <> 0) and PropertyLookup(Code, I) then
begin
Result := True;
Exit;
end;
Mask := Mask shl 1;
end;
end;
if Mask2 <> 0 then
begin
I := 32;
Mask := 1;
while I < Cardinal(High(PropertyOffsets)) do
begin
if ((Mask2 and Mask) <> 0) and PropertyLookup(Code, I) then
begin
Result := True;
Exit;
end;
Inc(I);
Mask := Mask shl 1;
end;
end;
end;
//----------------- support for case mapping ---------------------------------------------------------------------------
var
CaseMapSize: Cardinal;
CaseLengths: array[0..1] of Word;
CaseMap: TCardinalArray;
procedure LoadUnicodeCaseData;
var
Stream: TResourceStream;
I: Cardinal;
Header: TUHeader;
begin
// make sure no other code is currently modifying the global data area
if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create;
LoadInProgress.Enter;
if CaseMap = nil then
begin
Stream := TResourceStream.Create(HInstance, 'CASE', 'UNICODE');
Stream.Read(Header, SizeOf(Header));
if Header.BOM = BOM_MSB_FIRST then
begin
Header.Count := Swap(Header.Count);
Header.Len[0] := Swap(Header.Len[0]);
Header.Len[1] := Swap(Header.Len[1]);
end;
// Set the node count and lengths of the upper and lower case mapping tables.
CaseMapSize := Header.Count * 3;
CaseLengths[0] := Header.Len[0] * 3;
CaseLengths[1] := Header.Len[1] * 3;
SetLength(CaseMap, CaseMapSize);
// Load the case mapping table.
Stream.Read(CaseMap[0], CaseMapSize * SizeOf(Cardinal));
// Do an endian swap if necessary.
if Header.BOM = BOM_MSB_FIRST then
for I := 0 to CaseMapSize -1 do CaseMap[I] := SwapCardinal(CaseMap[I]);
Stream.Free;
end;
LoadInProgress.Leave;
end;
//----------------------------------------------------------------------------------------------------------------------
function CaseLookup(Code: Cardinal; L, R, Field: Integer): Cardinal;
var
M: Integer;
begin
// load case mapping data if not already done
if CaseMap = nil then LoadUnicodeCaseData;
// Do the binary search.
while L <= R do
begin
// Determine a "mid" point and adjust to make sure the mid point is at
// the beginning of a case mapping triple.
M := (L + R) shr 1;
Dec(M, M mod 3);
if Code > CaseMap[M] then L := M + 3
else
if Code < CaseMap[M] then R := M - 3
else
if Code = CaseMap[M] then
begin
Result := CaseMap[M + Field];
Exit;
end;
end;
Result := Code;
end;
//----------------------------------------------------------------------------------------------------------------------
function UnicodeToUpper(Code: UCS4): UCS4;
var
Field,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -