📄 unicode.pas

📁 几十个处理Unicode的函数
💻 PAS
📖 第 1 页 / 共 5 页
字号:

// utility functions
function CodePageFromLocale(Language: LCID): Integer;
function KeyboardCodePage: Word;
function KeyUnicode(C: Char): WideChar;
function CodeBlockFromChar(const C: WideChar): Cardinal;
function CodePageToWideString(A: AnsiString; CodePage: Word): WideString;

// WideString Conversion routines
function WideStringToUTF8(S: WideString): AnsiString;
function UTF8ToWideString(S: AnsiString): WideString;

//----------------------------------------------------------------------------------------------------------------------

implementation

// ~67K Unicode data for case mapping, decomposition, numbers etc.
// This data is loaded on demand which means only those parts will be put in memory which are needed
// by one of the lookup functions.
{$R Unicode.res}

uses
  Consts, SyncObjs, SysUtils;

resourcestring
  SUREBaseString = 'Error in regular expression: %s' + #13;
  SUREUnexpectedEOS = 'Unexpected end of pattern.';
  SURECharacterClassOpen = 'Character class not closed, '']'' is missing.';
  SUREUnbalancedGroup = 'Unbalanced group expression, '')'' is missing.';
  SUREInvalidCharProperty = 'A character property is invalid';
  SUREInvalidRepeatRange = 'Invalid repeation range.';
  SURERepeatRangeOpen = 'Repeation range not closed, ''}'' is missing.';
  SUREExpressionEmpty = 'Expression is empty.';

type
  TCompareFunc = function (W1, W2: WideString; Locale: LCID): Integer;

var
  WideCompareText: TCompareFunc;

//----------------- Loader routines for resource data ------------------------------------------------------------------

const
  // Values that can appear in the Mask1 parameter of the IsProperty function.
  UC_MN = $00000001; // Mark, Non-Spacing
  UC_MC = $00000002; // Mark, Spacing Combining
  UC_ME = $00000004; // Mark, Enclosing
  UC_ND = $00000008; // Number, Decimal Digit
  UC_NL = $00000010; // Number, Letter
  UC_NO = $00000020; // Number, Other
  UC_ZS = $00000040; // Separator, Space
  UC_ZL = $00000080; // Separator, Line
  UC_ZP = $00000100; // Separator, Paragraph
  UC_CC = $00000200; // Other, Control
  UC_CF = $00000400; // Other, Format
  UC_OS = $00000800; // Other, Surrogate
  UC_CO = $00001000; // Other, private use
  UC_CN = $00002000; // Other, not assigned
  UC_LU = $00004000; // Letter, Uppercase
  UC_LL = $00008000; // Letter, Lowercase
  UC_LT = $00010000; // Letter, Titlecase
  UC_LM = $00020000; // Letter, Modifier
  UC_LO = $00040000; // Letter, Other
  UC_PC = $00080000; // Punctuation, Connector
  UC_PD = $00100000; // Punctuation, Dash
  UC_PS = $00200000; // Punctuation, Open
  UC_PE = $00400000; // Punctuation, Close
  UC_PO = $00800000; // Punctuation, Other
  UC_SM = $01000000; // Symbol, Math
  UC_SC = $02000000; // Symbol, Currency
  UC_SK = $04000000; // Symbol, Modifier
  UC_SO = $08000000; // Symbol, Other
  UC_L  = $10000000; // Left-To-Right
  UC_R  = $20000000; // Right-To-Left
  UC_EN = $40000000; // European Number
  UC_ES = $80000000; // European Number Separator

  // Values that can appear in the Mask2 parameter of the IsProperty function
  UC_ET = $00000001; // European Number Terminator
  UC_AN = $00000002; // Arabic Number
  UC_CS = $00000004; // Common Number Separator
  UC_B  = $00000008; // Block Separator
  UC_S  = $00000010; // Segment (unit) Separator (this includes tab and vertical tab)
  UC_WS = $00000020; // Whitespace
  UC_ON = $00000040; // Other Neutrals

  // Implementation specific character properties.
  UC_CM = $00000080; // Composite
  UC_NB = $00000100; // Non-Breaking
  UC_SY = $00000200; // Symmetric
  UC_HD = $00000400; // Hex Digit
  UC_QM = $00000800; // Quote Mark
  UC_MR = $00001000; // Mirroring
  UC_SS = $00002000; // Space, other

  UC_CP = $00004000; // Defined

  // Added for UnicodeData-2.1.3.
  UC_PI = $00008000; // Punctuation, Initial
  UC_PF = $00010000; // Punctuation, Final

type
  TUHeader = record
    BOM: WideChar;
    Count: Word;
    case Boolean of
      True:
        (Bytes: Cardinal);
      False:
        (Len: array[0..1] of Word);
  end;

  TWordArray = array of Word;
  TCardinalArray = array of Cardinal;

var
  // As the global data can be accessed by several threads it should be guarded
  // while the data is loaded.
  LoadInProgress: TCriticalSection;

//----------------- internal support routines --------------------------------------------------------------------------

function SwapCardinal(C: Cardinal): Cardinal; 

// swaps all bytes in C from MSB to LSB order
// EAX contains both parameter as well as result

asm
              BSWAP EAX
end;

//----------------- support for character properties -------------------------------------------------------------------

var
  PropertyOffsets: TWordArray;
  PropertyRanges: TCardinalArray;

procedure LoadUnicodeTypeData;

// loads the character property data (as saved by the Unicode database extractor into the ctype.dat file)

var
  I, Size: Integer;
  Header: TUHeader;
  Stream: TResourceStream;

begin
  // make sure no other code is currently modifying the global data area
  if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create;
  LoadInProgress.Enter;

  // Data already loaded?
  if PropertyOffsets = nil then
  begin
    Stream := TResourceStream.Create(HInstance, 'TYPE', 'UNICODE');
    Stream.Read(Header, SizeOf(Header));

    if Header.BOM = BOM_MSB_FIRST then
    begin
      Header.Count := Swap(Header.Count);
      Header.Bytes := SwapCardinal(Header.Bytes);
    end;

    // Calculate the offset into the storage for the ranges.  The offsets
    // array is on a 4-byte boundary and one larger than the value provided in
    // the header count field. This means the offset to the ranges must be
    // calculated after aligning the count to a 4-byte boundary.
    Size := (Header.Count + 1) * SizeOf(Word);
    if (Size and 3) <> 0 then Inc(Size, 4 - (Size and 3));

    // fill offsets array
    SetLength(PropertyOffsets, Size div SizeOf(Word));
    Stream.Read(PropertyOffsets[0], Size);

    // Do an endian swap if necessary.  Don't forget there is an extra node on the end with the final index.
    if Header.BOM = BOM_MSB_FIRST then
    begin
      for I := 0 to Header.Count do
          PropertyOffsets[I] := Swap(PropertyOffsets[I]);
    end;

    // Load the ranges.  The number of elements is in the last array position of the offsets.
    SetLength(PropertyRanges, PropertyOffsets[Header.Count]);
    Stream.Read(PropertyRanges[0], PropertyOffsets[Header.Count] * SizeOf(Cardinal));

    // Do an endian swap if necessary.
    if Header.BOM = BOM_MSB_FIRST then
    begin
      for I := 0 to PropertyOffsets[Header.Count] - 1 do
        PropertyRanges[I] := SwapCardinal(PropertyRanges[I]);
    end;
    Stream.Free;
  end;
  LoadInProgress.Leave;
end;

//----------------------------------------------------------------------------------------------------------------------

function PropertyLookup(Code, N: Cardinal): Boolean;

var
  L, R, M: Integer;
  
begin
  // load property data if not already done
  if PropertyOffsets = nil then LoadUnicodeTypeData;
  
  Result := False;
  // There is an extra node on the end of the offsets to allow this routine
  // to work right.  If the index is 0xffff, then there are no nodes for the property.
  L := PropertyOffsets[N];
  if L <> $FFFF then
  begin
    // Locate the next offset that is not 0xffff.  The sentinel at the end of
    // the array is the max index value.
    M := 1;
    while ((Integer(N) + M) < High(PropertyOffsets)) and (PropertyOffsets[Integer(N) + M] = $FFFF) do Inc(M);

    R := PropertyOffsets[Integer(N) + M] - 1;

    while L <= R do
    begin
      // Determine a "mid" point and adjust to make sure the mid point is at
      // the beginning of a range pair.
      M := (L + R) shr 1;
      Dec(M, M and 1);
      if Code > PropertyRanges[M + 1] then L := M + 2
                                      else
        if Code < PropertyRanges[M] then R := M - 2
                                    else
          if (Code >= PropertyRanges[M]) and (Code <= PropertyRanges[M + 1]) then
          begin
            Result := True;
            Break;
          end;
    end;
  end;
end;

//----------------------------------------------------------------------------------------------------------------------

function IsProperty(Code, Mask1, Mask2: Cardinal): Boolean;

var
  I: Cardinal;
  Mask: Cardinal;
  
begin
  Result := False;
  if Mask1 <> 0 then
  begin
    Mask := 1;
    for I := 0 to 31 do
    begin
      if ((Mask1 and Mask) <> 0) and PropertyLookup(Code, I) then
      begin
        Result := True;
        Exit;
      end;
      Mask := Mask shl 1;
    end;
  end;

  if Mask2 <> 0 then
  begin
    I := 32;
    Mask := 1;
    while I < Cardinal(High(PropertyOffsets)) do
    begin
      if ((Mask2 and Mask) <> 0) and PropertyLookup(Code, I) then
      begin
        Result := True;
        Exit;
      end;
      Inc(I);
      Mask := Mask shl 1;
    end;
  end;
end;

//----------------- support for case mapping ---------------------------------------------------------------------------
var
  CaseMapSize: Cardinal;
  CaseLengths: array[0..1] of Word;
  CaseMap: TCardinalArray;

procedure LoadUnicodeCaseData;

var
  Stream: TResourceStream;
  I: Cardinal;
  Header: TUHeader;
  
begin
  // make sure no other code is currently modifying the global data area
  if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create;
  LoadInProgress.Enter;

  if CaseMap = nil then
  begin
    Stream := TResourceStream.Create(HInstance, 'CASE', 'UNICODE');
    Stream.Read(Header, SizeOf(Header));

    if Header.BOM = BOM_MSB_FIRST then
    begin
      Header.Count := Swap(Header.Count);
      Header.Len[0] := Swap(Header.Len[0]);
      Header.Len[1] := Swap(Header.Len[1]);
    end;

    // Set the node count and lengths of the upper and lower case mapping tables.
    CaseMapSize := Header.Count * 3;
    CaseLengths[0] := Header.Len[0] * 3;
    CaseLengths[1] := Header.Len[1] * 3;

    SetLength(CaseMap, CaseMapSize);

    // Load the case mapping table.
    Stream.Read(CaseMap[0], CaseMapSize * SizeOf(Cardinal));

    // Do an endian swap if necessary.
    if Header.BOM = BOM_MSB_FIRST then
      for I := 0 to CaseMapSize -1 do CaseMap[I] := SwapCardinal(CaseMap[I]);
    Stream.Free;
  end;
  LoadInProgress.Leave;
end;

//----------------------------------------------------------------------------------------------------------------------

function CaseLookup(Code: Cardinal; L, R, Field: Integer): Cardinal;

var
  M: Integer;

begin
  // load case mapping data if not already done
  if CaseMap = nil then LoadUnicodeCaseData;

  // Do the binary search.
  while L <= R do
  begin
    // Determine a "mid" point and adjust to make sure the mid point is at
    // the beginning of a case mapping triple.
    M := (L + R) shr 1;
    Dec(M, M mod 3);
    if Code > CaseMap[M] then L := M + 3
                         else
      if Code < CaseMap[M] then R := M - 3
                           else
        if Code = CaseMap[M] then
        begin
          Result := CaseMap[M + Field];
          Exit;
        end;
  end;

  Result := Code;
end;

//----------------------------------------------------------------------------------------------------------------------

function UnicodeToUpper(Code: UCS4): UCS4;

var
  Field,
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -