📄 unicode.pas

📁 几十个处理Unicode的函数
💻 PAS
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
unit Unicode;

// Copyright (c) 1999, 2000 Mike Lischke (public@lischke-online.de)
// Portions Copyright (c) 1999, 2000 Azret Botash (az)
//
// 01-APR-2000 ml:
//   preparation for public release
// FEB-MAR 2000 version 2.0 beta
//   - Unicode regular expressions (URE) search class (TURESearch)
//   - generic search engine base class for both the Boyer-Moore and the RE search class
//   - whole word only search in UTBM, bug fixes in UTBM
//   - string decompositon (including hangul)
// OCT/99 - JAN/2000 ml: version 1.0 
//   - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions
//   - TWideStrings and TWideStringList classes
//   - Unicode Tuned Boyer-Moore search class (TUTBMSearch)
//   - low and high level Unicode/Wide* functions
//   - low level Unicode UCS4 data import and functions
//   - helper functions
//----------------------------------------------------------------------------------------------------------------------
// This unit contains routines and classes to manage and work with Unicode/WideStrings strings.
// You need Delphi 4 or higher to compile this code.
//
// Unicode encodings and wide strings:
// Currently there are several encoding schemes defined which describe (among others) the code size and (resulting from
// this) the usable value pool. Delphi supports the wide character data type for Unicode which corresponds to
// UCS2 (UTF-16 coding scheme). This scheme uses 2 bytes to store character values and can therefor handle up to
// 65536 characters. Another scheme is UCS4 (UTF-32 coding scheme) which uses 4 bytes per character. The first 65536
// code points correspond directly to those of UCS2. Other code points are mainly used for character surrogates.
// To provide support for UCS2 (WideChar in Delphi) as well as UCS4 the library is splitted into two parts. The low
// level part accepts and returns UCS4 characters while the high level part deals directly with WideChar/WideString
// data types. Additionally, UCS2 is defined as being WideChar to retain maximum compatibility.
//
// Publicy available low level functions are all preceded by "Unicode..." (e.g. in UnicodeToUpper) while
// the high level functions use the Str... or Wide... naming scheme (e.g. WideUpCase and WideUpperCase).
//
//----------------------------------------------------------------------------------------------------------------------
// Open issues:
//   - Keep in mind that this unit is still in beta state. In particular the URE class does not yet work for all cases.
//   - Yet to do things in the URE class are:
//     - check all character classes if they match correctly
//     - optimize rebuild of DFA (build only when pattern changes)
//     - set flag parameter of ExecuteURE
//     - add \d     any decimal digit
//           \D     any character that is not a decimal digit
//           \s     any whitespace character
//           \S     any character that is not a whitespace character
//           \w     any "word" character
//           \W     any "non-word" character
//   - For a perfect text search both the text to be searched through as well as the pattern must be normalized
//     to allow to match, say, accented and unaccented characters or the ligature fi with the letter combination fi etc.
//     Normalization is usually done by decomposing the string and optionally compose it again, but I had not yet the
//     opportunity to go through the composition stuff.
//   - The wide string classes still compare text with functions provided by the particular system. This works usually
//     fine under WinNT/W2K (although also there are limitations like maximum text lengths). Under Win9x conversions
//     from and to MBCS are necessary which are bound to a particular locale and so very limited in general use. 
//     These comparisons should be changed so that the code in this unit is used. This requires, though, a working
//     composition implementation.

interface

uses
  Windows, Classes,RTLConsts;

const
  // definitions of often used characters:
  // Note: Use them only for tests of a certain character not to determine character classes like
  //       white spaces as in Unicode are often many code points defined being in a certain class.
  //       Hence your best option is to use the various UnicodeIs* functions.

  // can't use identifier "Null" here as this is already in a special Variant identifier
  WideNull = WideChar(#0);
  Tabulator = WideChar(#9);
  Space = WideChar(#32);

  // logical line breaks
  LF = WideChar($A);
  LineFeed = WideChar($A);
  VerticalTab = WideChar($B);
  FormFeed = WideChar($C);
  CR = WideChar($D);
  CarriageReturn = WideChar($D);
  CRLF: WideString = #$D#$A;
  LineSeparator = WideChar($2028);
  ParagraphSeparator = WideChar($2029);

  // byte order marks for strings
  // Unicode text files should contain $FFFE as first character to identify such a file clearly. Depending on the system
  // where the file was created on this appears either in big endian or little endian style.
  BOM_LSB_FIRST = WideChar($FEFF); // this is how the BOM appears on x86 systems when written by a x86 system
  BOM_MSB_FIRST = WideChar($FFFE);

type
  // Unicode transformation formats (UTF) data types
  UTF7 = Char;
  UTF8 = Char;
  UTF16 = WideChar;
  UTF32 = Cardinal;

  // UTF conversion schemes (UCS) data types
  PUCS4 = ^UCS4;
  UCS4 = Cardinal;
  PUCS2 = PWideChar;
  UCS2 = WideChar;

const
  ReplacementCharacter: UCS4 = $0000FFFD;
  MaximumUCS2: UCS4 = $0000FFFF;
  MaximumUTF16: UCS4 = $0010FFFF;
  MaximumUCS4: UCS4 = $7FFFFFFF;
                          
  SurrogateHighStart: UCS4 = $D800;
  SurrogateHighEnd: UCS4 = $DBFF;
  SurrogateLowStart: UCS4 = $DC00;
  SurrogateLowEnd: UCS4 = $DFFF;

type
  PCardinal = ^Cardinal;

  TWideStrings = class;

  TSearchFlags = set of (
    sfCaseSensitive,         // match letter case
    sfIgnoreNonSpacing,      // ignore non-spacing characters in search
    sfSpaceCompress,         // handle several consecutive white spaces as one white space
                             // (this applies to the pattern as well as the search text)
    sfWholeWordOnly);        // match only text at end/start and/or surrounded by white spaces

  // a generic search class defininition used for tuned Boyer-Moore and Unicode regular expression searches
  TSearchEngine = class
  private
    FResults: TList;      // 2 entries for each result (start and stop position)
    FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster
  protected
    function GetCount: Integer; virtual;
  public
    constructor Create(AOwner: TWideStrings); virtual;
    destructor Destroy; override;

    procedure AddResult(Start, Stop: Cardinal); virtual;
    procedure Clear; virtual;
    procedure ClearResults; virtual;
    procedure DeleteResult(Index: Cardinal); virtual;
    procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract;
    procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; virtual; abstract;
    function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
    function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
    function FindAll(const Text: WideString): Boolean; overload; virtual; abstract;
    function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; overload; virtual; abstract;
    procedure GetResult(Index: Cardinal; var Start, Stop: Integer); virtual;

    property Count: Integer read GetCount;
  end;


  // The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended translation created from a free package
  // written by Mark Leisher (mleisher@crl.nmsu.edu).
  //
  // The code handles high and low surrogates as well as case (in)dependency, can ignore non-spacing characters and
  // allows optionally to return whole words only.

  // single pattern character
  PUTBMChar = ^TUTBMChar;
  TUTBMChar = record
    LoCase,
    UpCase,
    TitleCase: UCS4;
  end;

  PUTBMSkip = ^TUTBMSkip;
  TUTBMSkip = record
    BMChar: PUTBMChar;
    SkipValues: Integer;
  end;

  TUTBMSearch = class(TSearchEngine)
  private
    FFlags: TSearchFlags;
    FPattern: PUTBMChar;
    FPatternUsed,
    FPatternSize,
    FPatternLength: Cardinal;
    FSkipValues: PUTBMSkip;
    FSkipsUsed: Integer;
    FMD4: Cardinal;
  protected
    procedure ClearPattern;
    procedure Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags);
    function Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
    function GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal;
    function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean;
  public
    constructor Create(AOwner: TWideStrings); override;
    destructor Destroy; override;

    procedure Clear; override;
    procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); override;
    procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); override;
    function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; override;
    function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; override;
    function FindAll(const Text: WideString): Boolean; override;
    function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; override;
  end;

  
  // Regular expression search engine for text in UCS2 form taking surrogates into account.
  // This implementation is an improved translation from the URE package written by Mark Leisher (mleisher@crl.nmsu.edu)
  // who used a variation of the RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu).
  // Assumptions:
  //   o  Regular expression and text already normalized.
  //   o  Conversion to lower case assumes a 1-1 mapping.
  //
  // Definitions:
  //   Separator - any one of U+2028, U+2029, NL, CR.
  //
  // Operators:
  //   .      - match any character
  //   *      - match zero or more of the last subexpression
  //   +      - match one or more of the last subexpression
  //   ?      - match zero or one of the last subexpression
  //   ()     - subexpression grouping
  //   {m, n} - match at least m occurences and up to n occurences
  //            Note: both values can be 0 or ommitted which denotes then a unlimiting bound
  //            {,} and {0,} and {0, 0} correspond to *
  //            {, 1} and {0, 1} correspond to ?
  //            {1,} and {1, 0} correspond to +
  //   {m}    - match exactly m occurences
  //
  //   Notes:
  //     o  The "." operator normally does not match separators, but a flag is
  //        available that will allow this operator to match a separator.
  //
  // Literals and Constants:
  //   c       - literal UCS2 character
  //   \x....  - hexadecimal number of up to 4 digits
  //   \X....  - hexadecimal number of up to 4 digits
  //   \u....  - hexadecimal number of up to 4 digits
  //   \U....  - hexadecimal number of up to 4 digits
  //
  // Character classes:
  //   [...]           - Character class
  //   [^...]          - Negated character class
  //   \pN1,N2,...,Nn  - Character properties class
  //   \PN1,N2,...,Nn  - Negated character properties class
  //
  //   POSIX character classes recognized:
  //     :alnum:
  //     :alpha:
  //     :cntrl:
  //     :digit:
  //     :graph:
  //     :lower:
  //     :print:
  //     :punct:
  //     :space:
  //     :upper:
  //     :xdigit:
  //
  //   Notes:
  //     o  Character property classes are \p or \P followed by a comma separated
  //        list of integers between 1 and 32.  These integers are references to
  //        the following character properties:
  //
  //         N	Character Property
  //         --------------------------
  //         1	_URE_NONSPACING
  //         2	_URE_COMBINING
  //         3	_URE_NUMDIGIT
  //         4	_URE_NUMOTHER
  //         5	_URE_SPACESEP
  //         6	_URE_LINESEP
  //         7	_URE_PARASEP
  //         8	_URE_CNTRL
  //         9	_URE_PRIVATE
  //         10	_URE_UPPER   (note: upper, lower and titel case classes need to have case
  //         11	_URE_LOWER          sensitive search be enabled to match correctly!)
  //         12	_URE_TITLE
  //         13	_URE_MODIFIER
  //         14	_URE_OTHERLETTER
  //         15	_URE_DASHPUNCT
  //         16	_URE_OPENPUNCT
  //         17	_URE_CLOSEPUNCT
  //         18	_URE_OTHERPUNCT
  //         19	_URE_MATHSYM
  //         20	_URE_CURRENCYSYM
  //         21	_URE_OTHERSYM
  //         22	_URE_LTR
  //         23	_URE_RTL
  //         24	_URE_EURONUM
  //         25	_URE_EURONUMSEP
  //         26	_URE_EURONUMTERM
  //         27	_URE_ARABNUM
  //         28	_URE_COMMONSEP
  //         29	_URE_BLOCKSEP
  //         30	_URE_SEGMENTSEP
  //         31	_URE_WHITESPACE
  //         32	_URE_OTHERNEUT
  //
  //     o  Character classes can contain literals, constants, and character
  //        property classes. Example:
  //
  //        [abc\U10A\p1,3,4]

  // structure used to handle a compacted range of characters
  PRange = ^TRange;
  TRange = record
    MinCode,
    MaxCode: UCS4;
  end;

  TCClass = record
    Ranges: array of TRange;
    RangesUsed: Integer;
  end;

  // either a single character or a list of character classes
  TSymbol = record
    Chr: UCS4;
    CCL: TCClass;
  end;

  // this is a general element structure used for expressions and stack elements
  TElement = record
    OnStack: Boolean;
    AType,
    LHS,
    RHS: Cardinal;
  end;

  // this is a structure used to track a list or a stack of states
  PStateList = ^TStateList;
  TStateList = record
    List: array of Cardinal;
    ListUsed: Integer;
  end;

  // structure to track the list of unique states for a symbol during reduction
  PSymbolTableEntry = ^TSymbolTableEntry;
  TSymbolTableEntry = record
    ID,
    AType: Cardinal;
    Mods,
    Props: Cardinal;
    Symbol: TSymbol;
    States: TStateList;
  end;

  // structure to hold a single State
  PState = ^TState;
  TState = record
    ID: Cardinal;
    Accepting: Boolean;
    StateList: TStateList;
    Transitions: array of TElement;
    TransitionsUsed: Integer;
  end;

  // structure used for keeping lists of states
  TStateTable = record
    States: array of TState;
    StatesUsed: Integer;
  end;

  // structure to track pairs of FDFA states when equivalent states are merged
  TEquivalent = record
    Left, Right: Cardinal;
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -