📄 unicode.pas
字号:
unit Unicode;
// Copyright (c) 1999, 2000 Mike Lischke (public@lischke-online.de)
// Portions Copyright (c) 1999, 2000 Azret Botash (az)
//
// 01-APR-2000 ml:
// preparation for public release
// FEB-MAR 2000 version 2.0 beta
// - Unicode regular expressions (URE) search class (TURESearch)
// - generic search engine base class for both the Boyer-Moore and the RE search class
// - whole word only search in UTBM, bug fixes in UTBM
// - string decompositon (including hangul)
// OCT/99 - JAN/2000 ml: version 1.0
// - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions
// - TWideStrings and TWideStringList classes
// - Unicode Tuned Boyer-Moore search class (TUTBMSearch)
// - low and high level Unicode/Wide* functions
// - low level Unicode UCS4 data import and functions
// - helper functions
//----------------------------------------------------------------------------------------------------------------------
// This unit contains routines and classes to manage and work with Unicode/WideStrings strings.
// You need Delphi 4 or higher to compile this code.
//
// Unicode encodings and wide strings:
// Currently there are several encoding schemes defined which describe (among others) the code size and (resulting from
// this) the usable value pool. Delphi supports the wide character data type for Unicode which corresponds to
// UCS2 (UTF-16 coding scheme). This scheme uses 2 bytes to store character values and can therefor handle up to
// 65536 characters. Another scheme is UCS4 (UTF-32 coding scheme) which uses 4 bytes per character. The first 65536
// code points correspond directly to those of UCS2. Other code points are mainly used for character surrogates.
// To provide support for UCS2 (WideChar in Delphi) as well as UCS4 the library is splitted into two parts. The low
// level part accepts and returns UCS4 characters while the high level part deals directly with WideChar/WideString
// data types. Additionally, UCS2 is defined as being WideChar to retain maximum compatibility.
//
// Publicy available low level functions are all preceded by "Unicode..." (e.g. in UnicodeToUpper) while
// the high level functions use the Str... or Wide... naming scheme (e.g. WideUpCase and WideUpperCase).
//
//----------------------------------------------------------------------------------------------------------------------
// Open issues:
// - Keep in mind that this unit is still in beta state. In particular the URE class does not yet work for all cases.
// - Yet to do things in the URE class are:
// - check all character classes if they match correctly
// - optimize rebuild of DFA (build only when pattern changes)
// - set flag parameter of ExecuteURE
// - add \d any decimal digit
// \D any character that is not a decimal digit
// \s any whitespace character
// \S any character that is not a whitespace character
// \w any "word" character
// \W any "non-word" character
// - For a perfect text search both the text to be searched through as well as the pattern must be normalized
// to allow to match, say, accented and unaccented characters or the ligature fi with the letter combination fi etc.
// Normalization is usually done by decomposing the string and optionally compose it again, but I had not yet the
// opportunity to go through the composition stuff.
// - The wide string classes still compare text with functions provided by the particular system. This works usually
// fine under WinNT/W2K (although also there are limitations like maximum text lengths). Under Win9x conversions
// from and to MBCS are necessary which are bound to a particular locale and so very limited in general use.
// These comparisons should be changed so that the code in this unit is used. This requires, though, a working
// composition implementation.
interface
uses
Windows, Classes,RTLConsts;
const
// definitions of often used characters:
// Note: Use them only for tests of a certain character not to determine character classes like
// white spaces as in Unicode are often many code points defined being in a certain class.
// Hence your best option is to use the various UnicodeIs* functions.
// can't use identifier "Null" here as this is already in a special Variant identifier
WideNull = WideChar(#0);
Tabulator = WideChar(#9);
Space = WideChar(#32);
// logical line breaks
LF = WideChar($A);
LineFeed = WideChar($A);
VerticalTab = WideChar($B);
FormFeed = WideChar($C);
CR = WideChar($D);
CarriageReturn = WideChar($D);
CRLF: WideString = #$D#$A;
LineSeparator = WideChar($2028);
ParagraphSeparator = WideChar($2029);
// byte order marks for strings
// Unicode text files should contain $FFFE as first character to identify such a file clearly. Depending on the system
// where the file was created on this appears either in big endian or little endian style.
BOM_LSB_FIRST = WideChar($FEFF); // this is how the BOM appears on x86 systems when written by a x86 system
BOM_MSB_FIRST = WideChar($FFFE);
type
// Unicode transformation formats (UTF) data types
UTF7 = Char;
UTF8 = Char;
UTF16 = WideChar;
UTF32 = Cardinal;
// UTF conversion schemes (UCS) data types
PUCS4 = ^UCS4;
UCS4 = Cardinal;
PUCS2 = PWideChar;
UCS2 = WideChar;
const
ReplacementCharacter: UCS4 = $0000FFFD;
MaximumUCS2: UCS4 = $0000FFFF;
MaximumUTF16: UCS4 = $0010FFFF;
MaximumUCS4: UCS4 = $7FFFFFFF;
SurrogateHighStart: UCS4 = $D800;
SurrogateHighEnd: UCS4 = $DBFF;
SurrogateLowStart: UCS4 = $DC00;
SurrogateLowEnd: UCS4 = $DFFF;
type
PCardinal = ^Cardinal;
TWideStrings = class;
TSearchFlags = set of (
sfCaseSensitive, // match letter case
sfIgnoreNonSpacing, // ignore non-spacing characters in search
sfSpaceCompress, // handle several consecutive white spaces as one white space
// (this applies to the pattern as well as the search text)
sfWholeWordOnly); // match only text at end/start and/or surrounded by white spaces
// a generic search class defininition used for tuned Boyer-Moore and Unicode regular expression searches
TSearchEngine = class
private
FResults: TList; // 2 entries for each result (start and stop position)
FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster
protected
function GetCount: Integer; virtual;
public
constructor Create(AOwner: TWideStrings); virtual;
destructor Destroy; override;
procedure AddResult(Start, Stop: Cardinal); virtual;
procedure Clear; virtual;
procedure ClearResults; virtual;
procedure DeleteResult(Index: Cardinal); virtual;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract;
procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; virtual; abstract;
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
function FindAll(const Text: WideString): Boolean; overload; virtual; abstract;
function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; overload; virtual; abstract;
procedure GetResult(Index: Cardinal; var Start, Stop: Integer); virtual;
property Count: Integer read GetCount;
end;
// The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended translation created from a free package
// written by Mark Leisher (mleisher@crl.nmsu.edu).
//
// The code handles high and low surrogates as well as case (in)dependency, can ignore non-spacing characters and
// allows optionally to return whole words only.
// single pattern character
PUTBMChar = ^TUTBMChar;
TUTBMChar = record
LoCase,
UpCase,
TitleCase: UCS4;
end;
PUTBMSkip = ^TUTBMSkip;
TUTBMSkip = record
BMChar: PUTBMChar;
SkipValues: Integer;
end;
TUTBMSearch = class(TSearchEngine)
private
FFlags: TSearchFlags;
FPattern: PUTBMChar;
FPatternUsed,
FPatternSize,
FPatternLength: Cardinal;
FSkipValues: PUTBMSkip;
FSkipsUsed: Integer;
FMD4: Cardinal;
protected
procedure ClearPattern;
procedure Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags);
function Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
function GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal;
function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean;
public
constructor Create(AOwner: TWideStrings); override;
destructor Destroy; override;
procedure Clear; override;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); override;
procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); override;
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; override;
function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; override;
function FindAll(const Text: WideString): Boolean; override;
function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; override;
end;
// Regular expression search engine for text in UCS2 form taking surrogates into account.
// This implementation is an improved translation from the URE package written by Mark Leisher (mleisher@crl.nmsu.edu)
// who used a variation of the RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu).
// Assumptions:
// o Regular expression and text already normalized.
// o Conversion to lower case assumes a 1-1 mapping.
//
// Definitions:
// Separator - any one of U+2028, U+2029, NL, CR.
//
// Operators:
// . - match any character
// * - match zero or more of the last subexpression
// + - match one or more of the last subexpression
// ? - match zero or one of the last subexpression
// () - subexpression grouping
// {m, n} - match at least m occurences and up to n occurences
// Note: both values can be 0 or ommitted which denotes then a unlimiting bound
// {,} and {0,} and {0, 0} correspond to *
// {, 1} and {0, 1} correspond to ?
// {1,} and {1, 0} correspond to +
// {m} - match exactly m occurences
//
// Notes:
// o The "." operator normally does not match separators, but a flag is
// available that will allow this operator to match a separator.
//
// Literals and Constants:
// c - literal UCS2 character
// \x.... - hexadecimal number of up to 4 digits
// \X.... - hexadecimal number of up to 4 digits
// \u.... - hexadecimal number of up to 4 digits
// \U.... - hexadecimal number of up to 4 digits
//
// Character classes:
// [...] - Character class
// [^...] - Negated character class
// \pN1,N2,...,Nn - Character properties class
// \PN1,N2,...,Nn - Negated character properties class
//
// POSIX character classes recognized:
// :alnum:
// :alpha:
// :cntrl:
// :digit:
// :graph:
// :lower:
// :print:
// :punct:
// :space:
// :upper:
// :xdigit:
//
// Notes:
// o Character property classes are \p or \P followed by a comma separated
// list of integers between 1 and 32. These integers are references to
// the following character properties:
//
// N Character Property
// --------------------------
// 1 _URE_NONSPACING
// 2 _URE_COMBINING
// 3 _URE_NUMDIGIT
// 4 _URE_NUMOTHER
// 5 _URE_SPACESEP
// 6 _URE_LINESEP
// 7 _URE_PARASEP
// 8 _URE_CNTRL
// 9 _URE_PRIVATE
// 10 _URE_UPPER (note: upper, lower and titel case classes need to have case
// 11 _URE_LOWER sensitive search be enabled to match correctly!)
// 12 _URE_TITLE
// 13 _URE_MODIFIER
// 14 _URE_OTHERLETTER
// 15 _URE_DASHPUNCT
// 16 _URE_OPENPUNCT
// 17 _URE_CLOSEPUNCT
// 18 _URE_OTHERPUNCT
// 19 _URE_MATHSYM
// 20 _URE_CURRENCYSYM
// 21 _URE_OTHERSYM
// 22 _URE_LTR
// 23 _URE_RTL
// 24 _URE_EURONUM
// 25 _URE_EURONUMSEP
// 26 _URE_EURONUMTERM
// 27 _URE_ARABNUM
// 28 _URE_COMMONSEP
// 29 _URE_BLOCKSEP
// 30 _URE_SEGMENTSEP
// 31 _URE_WHITESPACE
// 32 _URE_OTHERNEUT
//
// o Character classes can contain literals, constants, and character
// property classes. Example:
//
// [abc\U10A\p1,3,4]
// structure used to handle a compacted range of characters
PRange = ^TRange;
TRange = record
MinCode,
MaxCode: UCS4;
end;
TCClass = record
Ranges: array of TRange;
RangesUsed: Integer;
end;
// either a single character or a list of character classes
TSymbol = record
Chr: UCS4;
CCL: TCClass;
end;
// this is a general element structure used for expressions and stack elements
TElement = record
OnStack: Boolean;
AType,
LHS,
RHS: Cardinal;
end;
// this is a structure used to track a list or a stack of states
PStateList = ^TStateList;
TStateList = record
List: array of Cardinal;
ListUsed: Integer;
end;
// structure to track the list of unique states for a symbol during reduction
PSymbolTableEntry = ^TSymbolTableEntry;
TSymbolTableEntry = record
ID,
AType: Cardinal;
Mods,
Props: Cardinal;
Symbol: TSymbol;
States: TStateList;
end;
// structure to hold a single State
PState = ^TState;
TState = record
ID: Cardinal;
Accepting: Boolean;
StateList: TStateList;
Transitions: array of TElement;
TransitionsUsed: Integer;
end;
// structure used for keeping lists of states
TStateTable = record
States: array of TState;
StatesUsed: Integer;
end;
// structure to track pairs of FDFA states when equivalent states are merged
TEquivalent = record
Left, Right: Cardinal;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -