📄 jclunicode.pas
字号:
ubHangulCompatibilityJamo,
ubKanbun,
ubBopomofoExtended,
ubEnclosedCJKLettersAndMonths,
ubCJKCompatibility,
ubCJKUnifiedIdeographsExtensionA,
ubCJKUnifiedIdeographs,
ubYiSyllables,
ubYiRadicals,
ubHangulSyllables,
ubHighSurrogates,
ubHighPrivateUseSurrogates,
ubLowSurrogates,
ubPrivateUse,
ubCJKCompatibilityIdeographs,
ubAlphabeticPresentationForms,
ubArabicPresentationFormsA,
ubCombiningHalfMarks,
ubCJKCompatibilityForms,
ubSmallFormVariants,
ubArabicPresentationFormsB,
ubSpecials,
ubHalfwidthAndFullwidthForms,
ubOldItalic,
ubGothic,
ubDeseret,
ubByzantineMusicalSymbols,
ubMusicalSymbols,
ubMathematicalAlphanumericSymbols,
ubCJKUnifiedIdeographsExtensionB,
ubCJKCompatibilityIdeographsSupplement,
ubTags
);
// Forward declaration: lets TSearchEngine refer to TWideStrings (owner field and
// constructor parameter) before the class itself is fully declared.
TWideStrings = class;
// Options that control how a search pattern is matched against the text.
TSearchFlag = (
sfCaseSensitive, // match letter case
sfIgnoreNonSpacing, // ignore non-spacing characters in search
sfSpaceCompress, // handle several consecutive white spaces as one white space
// (this applies to the pattern as well as the search text)
sfWholeWordOnly // match only text at end/start and/or surrounded by white spaces
);
// Any combination of TSearchFlag values; this is the type of the Options
// parameter taken by the FindPrepare overloads of the search engines.
TSearchFlags = set of TSearchFlag;
// a generic search class definition used for tuned Boyer-Moore and Unicode
// regular expression searches
// Abstract base class of the search engines declared in this unit (TUTBMSearch
// and TURESearch derive from it). It declares the result list bookkeeping and
// the abstract FindPrepare/FindFirst/FindAll interface. Each of these three
// comes in two overloads: one taking a WideString and one taking a PWideChar
// together with an explicit length.
TSearchEngine = class(TObject)
private
FResults: TList; // 2 entries for each result (start and stop position)
FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster
protected
// read accessor of the Count property
function GetCount: Integer; virtual;
public
constructor Create(AOwner: TWideStrings); virtual;
destructor Destroy; override;
// Result list maintenance. The implementations are not part of this
// declaration.
// NOTE(review): presumably Clear resets the whole engine while ClearResults only
// empties the result list - confirm against the implementation.
procedure AddResult(Start, Stop: Cardinal); virtual;
procedure Clear; virtual;
procedure ClearResults; virtual;
procedure DeleteResult(Index: Cardinal); virtual;
// Abstract search interface, must be overridden by descendants.
// FindPrepare receives the pattern and the search options, FindFirst returns the
// match bounds in the var parameters Start and Stop, FindAll only returns a
// Boolean (presumably the matches are then available via Count/GetResult -
// confirm against the implementation).
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract;
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; virtual; abstract;
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
function FindAll(const Text: WideString): Boolean; overload; virtual; abstract;
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; virtual; abstract;
// NOTE(review): GetResult hands positions back as Integer whereas AddResult and
// FindFirst use Cardinal - confirm that the signedness difference is intended.
procedure GetResult(Index: Cardinal; var Start, Stop: Integer); virtual;
property Count: Integer read GetCount;
end;
// The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended
// translation created from a free package written by Mark Leisher (mleisher att crl dott nmsu dott edu).
//
// The code handles high and low surrogates as well as case (in)dependency,
// can ignore non-spacing characters and can optionally return whole
// words only.
// single pattern character
// pointer to a TUTBMChar record
PUTBMChar = ^TUTBMChar;
// Three UCS4 values for one pattern character; judging by the field names these
// are its lower case, upper case and title case forms (confirm against Compile).
TUTBMChar = record
LoCase,
UpCase,
TitleCase: UCS4;
end;
// pointer to a TUTBMSkip record
PUTBMSkip = ^TUTBMSkip;
// Associates a pattern character with an integer skip value.
// NOTE(review): presumably the Boyer-Moore shift distance for that character -
// confirm against GetSkipValue/Compile.
TUTBMSkip = record
BMChar: PUTBMChar;
SkipValues: Integer;
end;
// Search engine derived from TSearchEngine which overrides the complete abstract
// search interface (FindPrepare, FindFirst, FindAll) as well as Clear.
// Only the declaration is visible here; remarks on the private fields are
// derived from their names and types and need to be confirmed against the
// implementation.
TUTBMSearch = class(TSearchEngine)
private
FFlags: TSearchFlags; // search options (same type as the Flags parameter of Compile)
FPattern: PUTBMChar; // pointer to pattern character records
// NOTE(review): FPatternUsed/FPatternSize/FPatternLength presumably are the used
// entry count, the allocated entry count and the pattern length - confirm
FPatternUsed: Cardinal;
FPatternSize: Cardinal;
FPatternLength: Cardinal;
FSkipValues: PUTBMSkip; // pointer to skip records
FSkipsUsed: Integer;
// NOTE(review): the meaning of FMD4 is not evident from the declaration
FMD4: Cardinal;
protected
// internal helpers working on UCS2 text pointers
procedure ClearPattern;
procedure Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags);
function Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
function GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal;
function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean;
public
// overrides of the TSearchEngine interface
procedure Clear; override;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; override;
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; override;
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; override;
function FindAll(const Text: WideString): Boolean; overload; override;
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; override;
end;
// Regular expression search engine for text in UCS2 form taking surrogates
// into account. This implementation is an improved translation from the URE
// package written by Mark Leisher (mleisher att crl dott nmsu dott edu) who used a variation
// of the RE->DFA algorithm done by Mark Hopkins (markh att csd4 dott csd dott uwm dott edu).
// Assumptions:
// o Regular expression and text already normalized.
// o Conversion to lower case assumes a 1-1 mapping.
//
// Definitions:
// Separator - any one of U+2028, U+2029, NL, CR.
//
// Operators:
// . - match any character
// * - match zero or more of the last subexpression
// + - match one or more of the last subexpression
// ? - match zero or one of the last subexpression
// () - subexpression grouping
// {m, n} - match at least m occurrences and up to n occurrences
// Note: both values can be 0 or omitted, which then denotes an unlimited bound
// {,} and {0,} and {0, 0} correspond to *
// {, 1} and {0, 1} correspond to ?
// {1,} and {1, 0} correspond to +
// {m} - match exactly m occurrences
//
// Notes:
// o The "." operator normally does not match separators, but a flag is
// available that will allow this operator to match a separator.
//
// Literals and Constants:
// c - literal UCS2 character
// \x.... - hexadecimal number of up to 4 digits
// \X.... - hexadecimal number of up to 4 digits
// \u.... - hexadecimal number of up to 4 digits
// \U.... - hexadecimal number of up to 4 digits
//
// Character classes:
// [...] - Character class
// [^...] - Negated character class
// \pN1,N2,...,Nn - Character properties class
// \PN1,N2,...,Nn - Negated character properties class
//
// POSIX character classes recognized:
// :alnum:
// :alpha:
// :cntrl:
// :digit:
// :graph:
// :lower:
// :print:
// :punct:
// :space:
// :upper:
// :xdigit:
//
// Notes:
// o Character property classes are \p or \P followed by a comma separated
// list of integers between 0 and the maximum entry index in TCharacterCategory.
// These integers directly correspond to the TCharacterCategory enumeration entries.
// Note: upper, lower and title case classes require case sensitive search
// to be enabled in order to match correctly!
//
// o Character classes can contain literals, constants and character
// property classes. Example:
//
// [abc\U10A\p0,13,4]
// structure used to handle a compacted range of characters
// pointer to a TUcRange record
PUcRange = ^TUcRange;
// A range of characters given by its two UCS4 bounds.
// NOTE(review): presumably both bounds are inclusive - confirm against AddRange.
TUcRange = record
MinCode,
MaxCode: UCS4;
end;
// A character class stored as a dynamic array of character ranges plus a counter.
// NOTE(review): presumably RangesUsed is the number of valid entries in Ranges
// (the array may be allocated larger) - confirm against the implementation.
TUcCClass = record
Ranges: array of TUcRange;
RangesUsed: Integer;
end;
// either a single character or a list of character classes
TUcSymbol = record
Chr: UCS4; // the single character form of a symbol
CCL: TUcCClass; // the character class form of a symbol
// NOTE(review): this is a plain record, not a variant record, so both fields
// always exist; which one is meaningful is presumably decided by the type field
// of the owning symbol table entry - confirm
end;
// this is a general element structure used for expressions and stack elements
TUcElement = record
OnStack: Boolean;
// AType, LHS and RHS carry the same names as the parameters of
// TURESearch.MakeExpression.
// NOTE(review): presumably AType is the element/operator type and LHS/RHS are
// the left and right hand operands - confirm against the implementation.
AType,
LHS,
RHS: Cardinal;
end;
// this is a structure used to track a list or a stack of states
// pointer to a TUcStateList record
PUcStateList = ^TUcStateList;
// Dynamic array of Cardinal values plus a counter.
// NOTE(review): presumably ListUsed is the number of valid entries in List
// (and the stack height when the record is used as a stack) - confirm.
TUcStateList = record
List: array of Cardinal;
ListUsed: Integer;
end;
// structure to track the list of unique states for a symbol during reduction
// pointer to a TUcSymbolTableEntry record
PUcSymbolTableEntry = ^TUcSymbolTableEntry;
// One entry of the symbol table: an identifier, a type code, two character
// category sets, the symbol itself and the list of states tracked for it.
TUcSymbolTableEntry = record
ID,
AType: Cardinal;
// NOTE(review): the difference between Mods and Categories is not evident from
// the declaration - confirm against the implementation
Mods,
Categories: TCharacterCategories;
Symbol: TUcSymbol;
States: TUcStateList;
end;
// structure to hold a single State
// pointer to a TUcState record
PUcState = ^TUcState;
// A single state: identifier, accepting flag, an attached state list and a
// dynamic array of transitions with its counter.
TUcState = record
ID: Cardinal;
Accepting: Boolean;
StateList: TUcStateList;
Transitions: array of TUcElement;
// NOTE(review): presumably the number of valid entries in Transitions - confirm
TransitionsUsed: Integer;
end;
// structure used for keeping lists of states
// Dynamic array of TUcState records plus a counter.
// NOTE(review): presumably StatesUsed is the number of valid entries in States -
// confirm against the implementation.
TUcStateTable = record
States: array of TUcState;
StatesUsed: Integer;
end;
// structure to track pairs of DFA states when equivalent states are merged
// A pair of two Cardinal values.
// NOTE(review): presumably these are the identifiers/indices of the two states
// considered equivalent (compare TURESearch.AddEquivalentPair) - confirm.
TUcEquivalent = record
Left,
Right: Cardinal;
end;
// Dynamic array of expression elements plus a counter.
// NOTE(review): presumably ExpressionsUsed is the number of valid entries in
// Expressions - confirm against the implementation.
TUcExpressionList = record
Expressions: array of TUcElement;
ExpressionsUsed: Integer;
end;
// Dynamic array of symbol table entries plus a counter.
// NOTE(review): presumably SymbolsUsed is the number of valid entries in
// Symbols - confirm against the implementation.
TUcSymbolTable = record
Symbols: array of TUcSymbolTableEntry;
SymbolsUsed: Integer;
end;
// Dynamic array of equivalent pairs plus a counter.
// NOTE(review): presumably EquivalentsUsed is the number of valid entries in
// Equivalents - confirm against the implementation.
TUcEquivalentList = record
Equivalents: array of TUcEquivalent;
EquivalentsUsed: Integer;
end;
// structure used for constructing the NFA and reducing to a minimal DFA
// pointer to a TUREBuffer record
PUREBuffer = ^TUREBuffer;
// Working state of the regular expression compiler: status fields, a stack and
// the symbol, expression, state and equivalent tables.
TUREBuffer = record
// NOTE(review): presumably set while the NFA is being reduced - confirm
Reducing: Boolean;
// NOTE(review): the meaning of the error code values is not visible here
Error: Integer;
Flags: Cardinal;
Stack: TUcStateList;
SymbolTable: TUcSymbolTable; // table of unique symbols encountered
ExpressionList: TUcExpressionList; // tracks the unique expressions generated
// for the NFA and when the NFA is reduced
States: TUcStateTable; // the reduced table of unique groups of NFA states
EquivalentList: TUcEquivalentList; // tracks states when equivalent states are merged
end;
// A single transition given by a symbol and a follow-up state.
// NOTE(review): presumably Symbol refers to a symbol table entry and NextState
// to an entry of the DFA state list - confirm against the implementation.
TUcTransition = record
Symbol,
NextState: Cardinal;
end;
// pointer to a TDFAState record
PDFAState = ^TDFAState;
// One state of the DFA: accepting flag plus a transition count and a start
// transition.
// NOTE(review): presumably StartTransition is the index of the first of
// NumberTransitions consecutive entries in the DFA's transition list - confirm.
TDFAState = record
Accepting: Boolean;
NumberTransitions: Integer;
StartTransition: Integer;
end;
// Dynamic array of DFA states plus a counter.
// NOTE(review): presumably StatesUsed is the number of valid entries in States -
// confirm against the implementation.
TDFAStates = record
States: array of TDFAState;
StatesUsed: Integer;
end;
// Dynamic array of transitions plus a counter.
// NOTE(review): presumably TransitionsUsed is the number of valid entries in
// Transitions - confirm against the implementation.
TUcTransitions = record
Transitions: array of TUcTransition;
TransitionsUsed: Integer;
end;
// The DFA as kept by TURESearch: flags, a symbol table, the list of states and
// the list of transitions.
TDFA = record
Flags: Cardinal;
SymbolTable: TUcSymbolTable;
StateList: TDFAStates;
TransitionList: TUcTransitions;
end;
// Search engine derived from TSearchEngine which overrides the complete abstract
// search interface (FindPrepare, FindFirst, FindAll) as well as Clear.
// It keeps a TUREBuffer (compiler working state) and a TDFA.
// Only the declaration is visible here; remarks on individual helpers are
// derived from their names and signatures and need to be confirmed against the
// implementation.
TURESearch = class(TSearchEngine)
private
FUREBuffer: TUREBuffer;
FDFA: TDFA;
protected
// Internal helpers. Those taking a PUCS2 pointer together with a Limit and
// returning a Cardinal presumably return the number of consumed code units
// (compare the explicit Consumed parameter of MakeSymbol) - confirm.
procedure AddEquivalentPair(L, R: Cardinal);
procedure AddRange(var CCL: TUcCClass; Range: TUcRange);
function AddState(NewStates: array of Cardinal): Cardinal;
procedure AddSymbolState(Symbol, State: Cardinal);
function BuildCharacterClass(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
procedure ClearUREBuffer;
function CompileSymbol(S: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
procedure CompileURE(RE: PWideChar; RELength: Cardinal; Casefold: Boolean);
procedure CollectPendingOperations(var State: Cardinal);
function ConvertRegExpToNFA(RE: PWideChar; RELength: Cardinal): Cardinal;
function ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
procedure ClearDFA;
procedure HexDigitSetup(Symbol: PUcSymbolTableEntry);
function MakeExpression(AType, LHS, RHS: Cardinal): Cardinal;
function MakeHexNumber(NP: PUCS2; Limit: Cardinal; var Number: Cardinal): Cardinal;
function MakeSymbol(S: PUCS2; Limit: Cardinal; var Consumed: Cardinal): Cardinal;
procedure MergeEquivalents;
function ParsePropertyList(Properties: PUCS2; Limit: Cardinal; var Categories: TCharacterCategories): Cardinal;
// Peek, Pop and Push: stack access (presumably on FUREBuffer.Stack - confirm)
function Peek: Cardinal;
function Pop: Cardinal;
function PosixCCL(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
function ProbeLowSurrogate(LeftState: PUCS2; Limit: Cardinal; var Code: UCS4): Cardinal;
procedure Push(V: Cardinal);
procedure Reduce(Start: Cardinal);
procedure SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories);
function SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean;
public
// overrides of the TSearchEngine interface
procedure Clear; override;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; override;
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; override;
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; override;
function FindAll(const Text: WideString): Boolean; overload; override;
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; override;
end;
// Event used to give the application a chance to switch the way of how to save
// the text in TWideStrings if the text contains characters not only from the
// ANSI block but the save type is ANSI. On triggering the event the application
// can change the property SaveUnicode as needed. This property is again checked
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -