⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexpr.pas

📁 文本编辑例子.可讲批量的处理想要的文本到界面里
💻 PAS
字号:
unit RegExpr;

interface

// ======== Determine compiler
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7

// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF D6}
  {$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF D7}
  {$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
  {$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
  {$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF FPC}
 {$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$ENDIF}

// ======== Define options for TRegExpr engine
{.$DEFINE UniCode} // Unicode support
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
{$IFNDEF FPC} // the option is not supported in FreePascal
 {$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // support braces in complex cases
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
 {$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
 {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
{$ENDIF}

// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}

// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}

// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}

uses
 Classes,  // TStrings in Split method
 SysUtils; // Exception

type
 {$IFDEF UniCode}
 PRegExprChar = PWideChar;
 RegExprString = WideString;
 REChar = WideChar;
 {$ELSE}
 PRegExprChar = PChar;
 RegExprString = AnsiString; //###0.952 was string
 REChar = Char;
 {$ENDIF}
 TREOp = REChar; // internal p-code type //###0.933
 PREOp = ^TREOp;
 TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
 PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
 TREBracesArg = integer; // type of {m,n} arguments
 PREBracesArg = ^TREBracesArg;

const
 REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
 RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
 REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-

type
 TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
                               of object;

const
  EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
  RegExprModifierI : boolean = False;    // default value for ModifierI
  RegExprModifierR : boolean = True;     // default value for ModifierR
  RegExprModifierS : boolean = True;     // default value for ModifierS
  RegExprModifierG : boolean = True;     // default value for ModifierG
  RegExprModifierM : boolean = False;    // default value for ModifierM
  RegExprModifierX : boolean = False;    // default value for ModifierX
  RegExprSpaceChars : RegExprString =    // default value for SpaceChars
  ' '#$9#$A#$D#$C;
  RegExprWordChars : RegExprString =     // default value for WordChars
    '0123456789' //###0.940
  + 'abcdefghijklmnopqrstuvwxyz'
  + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  RegExprLineSeparators : RegExprString =// default value for LineSeparators
   #$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
  RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
   #$d#$a;
  { if You need Unix-styled line separators (only \n), then use:
  RegExprLineSeparators = #$a;
  RegExprLinePairedSeparator = '';
  }


const
 NSUBEXP = 15; // max number of subexpression //###0.929
 // Cannot be more than NSUBEXPMAX
 // Be carefull - don't use values which overflow CLOSE opcode
 // (in this case you'll get compiler erorr).
 // Big NSUBEXP will cause more slow work and more stack required
 NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
 // Don't change it! It's defined by internal TRegExpr design.

 MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933

 {$IFDEF ComplexBraces}
 LoopStackMax = 10; // max depth of loops stack //###0.925
 {$ENDIF}

 TinySetLen = 3;
 // if range includes more then TinySetLen chars, //###0.934
 // then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
 // !!! Attension ! If you change TinySetLen, you must
 // change code marked as "//!!!TinySet"


type

{$IFDEF UseSetOfChar}
 PSetOfREChar = ^TSetOfREChar;
 TSetOfREChar = set of REChar;
{$ENDIF}

 TRegExpr = class;

 TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
                               of object;

 TRegExpr = class
   private
    startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
    endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points

    {$IFDEF ComplexBraces}
    LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
    LoopStackIdx : integer; // 0 - out of all loops
    {$ENDIF}

    // The "internal use only" fields to pass info from compile
    // to execute that permits the execute phase to run lots faster on
    // simple cases.
    regstart : REChar; // char that must begin a match; '\0' if none obvious
    reganch : REChar; // is the match anchored (at beginning-of-line only)?
    regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
    regmlen : integer; // length of regmust string
    // Regstart and reganch permit very fast decisions on suitable starting points
    // for a match, cutting down the work a lot.  Regmust permits fast rejection
    // of lines that cannot possibly match.  The regmust tests are costly enough
    // that regcomp() supplies a regmust only if the r.e. contains something
    // potentially expensive (at present, the only such thing detected is * or +
    // at the start of the r.e., which can involve a lot of backup).  Regmlen is
    // supplied because the test in regexec() needs it and regcomp() is computing
    // it anyway.
    {$IFDEF UseFirstCharSet} //###0.929
    FirstCharSet : TSetOfREChar;
    {$ENDIF}

    // work variables for Exec's routins - save stack in recursion}
    reginput : PRegExprChar; // String-input pointer.
    fInputStart : PRegExprChar; // Pointer to first char of input string.
    fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string

    // work variables for compiler's routines
    regparse : PRegExprChar;  // Input-scan pointer.
    regnpar : integer; // count.
    regdummy : char;
    regcode : PRegExprChar;   // Code-emit pointer; @regdummy = don't.
    regsize : integer; // Code size.

    regexpbeg : PRegExprChar; // only for error handling. Contains
    // pointer to beginning of r.e. while compiling
    fExprIsCompiled : boolean; // true if r.e. successfully compiled

    // programm is essentially a linear encoding
    // of a nondeterministic finite-state machine (aka syntax charts or
    // "railroad normal form" in parsing technology).  Each node is an opcode
    // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
    // all nodes except BRANCH implement concatenation; a "next" pointer with
    // a BRANCH on both ends of it is connecting two alternatives.  (Here we
    // have one of the subtle syntax dependencies:  an individual BRANCH (as
    // opposed to a collection of them) is never concatenated with anything
    // because of operator precedence.)  The operand of some types of node is
    // a literal string; for others, it is a node leading into a sub-FSM.  In
    // particular, the operand of a BRANCH node is the first node of the branch.
    // (NB this is *not* a tree structure:  the tail of the branch connects
    // to the thing following the set of BRANCHes.)  The opcodes are:
    programm : PRegExprChar; // Unwarranted chumminess with compiler.

    fExpression : PRegExprChar; // source of compiled r.e.
    fInputString : PRegExprChar; // input string

    fLastError : integer; // see Error, LastError

    fModifiers : integer; // modifiers
    fCompModifiers : integer; // compiler's copy of modifiers
    fProgModifiers : integer; // modifiers values from last programm compilation

    fSpaceChars : RegExprString; //###0.927
    fWordChars : RegExprString; //###0.929
    fInvertCase : TRegExprInvertCaseFunction; //###0.927

    fLineSeparators : RegExprString; //###0.941
    fLinePairedSeparatorAssigned : boolean;
    fLinePairedSeparatorHead,
    fLinePairedSeparatorTail : REChar;
    {$IFNDEF UniCode}
    fLineSeparatorsSet : set of REChar;
    {$ENDIF}

    procedure InvalidateProgramm;
    // Mark programm as have to be [re]compiled

    function IsProgrammOk : boolean; //###0.941
    // Check if we can use precompiled r.e. or
    // [re]compile it if something changed

    function GetExpression : RegExprString;
    procedure SetExpression (const s : RegExprString);

    function GetModifierStr : RegExprString;
    class function ParseModifiersStr (const AModifiers : RegExprString;
      var AModifiersInt : integer) : boolean; //###0.941 class function now
    // Parse AModifiers string and return true and set AModifiersInt
    // if it's in format 'ismxrg-ismxrg'.
    procedure SetModifierStr (const AModifiers : RegExprString);

    function GetModifier (AIndex : integer) : boolean;
    procedure SetModifier (AIndex : integer; ASet : boolean);

    procedure Error (AErrorID : integer); virtual; // error handler.
    // Default handler raise exception ERegExpr with
    // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
    // and CompilerErrorPos = value of property CompilerErrorPos.


    {==================== Compiler section ===================}
    function CompileRegExpr (exp : PRegExprChar) : boolean;
    // compile a regular expression into internal code

    procedure Tail (p : PRegExprChar; val : PRegExprChar);
    // set the next-pointer at the end of a node chain

    procedure OpTail (p : PRegExprChar; val : PRegExprChar);
    // regoptail - regtail on operand of first argument; nop if operandless

    function EmitNode (op : TREOp) : PRegExprChar;
    // regnode - emit a node, return location

    procedure EmitC (b : REChar);
    // emit (if appropriate) a byte of code

    procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
    // insert an operator in front of already-emitted operand
    // Means relocating the operand.

    function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
    // regular expression, i.e. main body or parenthesized thing

    function ParseBranch (var flagp : integer) : PRegExprChar;
    // one alternative of an | operator

    function ParsePiece (var flagp : integer) : PRegExprChar;
    // something followed by possible [*+?]

    function ParseAtom (var flagp : integer) : PRegExprChar;
    // the lowest level

    function GetCompilerErrorPos : integer;
    // current pos in r.e. - for error hanling

    {$IFDEF UseFirstCharSet} //###0.929
    procedure FillFirstCharSet (prog : PRegExprChar);
    {$ENDIF}

    {===================== Mathing section ===================}
    function regrepeat (p : PRegExprChar; AMax : integer) : integer;
    // repeatedly match something simple, report how many

    function regnext (p : PRegExprChar) : PRegExprChar;
    // dig the "next" pointer out of a node

    function MatchPrim (prog : PRegExprChar) : boolean;
    // recursively matching routine

    function ExecPrim (AOffset: integer) : boolean;
    // Exec for stored InputString

    {$IFDEF RegExpPCodeDump}
    function DumpOp (op : REChar) : RegExprString;
    {$ENDIF}

    function GetSubExprMatchCount : integer;
    function GetMatchPos (Idx : integer) : integer;
    function GetMatchLen (Idx : integer) : integer;
    function GetMatch (Idx : integer) : RegExprString;

    function GetInputString : RegExprString;
    procedure SetInputString (const AInputString : RegExprString);

    {$IFNDEF UseSetOfChar}
    function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
    {$ENDIF}

    procedure SetLineSeparators (const AStr : RegExprString);
    procedure SetLinePairedSeparator (const AStr : RegExprString);
    function GetLinePairedSeparator : RegExprString;

   public
    constructor Create;
    destructor Destroy; override;

    class function VersionMajor : integer; //###0.944
    class function VersionMinor : integer; //###0.944

    property Expression : RegExprString read GetExpression write SetExpression;
    // Regular expression.
    // For optimization, TRegExpr will automatically compiles it into 'P-code'
    // (You can see it with help of Dump method) and stores in internal
    // structures. Real [re]compilation occures only when it really needed -
    // while calling Exec[Next], Substitute, Dump, etc
    // and only if Expression or other P-code affected properties was changed
    // after last [re]compilation.
    // If any errors while [re]compilation occures, Error method is called
    // (by default Error raises exception - see below)

    property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
    // Set/get default values of r.e.syntax modifiers. Modifiers in
    // r.e. (?ismx-ismx) will replace this default values.
    // If you try to set unsupported modifier, Error will be called
    // (by defaul Error raises exception ERegExpr).

    property ModifierI : boolean index 1 read GetModifier write SetModifier;
    // Modifier /i - caseinsensitive, initialized from RegExprModifierI

    property ModifierR : boolean index 2 read GetModifier write SetModifier;
    // Modifier /r - use r.e.syntax extended for russian,
    // (was property ExtSyntaxEnabled in previous versions)
    // If true, then 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -