📄 csregex.pas

📁 Delphi script parser
💻 PAS
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
{:
This unit contains the TcsReExpr object for Regular expressions.<br>
Author: M.C. van der Kooij (MLWKooij@hetnet.nl)<br>
Translated to Delphi in may 1998<br>
Created a Caj Script version on 17 april 2000<br>

<br>
Last modification: 20 - march - 2000<br>
<br>
Original regexpr.c<br>
<br>
Author: Tatu Ylonen (ylo@ngs.fi)<br>
<br>
Copyright (c) 1991 Tatu Ylonen, Espoo, Finland<br>
<br>
Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that the
above copyright notice appear in all copies.<br>
This software is provided "as is" without express or implied warranty.<br>
<br>
Created: Thu Sep 26 17:14:05 1991 ylo<br>
Last modified: Mon Nov  4 17:06:48 1991 ylo<br>
Ported to Think C: 19 Jan 1992 guido@cwi.nl<p>
<br>
This code draws many ideas from the regular expression packages by
Henry Spencer of the University of Toronto and Richard Stallman of the
Free Software Foundation.<p>

Emacs-specific code and syntax table code is almost directly borrowed from
GNU regexp.

Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April 1997
Thanks for bug reports and ideas from Andrew Kuchling, Tim Peters,
Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and probably one or two
others that I'm forgetting.<p>

$Id: regexpr.c,v 1.28 1998/04/10 22:27:39 guido Exp $<p>

Last changes:<br>
<UL>
  <LI>2000-03-20 TOnMatchEvent, ret is now a var parameter, so you can continu
      on a other place (the end of a match for instance)
  <LI>2000-03-20 Bug fixed in re_match > cRepeat1 when handling a cSet (code
      used instead of pinst
  <LI>2000-03-20 Changes in the register values, the first register value was 1
      based, the other were 0 based, now all are 1 based
  <LI>2000-03-21 RE_HIGHCHARSWHITESPACE (type mkre_HighCharsWhitespace) when
      set, characters above 127 are whitespaces
  <LI>2000-03-21 Split and SplitX, fill a TStrings with the splitted text
  <LI>2000-04-01 In a set, ansi translation was always used, but it should be a
      translation.
  <LI>2000-04-01 In re_search, when not matched, a match was returned.
  <LI>2000-04-01 Added a few DoBuffer.. methods and changed some internal
      variables to make it capable of working with buffers.
</UL>
}
unit csregex;

interface

uses
  Windows, Classes, SysUtils, CS2, CS2_VAR;

{$IFNDEF VER70}
{$DEFINE DELPHI}
{$ENDIF}

{ MK:
  Assertions are not translated here, they should never be shown to users, and
 they only occurs when the program is corrupted / buggie }

{$IFDEF ver90}
const
{$ELSE}
resourcestring
{$ENDIF}
  SreAbnormal = 'Abnormal error, contact author!!';
  SreAssertion = 'Assertion: ';
  SreBadMregN = 'Bad match register number.';
  SreBadlyPPar = 'Badly placed parenthesis.';
  SreBadlyPSpe = 'Badly placed special character.';
  SreEndPrem = 'Regular expression ends prematurel!';
  SreSyntax = 'Regular expression syntax error.';
  SreToComplex = 'Regular expression too complex.';
  SreOptimize = 'Optimization error.';
  SreUnknowRE = 'Unknown regex opcode: memory corrupted?';

const
  {: number of registers }
  RE_NREGS = 100;

// bit definitions for syntax
  {: no quoting for parentheses }
  RE_NO_BK_PARENS = 1;

  {: no quoting for vertical bar }
  RE_NO_BK_VBAR = 2;

  {: quoting needed for + and ? }
  RE_BK_PLUS_QM = 4;

  {: | binds tighter than ^ and $ }
  RE_TIGHT_VBAR = 8;
  {: treat newline (in expression) as or }
  RE_NEWLINE_OR = 16;

  {: ^$?*+ are special in all contexts }
  RE_CONTEXT_INDEP_OPS = 32;

  {: ansi sequences (\n etc) and \xhh }
  RE_ANSI_HEX = 64;

  {: no gnu extensions }
  RE_NO_GNU_EXTENSIONS = 128;

  {: chars above 127 are whitespaces }
  RE_HIGHCHARSWHITESPACE = 256;

  //* definitions for some common regexp styles */
  RE_SYNTAX_AWK = (RE_NO_BK_PARENS or RE_NO_BK_VBAR or RE_CONTEXT_INDEP_OPS);
  RE_SYNTAX_EGREP = (RE_SYNTAX_AWK or RE_NEWLINE_OR);
  RE_SYNTAX_GREP = (RE_BK_PLUS_QM or RE_NEWLINE_OR);
  RE_SYNTAX_EMACS = 0;

  Sword = 1;
  Swhitespace = 2;
  Sdigit = 4;
  Soctaldigit = 8;
  Shexdigit = 16;

type
  {: Pointer type of Tmkre_registers. }
  Pmkre_registers = ^Tmkre_registers;
  {: Result of match. This is a 1 based record of start and end positions of the
     groups that where found. Tmkre_registers[0] is the start and end of the
     total match. The numbers 1..99 are the groups within parenthesis.
     Lastregister is the index of the last group. }
  Tmkre_registers = record
    _start: array[0..RE_NREGS - 1] of integer;
    _end: array[0..RE_NREGS - 1] of integer;
    LastRegister: integer;
  end;
  TmkreSyntaxStyle = (mkre_No_Bk_Parens, mkre_No_Bk_Vbar, mkre_Bk_Plus_Qm,
    mkre_Tight_Vbar, mkre_Newline_Or, mkre_Context_Indep_Ops,
    mkre_Ansi_Hex, mkre_No_Gnu_Extensions,
    mkre_HighCharsWhitespace);
  TmkreSyntaxStyles = set of TmkreSyntaxStyle;

  regexp_syntax_op = { syntax codes for plain and quoted characters }
    (
    Rend, // special code for end of regexp */
    Rnormal, // normal character */
    Ranychar, //* any character except newline */
    Rquote, //* the quote character */
    Rbol, //* match beginning of line */
    Reol, //* match end of line */
    Roptional, //* match preceding expression optionally */
    Rstar, //* match preceding expr zero or more times */
    Rplus, //* match preceding expr one or more times */
    Ror, //* match either of alternatives */
    Ropenpar, //* opening parenthesis */
    Rclosepar, //* closing parenthesis */
    Rmemory, //* match memory register */
    Rextended_memory, //* \vnn to match registers 10-99 */
    Ropenset, //* open set.  Internal syntax hard-coded below. */
                   //* the following are gnu extensions to "normal" regexp syntax */
    Rbegbuf, //* beginning of buffer */
    Rendbuf, //* end of buffer */
    RDigitChar, //* digit character */      RJ 2000-04-01 special for digits 0-9
    RNotDigitChar, //* not digit character */  RJ 2000-04-01 special for digits 0-9
    Rwordchar, //* word character */
    Rnotwordchar, //* not word character */
    Rwordbeg, //* beginning of word */
    Rwordend, //* end of word */
    Rwordbound, //* word bound */
    Rnotwordbound, //* not word bound */
    Rnum_ops
    );

  regexp_compiled_ops = //* opcodes for compiled regexp */
    (
    Cend, //* end of pattern reached */
    Cbol, //* beginning of line */
    Ceol, //* end of line */
    Cset, //* character set.  Followed by 32 bytes of set. */
    Cexact, //* followed by a byte to match */
    Canychar, //* matches any character except newline */
    Cstart_memory, //* set register start addr (followed by reg number) */
    Cend_memory, //* set register end addr (followed by reg number) */
    Cmatch_memory, //* match a duplicate of reg contents (regnum follows)*/
    Cjump, //* followed by two bytes (lsb,msb) of displacement. */
    Cstar_jump, //* will change to jump/update_failure_jump at runtime */
    Cfailure_jump, //* jump to addr on failure */
    Cupdate_failure_jump, //* update topmost failure point and jump */
    Cdummy_failure_jump, //* push a dummy failure point and jump */
    Cbegbuf, //* match at beginning of buffer */
    Cendbuf, //* match at end of buffer */
    Cwordbeg, //* match at beginning of word */
    Cwordend, //* match at end of word */
    Cwordbound, //* match if at word boundary */
    Cnotwordbound, //* match if not at word boundary */
    Csyntaxspec, //* matches syntax code (1 byte follows) */
    Cnotsyntaxspec, //* matches if syntax code does not match (1 byte follows) */
    Crepeat1
    );

type
{: This Exception is used in <See Class="TcsReExpr"> }
  ERegularExpression = class(Exception);

  Pregexp_t = ^Tregexp_t;
  Tregexp_t = record
    buffer: string; {compiled pattern}
    fastmap: string; {fastmap[ch] is true if ch can start pattern}
    translate: string; {translation to apply during compilation/matching}
    fastmap_accurate: boolean; {true if fastmap is valid}
    can_be_null: char; {true if can match empty string}
    uses_registers: boolean; {registers are used and need to be initialized}
    num_registers: integer; {number of registers used}
    anchor: byte; {anchor: 0=none 1=begline 2=begbuf}
  end;

  TOnMatchEvent = procedure(Sender: TObject; str: string; pos: integer; var ret: integer; re_registers: Tmkre_registers) of object;
  TOnSearchEvent = procedure(Sender: TObject; str: string; pos: integer; re_registers: Tmkre_registers) of object;

{: This component can search in string for Regular expressions.
   <p>Set <See Class="TmkreExpr" Property = "Str"> with the string and
   <See Class="TmkreExpr" Property = "Pattern"> with the Regular Expression.</p>

   <p>See <See Class="TmkreExpr" Property = "Pattern"> for an description of
   possible Expressions.</p> }
  TcsReExpr = class
  private
    FStyle: integer;
    FActive,
      FUseFastmap,
      FCanBeEmpty: boolean;
    Fstr: string;
    FNoChange,
      FStyleChange: boolean;
    FSyntaxStyles: TmkreSyntaxStyles;
    Fpattern: string; // uncompiled pattern
    FBuffer: pointer;
    FBufferEnd: pointer;
    FBufferSize: integer;
    regexp_t: Tregexp_t;
    FMatches: TStringList;
    FOnMatch: TOnMatchEvent;
    FOnStartMatch: TNotifyEvent;
    FOnEndMatch: TNotifyEvent;
    FOnSearch: TOnSearchEvent;
    re_syntax_table: array[0..255] of char;
    re_compile_initialized: boolean;
    regexp_plain_ops,
      regexp_quoted_ops: array[0..255] of regexp_syntax_op;
    regexp_precedences: array[0..ord(Rnum_ops)] of char;
    regexp_context_indep_ops: boolean;
    regexp_ansi_sequences: boolean;

    procedure CheckRegExp;
    procedure SetUseFastmap(const fstm: boolean);
    procedure SetCanBeEmpty(const BeEm: boolean);
    procedure SetStr(const str: string);
    procedure SetSyntaxStyles(const NewStyles: TmkreSyntaxStyles);
    function GetMatches: TStringList;
    procedure Setpattern(const pat: string);

    procedure Inser_Jump(const pos: integer;
      const opcode_type: regexp_compiled_ops;
      const addr: integer; var pattern_offset: integer; var pattern: string);
    function Ansi_Translate(const ch: char; const size: integer;
      var pos: integer; const regex, translate: string): char;
    function hex_char_to_decimal(const ch: char): char;
    function re_optimize: boolean;
    function re_optimize_star_jump(var code: PChar): boolean;
    function re_do_compile_fastmap(const bufferstr: string; const pos: integer;
      var can_be_null: char; const fastmap: PChar): boolean;
    procedure re_compile_fastmap_aux(var code: PChar; pos: integer;
      const visited: PChar; var can_be_null: char; const fastmap: PChar);
    procedure re_compile_fastmap;
    procedure re_compile_initialize;
    function re_compile_pattern: string;
    function re_match(const pos: integer;
      const old_regs: Pmkre_registers): integer;
    function re_search(pos, range: integer; const regs: Pmkre_registers): integer;
    function IntSplit(const split: TStrings; const maxsplit: integer;
      const retain: boolean): integer;
    procedure OnMatchTerminate;
  public
    constructor Create;
    destructor Destroy; override;
    procedure DoMatch;
    function DoSearch(const pos: integer): integer;
    function DoSearchWithRange(const pos, range: integer): integer;
    function DoBufferMatch(const Buffer: Pointer; const BufferLength: Integer;
      const regs: Pmkre_Registers): boolean;
    function DoBufferSearch(const Buffer: Pointer; const BufferLength: Integer): Pointer;
    function DoBufferSearchEx(const Buffer: Pointer; const BufferLength: Integer;
      const regs: Pmkre_registers): Pointer;
    procedure DoBufferGetMatches(const matches: TStrings;
      const Buffer: Pointer; const BufferLength: Integer);
    function Split(const split: TStrings; const maxsplit: integer): integer;
    function SplitX(const split: TStrings; const maxsplit: integer): integer;

    {: Translates characters to other characters.
    <p>This is an string which is: empty or has 256 characters</p>
    <p>When the string is 256 characters each character translates the
    corresponding char to another char. In pattern AND Str.
    So you have to fill it with char #0 till #255, and after that you can choose
    to map some characters to others.</p>

    Example:
<CODE>//Ignore case
var
  c: integer;
  translate: string;
begin
  SetLength(translate, 256);
  for c := 0 to 255 do
    translate[c] := char(c);
  for c := ord('a') to ord('z') do
    translate[c] := char(c - 32);
  mkreExpr1.translate := translate;
end;
</CODE>
    <p>See also: <See Class = "TmkreExpr" Property = "Str"></p> }
    property Translate: string read regexp_t.Translate write regexp_t.Translate;

    {: All matches on Str are stored in this stringlist.
       <p>When Matches is read Str will be matched for pattern. Its behaviour is
       like calling DoMatch.</p>

       <P>See also: <See Class="TmkreExpr" Property="Active">,
       <See Class="TmkreExpr" Property="Str">,
       <See Class="TmkreExpr" Method="DoMatch"></p> }
    property Matches: TStringList read GetMatches;

    {: The compiled expression, can be used in other regex. }
    property RegExp: Tregexp_t read regexp_t write regexp_t;
  published
    {: When active is set to true, all strings are matched when assigned.
    <p>See also: <See Class="TmkreExpr" Property="Str"></p>

    (This is equal to)
<CODE>
  Str := 'string to parse';
  DoMatch;
</CODE> }
    property Active: boolean read FActive write FActive;

    {: Enables quick search.
       <p>When True, an map is created which determinate which characters are
       searched for matching the first valid character in the pattern.
       (When [a-z]* is the pattern searches can skip all other characters,
       an match is first tried when a..z is found.)</p>

       <p>Default fastmap is True</p> }
    property UseFastmap: boolean read FUseFastmap write SetUseFastmap;

    {: When True, matches can contain empty strings. }
    property CanBeEmpty: boolean read FCanBeEmpty write SetCanBeEmpty;

    {: Pattern definition.
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -