📄 regex.scm
字号:
;;;; regex.scm - Unit for using the PCRE regex package;; Copyright (c) 2000-2007, Felix L. Winkelmann; Copyright (c) 2008, The Chicken Team; All rights reserved.;; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following; conditions are met:;; Redistributions of source code must retain the above copyright notice, this list of conditions and the following; disclaimer.; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following; disclaimer in the documentation and/or other materials provided with the distribution.; Neither the name of the author nor the names of its contributors may be used to endorse or promote; products derived from this software without specific prior written permission.;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE; POSSIBILITY OF SUCH DAMAGE.(cond-expand [chicken-compile-shared] [else (declare (unit regex))] )(declare (usual-integrations) (disable-interrupts) (generic) ; PCRE options use lotsa bits (disable-warning var) (bound-to-procedure ;; Forward reference regex-chardef-table? make-anchored-pattern ;; Imports get-output-string open-output-string string->list list->string string-length string-ref substring make-string string-append reverse list-ref char=? char-alphabetic? char-numeric? char->integer set-finalizer! ##sys#pointer? ##sys#slot ##sys#setslot ##sys#size ##sys#make-structure ##sys#structure? ##sys#error ##sys#signal-hook ##sys#substring ##sys#fragments->string ##sys#make-c-string ##sys#string-append ##sys#write-char-0 ) (export regex-chardef-table? regex-chardef-table regexp? regexp regexp* regexp-optimize make-anchored-pattern string-match string-match-positions string-search string-search-positions string-split-fields string-substitute string-substitute* glob? glob->regexp grep regexp-escape ) )(cond-expand [paranoia] [else (declare (no-bound-checks) (no-procedure-checks-for-usual-bindings) ) ] )(cond-expand [unsafe (eval-when (compile) (define-macro (##sys#check-chardef-table . _) '(##core#undefined)) (define-macro (##sys#check-integer . _) '(##core#undefined)) (define-macro (##sys#check-blob . _) '(##core#undefined)) (define-macro (##sys#check-vector . _) '(##core#undefined)) (define-macro (##sys#check-structure . _) '(##core#undefined)) (define-macro (##sys#check-range . _) '(##core#undefined)) (define-macro (##sys#check-pair . _) '(##core#undefined)) (define-macro (##sys#check-list . _) '(##core#undefined)) (define-macro (##sys#check-symbol . _) '(##core#undefined)) (define-macro (##sys#check-string . _) '(##core#undefined)) (define-macro (##sys#check-char . _) '(##core#undefined)) (define-macro (##sys#check-exact . _) '(##core#undefined)) (define-macro (##sys#check-port . _) '(##core#undefined)) (define-macro (##sys#check-number . _) '(##core#undefined)) (define-macro (##sys#check-byte-vector . _) '(##core#undefined)) ) ] [else (define (##sys#check-chardef-table x loc) (unless (regex-chardef-table? x) (##sys#error loc "invalid character definition tables structure" x) ) ) (declare (bound-to-procedure ;; Imports ##sys#check-string ##sys#check-list ##sys#check-exact ##sys#check-vector ##sys#check-structure ##sys#check-symbol ##sys#check-blob ##sys#check-integer ) (export ##sys#check-chardef-table ) (emit-exports "regex.exports") ) ] );;;#>#include "pcre.h"<#;;;(register-feature! 'regex 'pcre);;; From unit lolevel:(define-inline (%tag-pointer ptr tag) (let ([tp (##sys#make-tagged-pointer tag)]) (##core#inline "C_copy_pointer" ptr tp) tp ) )(define-inline (%tagged-pointer? x tag) (and (##core#inline "C_blockp" x) (##core#inline "C_taggedpointerp" x) (eq? tag (##sys#slot x 1)) ) );;; PCRE Types:(define-foreign-type pcre (c-pointer "pcre"))(define-foreign-type nonnull-pcre (nonnull-c-pointer "pcre"))(define-foreign-type pcre_extra (c-pointer "pcre_extra"))(define-foreign-type nonnull-pcre_extra (nonnull-c-pointer "pcre_extra"))(define-foreign-variable PCRE_CASELESS unsigned-integer)(define-foreign-variable PCRE_EXTENDED unsigned-integer)(define-foreign-variable PCRE_UTF8 unsigned-integer);FIXME the use of 'define-foreign-enum' causes unused global variable warning!(define-foreign-enum (pcre-option unsigned-integer) (caseless PCRE_CASELESS) (multiline PCRE_MULTILINE) (dotall PCRE_DOTALL) (extended PCRE_EXTENDED) (anchored PCRE_ANCHORED) (dollar-endonly PCRE_DOLLAR_ENDONLY) (extra PCRE_EXTRA) (notbol PCRE_NOTBOL) (noteol PCRE_NOTEOL) (ungreedy PCRE_UNGREEDY) (notempty PCRE_NOTEMPTY) (utf8 PCRE_UTF8) (no-auto-capture PCRE_NO_AUTO_CAPTURE) (no-utf8-check PCRE_NO_UTF8_CHECK) (auto-callout PCRE_AUTO_CALLOUT) (partial PCRE_PARTIAL) (dfa-shortest PCRE_DFA_SHORTEST) (dfa-restart PCRE_DFA_RESTART) (firstline PCRE_FIRSTLINE) (dupnames PCRE_DUPNAMES) (newline-cr PCRE_NEWLINE_CR) (newline-lf PCRE_NEWLINE_LF) (newline-crlf PCRE_NEWLINE_CRLF) (newline-any PCRE_NEWLINE_ANY) (newline-anycrlf PCRE_NEWLINE_ANYCRLF) (bsr-anycrlf PCRE_BSR_ANYCRLF) (bsr-unicode PCRE_BSR_UNICODE) );;; The regexp structure primitives:(define re-finalizer (foreign-lambda void "pcre_free" c-pointer) )(define-inline (%make-regexp code) (set-finalizer! code re-finalizer) (##sys#make-structure 'regexp code #f 0) )(define-inline (%regexp? x) (##sys#structure? x 'regexp) )(define-inline (%regexp-code rx) (##sys#slot rx 1) )(define-inline (%regexp-extra rx) (##sys#slot rx 2) )(define-inline (%regexp-options rx) (##sys#slot rx 3) )(define-inline (%regexp-extra-set! rx extra) (when extra (set-finalizer! extra re-finalizer)) (##sys#setslot rx 2 extra) )(define-inline (%regexp-options-set! rx options) (##sys#setslot rx 3 options) );;; Character Definition Tables:;; The minimum necessary to handle chardef table parameters.;;(define (regex-chardef-table? x) (%tagged-pointer? x 'chardef-table) );; Get a character definitions tables structure for the current locale.(define regex-chardef-table (let ([re-maketables (foreign-lambda* (c-pointer unsigned-char) () "return (pcre_maketables ());")] [re-make-chardef-table-type (lambda (tables) (%tag-pointer tables 'chardef-table) ) ] ) (lambda (#!optional tables) ; Using this to type tag a ref is a bit of a hack but beats ; having another public variable. (if tables ; then existing reference so just tag it (if (##sys#pointer? tables) (re-make-chardef-table-type tables) (##sys#signal-hook #:type-error 'regex-chardef-table "bad argument type - not a pointer" tables) ) ; else make a new chardef tables (let ([tables (re-maketables)]) (if tables (let ([tables (re-make-chardef-table-type tables)]) (set-finalizer! tables re-finalizer) tables ) (##sys#error-hook 6 'regex-chardef-table) ) ) ) ) ) );;; Regexp record:(define (regexp? x) (%regexp? x) );;; PCRE errors:#>static const char *C_regex_error;static int C_regex_error_offset;<#(define-foreign-variable C_regex_error c-string)(define-foreign-variable C_regex_error_offset int)(define re-error (let ([string-append string-append]) (lambda (loc msg . args) (apply ##sys#error loc (string-append msg " - " C_regex_error) args) ) ) );;; Compile regular expression:;FIXME nonnull-unsigned-c-string causes problems - converted string is too long!(define re-compile (foreign-lambda* pcre ((nonnull-c-string patt) (unsigned-integer options) ((const (c-pointer unsigned-char)) tables)) "return(pcre_compile(patt, options, &C_regex_error, &C_regex_error_offset, tables));") )(define (re-checked-compile pattern options tables loc) (##sys#check-string pattern loc) (or (re-compile pattern options #f) (re-error loc "cannot compile regular expression" pattern C_regex_error_offset) ) );; Compile with subset of options and no tables(define (regexp pattern . options) (let ([options->integer (lambda () (if (null? options) 0 (+ (if (car options) PCRE_CASELESS 0) (let ((options (cdr options))) (if (null? options) 0 (+ (if (car options) PCRE_EXTENDED 0) (let ((options (cdr options))) (if (and (pair? options) (car options)) PCRE_UTF8 0 ) ) ) ) ) ) ) )]) (%make-regexp (re-checked-compile pattern (options->integer) #f 'regexp)) ) );; Compile with full options and tables available(define (regexp* pattern . args) (let-optionals args ([options '()] [tables #f]) (##sys#check-string pattern 'regexp*) (##sys#check-list options 'regexp*) (when tables (##sys#check-chardef-table tables 'regexp*)) (%make-regexp (re-checked-compile pattern (pcre-option->number options) tables 'regexp*)) ) );;; Optimize compiled regular expression:;; Invoke optimizer(define re-study (foreign-lambda* pcre_extra (((const nonnull-pcre) code)) "return(pcre_study(code, 0, &C_regex_error));"));; Optimize compiled regular expression;; Returns whether optimization performed(define (regexp-optimize rx) (##sys#check-structure rx 'regexp 'regexp-optimize) (let ([extra (re-study (%regexp-code rx))]) (cond [C_regex_error (re-error 'regexp-optimize "cannot optimize regular expression" rx)] [extra (%regexp-extra-set! rx extra) #t] [else #f] ) ) );;; Captured results vector:;; Match positions vector (PCRE ovector)#>#define OVECTOR_LENGTH_MULTIPLE 3#define STATIC_OVECTOR_LEN 256static int C_regex_ovector[OVECTOR_LENGTH_MULTIPLE * STATIC_OVECTOR_LEN];<#;;(define ovector-start-ref
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -