📄 pcre.mx
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f pcre@a N. Nes@+ PCRE library interfaceThe PCRE library is a set of functions that implement regular expres-sion pattern matching using the same syntax and semantics as Perl,with just a few differences. The current implementation of PCRE(release 4.x) corresponds approximately with Perl 5.8, including sup-port for UTF-8 encoded strings. However, this support has to beexplicitly enabled; it is not the default.@malatom pcre:ptr;command tostr() address pcre_tostr;command fromstr() address pcre_fromstr;command nequal() address pcre_nequal;command hash() address pcre_hash;command null() address pcre_null;command put() address pcre_put;command del() address pcre_del;command length() address pcre_length;command heap() address pcre_heap;command compile(pat:str ) :pcre address PCREcompile_wrapcomment "compile a pattern";command match(pat:pcre, s:str) :bit address PCREexec_wrapcomment "match a pattern";command select(pat:str, strs:bat[:any_1,:str]) :bat[:any_1,:str] address PCREselectcomment "Select tuples based on the pattern";command uselect(pat:str, strs:bat[:any_1,:str]) :bat[:any_1,:void] address PCREuselectcomment "Select tuples based on the pattern, only returning the head";command match(s:str, pat:str):bitaddress PCREmatchcomment "POSIX pattern matching against a string";command patindex(s:str, pat:str):intaddress PCREpatindexcomment "Location of the first POSIX pattern matching against a string"command replace(origin:str,pat:str,repl:str,flags:str):straddress PCREreplace_wrapcomment "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\n\ Parameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n 'e': if present, an empty string is considered to be a valid match 'i': if present, the match operates in case-insensitive mode. Otherwise, in\n case-sensitive mode.\n\ 'm': if present, the match operates in multi-line mode.\n 's': if present, the match operates in \"dot-all\" The specifications of the flags can be found in \"man pcreapi\"\n The flag letters may be repeated.\n No other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\n Returns the replaced string, or if no matches found, the original string.";command replace(orig:bat[:any_1,:str],pat:str,repl:str,flag:str):bat[:any_1,:str]address PCREreplace_bat_wrap;command pcre_quote(s:str):str address PCREquotecomment "Return a PCRE pattern string that matches the argument exactly."command sql2pcre(pat:str,esc:str):straddress PCREsql2pcrecomment "Convert a SQL like pattern with the given escape character into a PCRE pattern.";command like(s:str, pat:str, esc:str):bit address PCRElike3;command like(s:str, pat:str):bit address PCRElike2;command like(s:bat[:any_1,:str], pat:str, esc:str):bat[:any_1,:void] address PCRElike_uselect_pcre;pattern prelude() :void address pcre_initcomment "Initialize pcre";pcre.prelude();@{@-\begin{center}ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre\end{center}@+ Implementation@include prelude.mx@c#include "mal_config.h"#include "mal.h"#include "mal_exception.h"#ifdef WIN32#ifndef HAVE_LIBPCRE#define pcre_export extern __declspec(dllimport)#else#define pcre_export extern __declspec(dllexport)#endif#else#define pcre_export extern#endif#ifndef HAVE_LIBPCRE#define pcre str#else#include <pcre.h>#endifpcre_export str PCREquote(str *r, str *v);pcre_export str PCREselect(int *res, str *pattern, int *bid);pcre_export str PCREuselect(int *res, str *pattern, int *bid);pcre_export str PCREmatch(bit *ret, str *val, str *pat);pcre_export str PCREpatindex(int *ret, str *val, str *pat);pcre_export str PCREfromstr(str instr, int *l, pcre ** val);pcre_export str PCREreplace_wrap(str *res, str *or, str *pat, str *repl, str *flags);pcre_export str PCREreplace_bat_wrap(int *res, int *or, str *pat, str *repl, str *flags);pcre_export str PCREcompile_wrap(pcre ** res, str *pattern);pcre_export str PCREexec_wrap(bit *res, pcre * pattern, str *s);pcre_export int pcre_tostr(str *tostr, int *l, pcre * p);pcre_export int pcre_fromstr(str instr, int *l, pcre ** val);pcre_export int pcre_nequal(pcre * l, pcre * r);pcre_export hash_t pcre_hash(pcre * b);pcre_export pcre * pcre_null(void);pcre_export void pcre_del(Heap *h, var_t *index);pcre_export int pcre_length(pcre * p);pcre_export void pcre_heap(Heap *heap, size_t capacity);pcre_export var_t pcre_put(Heap *h, var_t *bun, pcre * val);pcre_export str PCRElike3(bit *ret, str *s, str *pat, str *esc);pcre_export str PCRElike2(bit *ret, str *s, str *pat);pcre_export str PCRElike_uselect_pcre(int *ret, int *b, str *pat, str *esc);pcre_export str pcre_init(void);#ifndef HAVE_LIBPCREstrpcre_compile_wrap(pcre ** res, str pattern){ (void) res; (void) pattern; throw(MAL, "pcre_compile", "not available as required version of libpcre was not found by configure.\n");}strpcre_exec_wrap(bit *res, pcre * pattern, str s){ (void) res; (void) pattern; (void) s; throw(MAL, "pcre_exec", "not available as required version of libpcre was not found by configure.\n");}strpcre_select(BAT **res, str pattern, BAT *strs){ (void) res, (void) pattern; (void) strs; throw(MAL, "pcre_select", "not available as required version of libpcre was not found by configure.\n");}strpcre_uselect(BAT **res, str pattern, BAT *strs){ (void) res, (void) pattern; (void) strs; throw(MAL, "pcre_uselect", "not available as required version of libpcre was not found by configure.\n");}strpcre_replace(str *res, str origin_str, str pattern, str replacement, str flags){ (void) res; (void) origin_str; (void) pattern; (void) replacement; (void) flags; throw(MAL, "pcre_replace","not available as required version of libpcre was not found by configure.\n");}strpcre_replace_bat(BAT **res, BAT *origin_strs, str pattern, str replacement, str flags){ (void) res; (void) origin_strs; (void) pattern; (void) replacement; (void) flags; throw(MAL, "pcre_replace_bat","not available as required version of libpcre was not found by configure.");}str pcre_init(void){ return NULL;}voidpcre_exit(void){}strpcre_match(bit *ret, str val, str pat){ (void) ret; (void) val; (void) pat; throw(MAL, "pcre_match", "not available as required version of libpcre was not found by configure.\n");}strpcre_patindex(int *ret, str val, str pat){ (void) ret; (void) val; (void) pat; throw(MAL, "pcre_patindexmatch", "not available as required version of libpcre was not found by configure.\n");}#else#include <pcre.h>#define m2p(p) (pcre*)(((sht*)p)+1)#define p2m(p) (pcre*)(((sht*)p)-1)void *my_pcre_malloc(size_t s){ char *r = GDKmalloc(s + sizeof(sht)); sht *sz = (sht *) r; *sz = s + sizeof(sht); return (void *) (sz + 1);}voidmy_pcre_free(void *blk){ sht *sz = (sht *) blk; sz -= 1; GDKfree((void *) sz);}strpcre_compile_wrap(pcre ** res, str pattern){ pcre *r; const char err[BUFSIZ], *err_p = err; int errpos = 0; if ((r = pcre_compile(pattern, PCRE_UTF8 | PCRE_MULTILINE, &err_p, &errpos, NULL)) == NULL) { throw(MAL,"pcre.compile", "failed with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern); } *(pcre **) res = p2m(r); return MAL_SUCCEED;}strpcre_exec_wrap(bit *res, pcre * pattern, str s){ if (pcre_exec(m2p(pattern), NULL, s, strlen(s), 0, 0, NULL, 0) >= 0) { *res = TRUE; return MAL_SUCCEED; } *res = FALSE; throw(MAL, "pcre.exec","failed to execute pattern match");}strpcre_select(BAT **res, str pattern, BAT *strs){ const char err[BUFSIZ], *err_p = err; int errpos = 0; BAT *r; BUN p, q; pcre *re = NULL; if (strs->htype == TYPE_void) r = BATnew(TYPE_oid, TYPE_str, BATcount(strs)); else r = BATnew(strs->htype, TYPE_str, BATcount(strs)); if ((re = pcre_compile(pattern, PCRE_UTF8 | PCRE_MULTILINE, &err_p, &errpos, NULL)) == NULL) { throw(MAL, "pcre_select", "pcre compile of pattern (%s) failed at %d with\n'%s'.", pattern, errpos, err_p); } BATloop(strs, p, q) { str s = BUNtail(strs, p); if (pcre_exec(re, NULL, s, strlen(s), 0, 0, NULL, 0) >= 0) { BUNins(r, BUNhead(strs, p), s, FALSE); } } if (!(r->batDirty&2)) r = BATsetaccess(r, BAT_READ); my_pcre_free(re); *res = r; return MAL_SUCCEED;}strpcre_uselect(BAT **res, str pattern, BAT *strs){ const char err[BUFSIZ], *err_p = err; int errpos = 0; BAT *r; BUN p, q; pcre *re = NULL; if (strs->htype == TYPE_void) r = BATnew(TYPE_oid, TYPE_void, BATcount(strs)); else r = BATnew(strs->htype, TYPE_void, BATcount(strs)); if ((re = pcre_compile(pattern, PCRE_UTF8 | PCRE_MULTILINE, &err_p, &errpos, NULL)) == NULL) { throw(MAL, "pcre_uselect", "pcre compile of pattern (%s) failed at %d with\n'%s'.", pattern, errpos, err_p); } BATloop(strs, p, q) { str s = BUNtail(strs, p); if (pcre_exec(re, NULL, s, strlen(s), 0, 0, NULL, 0) >= 0) { BUNins(r, BUNhead(strs, p), NULL, FALSE); } } my_pcre_free(re); if (!(r->batDirty&2)) r = BATsetaccess(r, BAT_READ); *res = r; return MAL_SUCCEED;}#define MAX_NR_CAPTURES 1024 /* Maximal number of captured substrings in one original string */strpcre_replace(str *res, str origin_str, str pattern, str replacement, str flags){ const char err[BUFSIZ], *err_p = err, *err_p2 = err; pcre *pcre_code = NULL; pcre_extra *extra; str tmpres; int i, j, k, len, errpos = 0, offset = 0; int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; int *ovector, ovecsize; int len_origin_str = strlen(origin_str); int len_replacement = strlen(replacement); int capture_offsets[MAX_NR_CAPTURES * 2], ncaptures = 0, len_del = 0; for (i = 0; i < (int)strlen(flags); i++) { if (flags[i] == 'e') { exec_options -= PCRE_NOTEMPTY; stream_printf(GDKout, "exec_options %d, PCRE_NOTEMPTY %d\n", exec_options, PCRE_NOTEMPTY); } else if (flags[i] == 'i') { compile_options |= PCRE_CASELESS; } else if (flags[i] == 'm') { compile_options |= PCRE_MULTILINE; } else if (flags[i] == 's') { compile_options |= PCRE_DOTALL; } else if (flags[i] == 'x') { compile_options |= PCRE_EXTENDED; } else { throw(MAL,"pcre_replace","unsupported flag character '%c'\n", flags[i]); } } if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { throw(MAL,"pcre_replace","pcre compile of pattern (%s) failed at %d with\n'%s'.\n", pattern, errpos, err_p); } /* Since the compiled pattern is going to be used several times, it is * worth spending more time analyzing it in order to speed up the time * taken for matching. */ extra = pcre_study(pcre_code, 0, &err_p2); pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); ovecsize = (i + 1) * 3; if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) { my_pcre_free(pcre_code); throw(MAL, "pcre_replace","not enough memory\n"); } i = 0; do { j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset, exec_options, ovector, ovecsize); if (j > 0){ capture_offsets[i] = ovector[0]; capture_offsets[i+1] = ovector[1]; ncaptures++; i += 2; len_del += (ovector[1] - ovector[0]); offset = ovector[1]; } } while((j > 0) && (offset < len_origin_str) && (ncaptures < MAX_NR_CAPTURES)); if (ncaptures > 0){ tmpres = GDKmalloc(len_origin_str - len_del + (len_replacement * ncaptures) + 1); if (!tmpres) { my_pcre_free(pcre_code); GDKfree(ovector); throw(MAL, "pcre_replace","not enough memory\n"); } j = k = 0; /* possibly copy the substring before the first captured substring */ strncpy(tmpres, origin_str, capture_offsets[j]); k = capture_offsets[j]; j++; for (i = 0; i < ncaptures - 1; i++) { strncpy(tmpres+k, replacement, len_replacement); k += len_replacement; /* copy the substring between two captured substrings */ len = capture_offsets[j+1] - capture_offsets[j]; strncpy(tmpres+k, origin_str+capture_offsets[j], len); k += len; j += 2; } /* replace the last captured substring */ strncpy(tmpres+k, replacement, len_replacement); k += len_replacement; /* possibly copy the substring after the last captured substring */ len = len_origin_str - capture_offsets[j]; strncpy(tmpres+k, origin_str+capture_offsets[j], len); k += len; tmpres[k] = '\0'; } else { /* no captured substrings, return the original string*/ tmpres = GDKstrdup(origin_str); } my_pcre_free(pcre_code); GDKfree(ovector); *res = tmpres; return MAL_SUCCEED;}strpcre_replace_bat(BAT **res, BAT *origin_strs, str pattern, str replacement, str flags){ const char err[BUFSIZ], *err_p = err, *err_p2 = err; int i, j, k, len, errpos = 0, offset = 0; int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; pcre *pcre_code = NULL; pcre_extra *extra; BAT *tmpbat; BUN p, q; int *ovector, ovecsize; int len_origin_str, len_replacement = strlen(replacement); int capture_offsets[MAX_NR_CAPTURES * 2], ncaptures = 0, len_del = 0; str origin_str, replaced_str; for (i = 0; i < (int)strlen(flags); i++) { if (flags[i] == 'e') { exec_options |= (~PCRE_NOTEMPTY); } else if (flags[i] == 'i') { compile_options |= PCRE_CASELESS; } else if (flags[i] == 'm') { compile_options |= PCRE_MULTILINE; } else if (flags[i] == 's') { compile_options |= PCRE_DOTALL; } else if (flags[i] == 'x') { compile_options |= PCRE_EXTENDED; } else { throw(MAL,"pcre_replace_bat", "\"flags\" contains invalid character '%c'\n", flags[i]); } } if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { throw(MAL,"pcre_replace_bat", "pcre compile of pattern (%s) failed at %d with\n'%s'.\n", pattern, errpos, err_p); } /* Since the compiled pattern is ging to be used several times, it is worth spending
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -