📄 jsregexp.c
字号:
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set sw=4 ts=8 et tw=78: * * ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Mozilla Communicator client code, released * March 31, 1998. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 1998 * the Initial Developer. All Rights Reserved. * * Contributor(s): * * Alternatively, the contents of this file may be used under the terms of * either of the GNU General Public License Version 2 or later (the "GPL"), * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** *//* * JS regular expressions, after Perl. */#include "jsstddef.h"#include <stdlib.h>#include <string.h>#include "jstypes.h"#include "jsarena.h" /* Added by JSIFY */#include "jsutil.h" /* Added by JSIFY */#include "jsapi.h"#include "jsarray.h"#include "jsatom.h"#include "jscntxt.h"#include "jsconfig.h"#include "jsfun.h"#include "jsgc.h"#include "jsinterp.h"#include "jslock.h"#include "jsnum.h"#include "jsobj.h"#include "jsopcode.h"#include "jsregexp.h"#include "jsscan.h"#include "jsstr.h"/* Note : contiguity of 'simple opcodes' is important for SimpleMatch() */typedef enum REOp { REOP_EMPTY = 0, /* match rest of input against rest of r.e. */ REOP_ALT = 1, /* alternative subexpressions in kid and next */ REOP_SIMPLE_START = 2, /* start of 'simple opcodes' */ REOP_BOL = 2, /* beginning of input (or line if multiline) */ REOP_EOL = 3, /* end of input (or line if multiline) */ REOP_WBDRY = 4, /* match "" at word boundary */ REOP_WNONBDRY = 5, /* match "" at word non-boundary */ REOP_DOT = 6, /* stands for any character */ REOP_DIGIT = 7, /* match a digit char: [0-9] */ REOP_NONDIGIT = 8, /* match a non-digit char: [^0-9] */ REOP_ALNUM = 9, /* match an alphanumeric char: [0-9a-z_A-Z] */ REOP_NONALNUM = 10, /* match a non-alphanumeric char: [^0-9a-z_A-Z] */ REOP_SPACE = 11, /* match a whitespace char */ REOP_NONSPACE = 12, /* match a non-whitespace char */ REOP_BACKREF = 13, /* back-reference (e.g., \1) to a parenthetical */ REOP_FLAT = 14, /* match a flat string */ REOP_FLAT1 = 15, /* match a single char */ REOP_FLATi = 16, /* case-independent REOP_FLAT */ REOP_FLAT1i = 17, /* case-independent REOP_FLAT1 */ REOP_UCFLAT1 = 18, /* single Unicode char */ REOP_UCFLAT1i = 19, /* case-independent REOP_UCFLAT1 */ REOP_UCFLAT = 20, /* flat Unicode string; len immediate counts chars */ REOP_UCFLATi = 21, /* case-independent REOP_UCFLAT */ REOP_CLASS = 22, /* character class with index */ REOP_NCLASS = 23, /* negated character class with index */ REOP_SIMPLE_END = 23, /* end of 'simple opcodes' */ REOP_QUANT = 25, /* quantified atom: atom{1,2} */ REOP_STAR = 26, /* zero or more occurrences of kid */ REOP_PLUS = 27, /* one or more occurrences of kid */ REOP_OPT = 28, /* optional subexpression in kid */ REOP_LPAREN = 29, /* left paren bytecode: kid is u.num'th sub-regexp */ REOP_RPAREN = 30, /* right paren bytecode */ REOP_JUMP = 31, /* for deoptimized closure loops */ REOP_DOTSTAR = 32, /* optimize .* to use a single opcode */ REOP_ANCHOR = 33, /* like .* but skips left context to unanchored r.e. */ REOP_EOLONLY = 34, /* $ not preceded by any pattern */ REOP_BACKREFi = 37, /* case-independent REOP_BACKREF */ REOP_LPARENNON = 41, /* non-capturing version of REOP_LPAREN */ REOP_ASSERT = 43, /* zero width positive lookahead assertion */ REOP_ASSERT_NOT = 44, /* zero width negative lookahead assertion */ REOP_ASSERTTEST = 45, /* sentinel at end of assertion child */ REOP_ASSERTNOTTEST = 46, /* sentinel at end of !assertion child */ REOP_MINIMALSTAR = 47, /* non-greedy version of * */ REOP_MINIMALPLUS = 48, /* non-greedy version of + */ REOP_MINIMALOPT = 49, /* non-greedy version of ? */ REOP_MINIMALQUANT = 50, /* non-greedy version of {} */ REOP_ENDCHILD = 51, /* sentinel at end of quantifier child */ REOP_REPEAT = 52, /* directs execution of greedy quantifier */ REOP_MINIMALREPEAT = 53, /* directs execution of non-greedy quantifier */ REOP_ALTPREREQ = 54, /* prerequisite for ALT, either of two chars */ REOP_ALTPREREQ2 = 55, /* prerequisite for ALT, a char or a class */ REOP_ENDALT = 56, /* end of final alternate */ REOP_CONCAT = 57, /* concatenation of terms (parse time only) */ REOP_END} REOp;#define REOP_IS_SIMPLE(op) ((unsigned)((op) - REOP_SIMPLE_START) < \ (unsigned)REOP_SIMPLE_END)struct RENode { REOp op; /* r.e. op bytecode */ RENode *next; /* next in concatenation order */ void *kid; /* first operand */ union { void *kid2; /* second operand */ jsint num; /* could be a number */ size_t parenIndex; /* or a parenthesis index */ struct { /* or a quantifier range */ uintN min; uintN max; JSPackedBool greedy; } range; struct { /* or a character class */ size_t startIndex; size_t kidlen; /* length of string at kid, in jschars */ size_t index; /* index into class list */ uint16 bmsize; /* bitmap size, based on max char code */ JSPackedBool sense; } ucclass; struct { /* or a literal sequence */ jschar chr; /* of one character */ size_t length; /* or many (via the kid) */ } flat; struct { RENode *kid2; /* second operand from ALT */ jschar ch1; /* match char for ALTPREREQ */ jschar ch2; /* ditto, or class index for ALTPREREQ2 */ } altprereq; } u;};#define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \ ((c >= 'a') && (c <= 'z')) )#define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \ (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))#define CLASS_CACHE_SIZE 4typedef struct CompilerState { JSContext *context; JSTokenStream *tokenStream; /* For reporting errors */ const jschar *cpbegin; const jschar *cpend; const jschar *cp; size_t parenCount; size_t classCount; /* number of [] encountered */ size_t treeDepth; /* maximum depth of parse tree */ size_t progLength; /* estimated bytecode length */ RENode *result; size_t classBitmapsMem; /* memory to hold all class bitmaps */ struct { const jschar *start; /* small cache of class strings */ size_t length; /* since they're often the same */ size_t index; } classCache[CLASS_CACHE_SIZE]; uint16 flags;} CompilerState;typedef struct EmitStateStackEntry { jsbytecode *altHead; /* start of REOP_ALT* opcode */ jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */ jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */ jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */ RENode *continueNode; /* original REOP_ALT* node being stacked */ jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */ JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to avoid 16-bit unsigned offset overflow */} EmitStateStackEntry;/* * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h, * the getters and setters take the pc of the offset, not of the opcode before * the offset. */#define ARG_LEN 2#define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))#define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \ (pc)[1] = (jsbytecode) (arg))#define OFFSET_LEN ARG_LEN#define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)#define GET_OFFSET(pc) GET_ARG(pc)/* * Maximum supported tree depth is maximum size of EmitStateStackEntry stack. * For sanity, we limit it to 2^24 bytes. */#define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))/* * The maximum memory that can be allocated for class bitmaps. * For sanity, we limit it to 2^24 bytes. */#define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)/* * Functions to get size and write/read bytecode that represent small indexes * compactly. * Each byte in the code represent 7-bit chunk of the index. 8th bit when set * indicates that the following byte brings more bits to the index. Otherwise * this is the last byte in the index bytecode representing highest index bits. */static size_tGetCompactIndexWidth(size_t index){ size_t width; for (width = 1; (index >>= 7) != 0; ++width) { } return width;}static jsbytecode *WriteCompactIndex(jsbytecode *pc, size_t index){ size_t next; while ((next = index >> 7) != 0) { *pc++ = (jsbytecode)(index | 0x80); index = next; } *pc++ = (jsbytecode)index; return pc;}static jsbytecode *ReadCompactIndex(jsbytecode *pc, size_t *result){ size_t nextByte; nextByte = *pc++; if ((nextByte & 0x80) == 0) { /* * Short-circuit the most common case when compact index <= 127. */ *result = nextByte; } else { size_t shift = 7; *result = 0x7F & nextByte; do { nextByte = *pc++; *result |= (nextByte & 0x7F) << shift; shift += 7; } while ((nextByte & 0x80) != 0); } return pc;}typedef struct RECapture { ptrdiff_t index; /* start of contents, -1 for empty */ size_t length; /* length of capture */} RECapture;typedef struct REMatchState { const jschar *cp; RECapture parens[1]; /* first of 're->parenCount' captures, allocated at end of this struct */} REMatchState;struct REBackTrackData;typedef struct REProgState { jsbytecode *continue_pc; /* current continuation data */ jsbytecode continue_op; ptrdiff_t index; /* progress in text */ size_t parenSoFar; /* highest indexed paren started */ union { struct { uintN min; /* current quantifier limits */ uintN max; } quantifier; struct { size_t top; /* backtrack stack state */ size_t sz; } assertion; } u;} REProgState;typedef struct REBackTrackData { size_t sz; /* size of previous stack entry */ jsbytecode *backtrack_pc; /* where to backtrack to */ jsbytecode backtrack_op; const jschar *cp; /* index in text of match at backtrack */ size_t parenIndex; /* start index of saved paren contents */ size_t parenCount; /* # of saved paren contents */ size_t saveStateStackTop; /* number of parent states */ /* saved parent states follow */ /* saved paren contents follow */} REBackTrackData;#define INITIAL_STATESTACK 100#define INITIAL_BACKTRACK 8000typedef struct REGlobalData { JSContext *cx; JSRegExp *regexp; /* the RE in execution */ JSBool ok; /* runtime error (out_of_memory only?) */ size_t start; /* offset to start at */ ptrdiff_t skipped; /* chars skipped anchoring this r.e. */ const jschar *cpbegin; /* text base address */ const jschar *cpend; /* text limit address */ REProgState *stateStack; /* stack of state of current parents */ size_t stateStackTop; size_t stateStackLimit; REBackTrackData *backTrackStack;/* stack of matched-so-far positions */ REBackTrackData *backTrackSP; size_t backTrackStackSize; size_t cursz; /* size of current stack entry */ JSArenaPool pool; /* It's faster to use one malloc'd pool than to malloc/free the three items that are allocated from this pool */} REGlobalData;/* * 1. If IgnoreCase is false, return ch. * 2. Let u be ch converted to upper case as if by calling * String.prototype.toUpperCase on the one-character string ch. * 3. If u does not consist of a single character, return ch. * 4. Let cu be u's character. * 5. If ch's code point value is greater than or equal to decimal 128 and cu's * code point value is less than decimal 128, then return ch. * 6. Return cu. */static jscharupcase(jschar ch){ jschar cu = JS_TOUPPER(ch); if (ch >= 128 && cu < 128) return ch;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -