jsregexp.c
来自「一个基于alice开发的机器人」· C语言 代码 · 共 1,779 行 · 第 1/5 页
C
1,779 行
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code, released
* March 31, 1998.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* JS regular expressions, after Perl.
*/
#include "jsstddef.h"
#include <stdlib.h>
#include <string.h>
#include "jstypes.h"
#include "jsarena.h" /* Added by JSIFY */
#include "jsutil.h" /* Added by JSIFY */
#include "jsapi.h"
#include "jsarray.h"
#include "jsatom.h"
#include "jscntxt.h"
#include "jsconfig.h"
#include "jsfun.h"
#include "jsgc.h"
#include "jsinterp.h"
#include "jslock.h"
#include "jsnum.h"
#include "jsobj.h"
#include "jsopcode.h"
#include "jsregexp.h"
#include "jsscan.h"
#include "jsstr.h"
#ifdef XP_MAC
#include <MacMemory.h>
#endif
#if JS_HAS_REGEXPS
/* Note : contiguity of 'simple opcodes' is important for simpleMatch() */
typedef enum REOp {
REOP_EMPTY = 0, /* match rest of input against rest of r.e. */
REOP_ALT = 1, /* alternative subexpressions in kid and next */
REOP_SIMPLE_START = 2, /* start of 'simple opcodes' */
REOP_BOL = 2, /* beginning of input (or line if multiline) */
REOP_EOL = 3, /* end of input (or line if multiline) */
REOP_WBDRY = 4, /* match "" at word boundary */
REOP_WNONBDRY = 5, /* match "" at word non-boundary */
REOP_DOT = 6, /* stands for any character */
REOP_DIGIT = 7, /* match a digit char: [0-9] */
REOP_NONDIGIT = 8, /* match a non-digit char: [^0-9] */
REOP_ALNUM = 9, /* match an alphanumeric char: [0-9a-z_A-Z] */
REOP_NONALNUM = 10, /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
REOP_SPACE = 11, /* match a whitespace char */
REOP_NONSPACE = 12, /* match a non-whitespace char */
REOP_BACKREF = 13, /* back-reference (e.g., \1) to a parenthetical */
REOP_FLAT = 14, /* match a flat string */
REOP_FLAT1 = 15, /* match a single char */
REOP_FLATi = 16, /* case-independent REOP_FLAT */
REOP_FLAT1i = 17, /* case-independent REOP_FLAT1 */
REOP_UCFLAT1 = 18, /* single Unicode char */
REOP_UCFLAT1i = 19, /* case-independent REOP_UCFLAT1 */
REOP_UCFLAT = 20, /* flat Unicode string; len immediate counts chars */
REOP_UCFLATi = 21, /* case-independent REOP_UCFLAT */
REOP_CLASS = 22, /* character class with index */
REOP_NCLASS = 23, /* negated character class with index */
REOP_SIMPLE_END = 23, /* end of 'simple opcodes' */
REOP_QUANT = 25, /* quantified atom: atom{1,2} */
REOP_STAR = 26, /* zero or more occurrences of kid */
REOP_PLUS = 27, /* one or more occurrences of kid */
REOP_OPT = 28, /* optional subexpression in kid */
REOP_LPAREN = 29, /* left paren bytecode: kid is u.num'th sub-regexp */
REOP_RPAREN = 30, /* right paren bytecode */
REOP_JUMP = 31, /* for deoptimized closure loops */
REOP_DOTSTAR = 32, /* optimize .* to use a single opcode */
REOP_ANCHOR = 33, /* like .* but skips left context to unanchored r.e. */
REOP_EOLONLY = 34, /* $ not preceded by any pattern */
REOP_BACKREFi = 37, /* case-independent REOP_BACKREF */
REOP_LPARENNON = 41, /* non-capturing version of REOP_LPAREN */
REOP_ASSERT = 43, /* zero width positive lookahead assertion */
REOP_ASSERT_NOT = 44, /* zero width negative lookahead assertion */
REOP_ASSERTTEST = 45, /* sentinel at end of assertion child */
REOP_ASSERTNOTTEST = 46, /* sentinel at end of !assertion child */
REOP_MINIMALSTAR = 47, /* non-greedy version of * */
REOP_MINIMALPLUS = 48, /* non-greedy version of + */
REOP_MINIMALOPT = 49, /* non-greedy version of ? */
REOP_MINIMALQUANT = 50, /* non-greedy version of {} */
REOP_ENDCHILD = 51, /* sentinel at end of quantifier child */
REOP_REPEAT = 52, /* directs execution of greedy quantifier */
REOP_MINIMALREPEAT = 53, /* directs execution of non-greedy quantifier */
REOP_ALTPREREQ = 54, /* prerequisite for ALT, either of two chars */
REOP_ALTPREREQ2 = 55, /* prerequisite for ALT, a char or a class */
REOP_ENDALT = 56, /* end of final alternate */
REOP_CONCAT = 57, /* concatenation of terms (parse time only) */
REOP_END
} REOp;
#define REOP_IS_SIMPLE(op) (((op) >= REOP_SIMPLE_START) && ((op) <= REOP_SIMPLE_END))
struct RENode {
REOp op; /* r.e. op bytecode */
RENode *next; /* next in concatenation order */
void *kid; /* first operand */
union {
void *kid2; /* second operand */
jsint num; /* could be a number */
jsint parenIndex; /* or a parenthesis index */
struct { /* or a quantifier range */
uint16 min;
uint16 max;
JSBool greedy;
} range;
struct { /* or a character class */
uint16 startIndex;
uint16 kidlen; /* length of string at kid, in jschars */
uint16 bmsize; /* bitmap size, based on max char code */
uint16 index; /* index into class list */
JSBool sense;
} ucclass;
struct { /* or a literal sequence */
jschar chr; /* of one character */
uint16 length; /* or many (via the kid) */
} flat;
struct {
RENode *kid2; /* second operand from ALT */
jschar ch1; /* match char for ALTPREREQ */
jschar ch2; /* ditto, or class index for ALTPREREQ2 */
} altprereq;
} u;
};
#define RE_IS_LETTER(c) ( ((c >= 'A') && (c <= 'Z')) || \
((c >= 'a') && (c <= 'z')) )
#define RE_IS_LINE_TERM(c) ( (c == '\n') || (c == '\r') || \
(c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
#define CLASS_CACHE_SIZE (4)
typedef struct CompilerState {
JSContext *context;
JSTokenStream *tokenStream; /* For reporting errors */
const jschar *cpbegin;
const jschar *cpend;
const jschar *cp;
uintN flags;
uint16 parenCount;
uint16 classCount; /* number of [] encountered */
size_t progLength; /* estimated bytecode length */
uintN treeDepth; /* maximum depth of parse tree */
RENode *result;
struct {
const jschar *start; /* small cache of class strings */
uint16 length; /* since they're often the same */
uint16 index;
} classCache[CLASS_CACHE_SIZE];
} CompilerState;
typedef struct RECapture {
int32 index; /* start of contents, -1 for empty */
uint16 length; /* length of capture */
} RECapture;
typedef struct REMatchState {
const jschar *cp;
RECapture parens[1]; /* first of 're->parenCount' captures,
* allocated at end of this struct.
*/
} REMatchState;
struct REBackTrackData;
typedef struct REProgState {
jsbytecode *continue_pc; /* current continuation data */
jsbytecode continue_op;
uint16 index; /* progress in text */
uintN parenSoFar; /* highest indexed paren started */
union {
struct {
uint16 min; /* current quantifier limits */
uint16 max;
} quantifier;
struct {
size_t top; /* backtrack stack state */
size_t sz;
} assertion;
} u;
} REProgState;
typedef struct REBackTrackData {
size_t sz; /* size of previous stack entry */
jsbytecode *backtrack_pc; /* where to backtrack to */
jsbytecode backtrack_op;
const jschar *cp; /* index in text of match at backtrack */
intN parenIndex; /* start index of saved paren contents */
uint16 parenCount; /* # of saved paren contents */
uint16 precedingStateTop; /* number of parent states */
/* saved parent states follow */
/* saved paren contents follow */
} REBackTrackData;
#define INITIAL_STATESTACK (100)
#define INITIAL_BACKTRACK (8000)
typedef struct REGlobalData {
JSContext *cx;
JSRegExp *regexp; /* the RE in execution */
JSBool ok; /* runtime error (out_of_memory only?) */
size_t start; /* offset to start at */
ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
const jschar *cpbegin, *cpend; /* text base address and limit */
REProgState *stateStack; /* stack of state of current parents */
uint16 stateStackTop;
uint16 maxStateStack;
REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
REBackTrackData *backTrackSP;
size_t maxBackTrack;
size_t cursz; /* size of current stack entry */
JSArenaPool pool; /* I don't understand but it's faster to
* use this than to malloc/free the three
* items that are allocated from this pool
*/
} REGlobalData;
/*
* 1. If IgnoreCase is false, return ch.
* 2. Let u be ch converted to upper case as if by calling
* String.prototype.toUpperCase on the one-character string ch.
* 3. If u does not consist of a single character, return ch.
* 4. Let cu be u's character.
* 5. If ch's code point value is greater than or equal to decimal 128 and cu's
* code point value is less than decimal 128, then return ch.
* 6. Return cu.
*/
static jschar
upcase(jschar ch)
{
jschar cu = JS_TOUPPER(ch);
if ((ch >= 128) && (cu < 128)) return ch;
return cu;
}
static jschar
downcase(jschar ch)
{
jschar cl = JS_TOLOWER(ch);
if ((cl >= 128) && (ch < 128)) return ch;
return cl;
}
/* Construct and initialize an RENode, returning NULL for out-of-memory */
static RENode *
NewRENode(CompilerState *state, REOp op)
{
JSContext *cx;
RENode *ren;
cx = state->context;
JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
if (!ren) {
JS_ReportOutOfMemory(cx);
return NULL;
}
ren->op = op;
ren->next = NULL;
ren->kid = NULL;
return ren;
}
/*
* Validates and converts hex ascii value.
*/
static JSBool
isASCIIHexDigit(jschar c, uintN *digit)
{
uintN cv = c;
if (cv < '0')
return JS_FALSE;
if (cv <= '9') {
*digit = cv - '0';
return JS_TRUE;
}
cv |= 0x20;
if (cv >= 'a' && cv <= 'f') {
*digit = cv - 'a' + 10;
return JS_TRUE;
}
return JS_FALSE;
}
typedef struct {
REOp op;
const jschar *errPos;
uint16 parenIndex;
} REOpData;
/*
* Process the op against the two top operands, reducing them to a single
* operand in the penultimate slot. Update progLength and treeDepth.
*/
static JSBool
processOp(CompilerState *state, REOpData *opData, RENode **operandStack, intN operandSP)
{
RENode *result;
switch (opData->op) {
case REOP_ALT:
result = NewRENode(state, REOP_ALT);
if (!result)
return JS_FALSE;
result->kid = operandStack[operandSP - 2];
result->u.kid2 = operandStack[operandSP - 1];
operandStack[operandSP - 2] = result;
/*
* look at both alternates to see if there's a FLAT or a CLASS at
* the start of each. If so, use a prerequisite match
*/
++state->treeDepth;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?