research.cxx

来自「porting scintilla to qt」· CXX 代码 · 共 981 行 · 第 1/2 页
CXX
981 行
// Scintilla source code edit control
/** @file RESearch.cxx
 ** Regular expression search library.
 **/

/*
 * regex - Regular expression pattern matching and replacement
 *
 * By:  Ozan S. Yigit (oz)
 *      Dept. of Computer Science
 *      York University
 *
 * Original code available from http://www.cs.yorku.ca/~oz/
 * Translation to C++ by Neil Hodgson neilh@scintilla.org
 * Removed all use of register.
 * Converted to modern function prototypes.
 * Put all global/static variables into an object so this code can be
 * used from multiple threads, etc.
 * Some extensions by Philippe Lhoste PhiLho(a)GMX.net
 *
 * These routines are the PUBLIC DOMAIN equivalents of regex
 * routines as found in 4.nBSD UN*X, with minor extensions.
 *
 * These routines are derived from various implementations found
 * in software tools books, and Conroy's grep. They are NOT derived
 * from licensed/restricted software.
 * For more interesting/academic/complicated implementations,
 * see Henry Spencer's regexp routines, or GNU Emacs pattern
 * matching module.
 *
 * Modification history removed.
 *
 * Interfaces:
 *  RESearch::Compile:      compile a regular expression into a NFA.
 *
 *          const char *RESearch::Compile(const char *pat, int length,
 *                                        bool caseSensitive, bool posix)
 *
 * Returns a short error string if they fail.
 *
 *  RESearch::Execute:      execute the NFA to match a pattern.
 *
 *          int RESearch::Execute(CharacterIndexer &ci, int lp, int endp)
 *
 *  RESearch::Substitute:   substitute the matched portions in a new string.
 *
 *          int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
 *
 *  re_fail:                failure routine for RESearch::Execute. (no longer used)
 *
 *          void re_fail(char *msg, char op)
 *
 * Regular Expressions:
 *
 *      [1]     char    matches itself, unless it is a special
 *                      character (metachar): . \ [ ] * + ^ $
 *                      and ( ) if posix option.
 *
 *      [2]     .       matches any character.
 *
 *      [3]     \       matches the character following it, except:
 *                      - \a, \b, \f, \n, \r, \t, \v match the corresponding C
 *                      escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;
 *                      Note that \r and \n are never matched because Scintilla
 *                      regex searches are made line per line
 *                      (stripped of end-of-line chars).
 *                      - if not in posix mode, when followed by a
 *                      left or right round bracket (see [7]);
 *                      - when followed by a digit 1 to 9 (see [8]);
 *                      - when followed by a left or right angle bracket
 *                      (see [9]);
 *                      - when followed by d, D, s, S, w or W (see [10]);
 *                      - when followed by x and two hexa digits (see [11]).
 *                      Backslash is used as an escape character for all
 *                      other meta-characters, and itself.
 *
 *      [4]     [set]   matches one of the characters in the set.
 *                      If the first character in the set is "^",
 *                      it matches the characters NOT in the set, i.e.
 *                      complements the set. A shorthand S-E (start dash end)
 *                      is used to specify a set of characters S up to
 *                      E, inclusive. S and E must be characters, otherwise
 *                      the dash is taken literally (eg. in expression [\d-a]).
 *                      The special characters "]" and "-" have no special
 *                      meaning if they appear as the first chars in the set.
 *                      To include both, put - first: [-]A-Z]
 *                      (or just backslash them).
 *                      examples:        match:
 *
 *                              [-]|]    matches these 3 chars,
 *
 *                              []-|]    matches from ] to | chars
 *
 *                              [a-z]    any lowercase alpha
 *
 *                              [^-]]    any char except - and ]
 *
 *                              [^A-Z]   any char except uppercase
 *                                       alpha
 *
 *                              [a-zA-Z] any alpha
 *
 *      [5]     *       any regular expression form [1] to [4]
 *                      (except [7], [8] and [9] forms of [3]),
 *                      followed by closure char (*)
 *                      matches zero or more matches of that form.
 *
 *      [6]     +       same as [5], except it matches one or more.
 *                      Both [5] and [6] are greedy (they match as much as possible).
 *
 *      [7]             a regular expression in the form [1] to [12], enclosed
 *                      as \(form\) (or (form) with posix flag) matches what
 *                      form matches. The enclosure creates a set of tags,
 *                      used for [8] and for pattern substitution.
 *                      The tagged forms are numbered starting from 1.
 *
 *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 *                      previously tagged regular expression ([7]) matched.
 *
 *      [9]     \<      a regular expression starting with a \< construct
 *              \>      and/or ending with a \> construct, restricts the
 *                      pattern matching to the beginning of a word, and/or
 *                      the end of a word. A word is defined to be a character
 *                      string beginning and/or ending with the characters
 *                      A-Z a-z 0-9 and _. Scintilla extends this definition
 *                      by user setting. The word must also be preceded and/or
 *                      followed by any character outside those mentioned.
 *
 *      [10]    \l      a backslash followed by d, D, s, S, w or W,
 *                      becomes a character class (both inside and
 *                      outside sets []).
 *                        d: decimal digits
 *                        D: any char except decimal digits
 *                        s: whitespace (space, \t \n \r \f \v)
 *                        S: any char except whitespace (see above)
 *                        w: alphanumeric & underscore (changed by user setting)
 *                        W: any char except alphanumeric & underscore (see above)
 *
 *      [11]    \xHH    a backslash followed by x and two hexa digits,
 *                      becomes the character whose Ascii code is equal
 *                      to these digits. If not followed by two digits,
 *                      it is 'x' char itself.
 *
 *      [12]            a composite regular expression xy where x and y
 *                      are in the form [1] to [11] matches the longest
 *                      match of x followed by a match for y.
 *
 *      [13]    ^       a regular expression starting with a ^ character
 *              $       and/or ending with a $ character, restricts the
 *                      pattern matching to the beginning of the line,
 *                      or the end of line. [anchors] Elsewhere in the
 *                      pattern, ^ and $ are treated as ordinary characters.
 *
 *
 * Acknowledgements:
 *
 *  HCR's Hugh Redelmeier has been most helpful in various
 *  stages of development. He convinced me to include BOW
 *  and EOW constructs, originally invented by Rob Pike at
 *  the University of Toronto.
 *
 * References:
 *              Software tools                  Kernighan & Plauger
 *              Software tools in Pascal        Kernighan & Plauger
 *              Grep [rsx-11 C dist]            David Conroy
 *              ed - text editor                Un*x Programmer's Manual
 *              Advanced editing on Un*x        B. W. 
Kernighan *              RegExp routines                 Henry Spencer * * Notes: * *  This implementation uses a bit-set representation for character *  classes for speed and compactness. Each character is represented *  by one bit in a 256-bit block. Thus, CCL always takes a *	constant 32 bytes in the internal nfa, and RESearch::Execute does a single *  bit comparison to locate the character in the set. * * Examples: * *  pattern:    foo*.* *  compile:    CHR f CHR o CLO CHR o END CLO ANY END END *  matches:    fo foo fooo foobar fobar foxx ... * *  pattern:    fo[ob]a[rz] *  compile:    CHR f CHR o CCL bitset CHR a CCL bitset END *  matches:    fobar fooar fobaz fooaz * *  pattern:    foo\\+ *  compile:    CHR f CHR o CHR o CHR \ CLO CHR \ END END *  matches:    foo\ foo\\ foo\\\  ... * *  pattern:    \(foo\)[1-3]\1  (same as foo[1-3]foo) *  compile:    BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END *  matches:    foo1foo foo2foo foo3foo * *  pattern:    \(fo.*\)-\1 *  compile:    BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END *  matches:    foo-foo fo-fo fob-fob foobar-foobar ... */#include "CharClassify.h"#include "RESearch.h"// Shut up annoying Visual C++ warnings:#ifdef _MSC_VER#pragma warning(disable: 4514)#endif#ifdef SCI_NAMESPACEusing namespace Scintilla;#endif#define OKP     1#define NOP     0#define CHR     1#define ANY     2#define CCL     3#define BOL     4#define EOL     5#define BOT     6#define EOT     7#define BOW     8#define EOW     9#define REF     10#define CLO     11#define END     0/* * The following defines are not meant to be changeable. * They are for readability only. */#define BLKIND  0370#define BITIND  07const char bitarr[] = { 1, 2, 4, 8, 16, 32, 64, '\200' };#define badpat(x)	(*nfa = END, x)/* * Character classification table for word boundary operators BOW * and EOW is passed in by the creator of this object (Scintilla * Document). 
The Document default state is that word chars are: * 0-9, a-z, A-Z and _ */RESearch::RESearch(CharClassify *charClassTable) {	charClass = charClassTable;	Init();}RESearch::~RESearch() {	Clear();}void RESearch::Init() {	sta = NOP;                  /* status of lastpat */	bol = 0;	for (int i = 0; i < MAXTAG; i++)		pat[i] = 0;	for (int j = 0; j < BITBLK; j++)		bittab[j] = 0;}void RESearch::Clear() {	for (int i = 0; i < MAXTAG; i++) {		delete []pat[i];		pat[i] = 0;		bopat[i] = NOTFOUND;		eopat[i] = NOTFOUND;	}}bool RESearch::GrabMatches(CharacterIndexer &ci) {	bool success = true;	for (unsigned int i = 0; i < MAXTAG; i++) {		if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {			unsigned int len = eopat[i] - bopat[i];			pat[i] = new char[len + 1];			if (pat[i]) {				for (unsigned int j = 0; j < len; j++)					pat[i][j] = ci.CharAt(bopat[i] + j);				pat[i][len] = '\0';			} else {				success = false;			}		}	}	return success;}void RESearch::ChSet(unsigned char c) {	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];}void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) {	if (caseSensitive) {		ChSet(c);	} else {		if ((c >= 'a') && (c <= 'z')) {			ChSet(c);			ChSet(static_cast<unsigned char>(c - 'a' + 'A'));		} else if ((c >= 'A') && (c <= 'Z')) {			ChSet(c);			ChSet(static_cast<unsigned char>(c - 'A' + 'a'));		} else {			ChSet(c);		}	}}const unsigned char escapeValue(unsigned char ch) {	switch (ch) {	case 'a':	return '\a';	case 'b':	return '\b';	case 'f':	return '\f';	case 'n':	return '\n';	case 'r':	return '\r';	case 't':	return '\t';	case 'v':	return '\v';	}	return 0;}static int GetHexaChar(unsigned char hd1, unsigned char hd2) {	int hexValue = 0;	if (hd1 >= '0' && hd1 <= '9') {		hexValue += 16 * (hd1 - '0');	} else if (hd1 >= 'A' && hd1 <= 'F') {		hexValue += 16 * (hd1 - 'A' + 10);	} else if (hd1 >= 'a' && hd1 <= 'f') {		hexValue += 16 * (hd1 - 'a' + 10);	} else		return -1;	if (hd2 >= '0' && hd2 <= '9') {		hexValue += hd2 - '0';	} else if (hd2 >= 'A' && 
hd2 <= 'F') {		hexValue += hd2 - 'A' + 10;	} else if (hd2 >= 'a' && hd2 <= 'f') {		hexValue += hd2 - 'a' + 10;	} else		return -1;	return hexValue;}/** * Called when the parser finds a backslash not followed * by a valid expression (like \( in non-Posix mode). * @param pat: pointer on the char after the backslash. * @param incr: (out) number of chars to skip after expression evaluation. * @return the char if it resolves to a simple char, * or -1 for a char class. In this case, bittab is changed. */int RESearch::GetBackslashExpression(		const char *pat,		int &incr) {	// Since error reporting is primitive and messages are not used anyway,	// I choose to interpret unexpected syntax in a logical way instead	// of reporting errors. Otherwise, we can stick on, eg., PCRE behavior.	incr = 0;	// Most of the time, will skip the char "naturally".	int c;	int result = -1;	unsigned char bsc = *pat;	if (!bsc) {		// Avoid overrun		result = '\\';	// \ at end of pattern, take it literally		return result;	}	switch (bsc) {	case 'a':	case 'b':	case 'n':	case 'f':	case 'r':	case 't':	case 'v':		result = escapeValue(bsc);		break;	case 'x': {			unsigned char hd1 = *(pat + 1);			unsigned char hd2 = *(pat + 2);			int hexValue = GetHexaChar(hd1, hd2);			if (hexValue >= 0) {				result = hexValue;				incr = 2;	// Must skip the digits			} else {				result = 'x';	// \x without 2 digits: see it as 'x'			}		}		break;	case 'd':		for (c = '0'; c <= '9'; c++) {			ChSet(static_cast<unsigned char>(c));		}		break;	case 'D':		for (c = 0; c < MAXCHR; c++) {			if (c < '0' || c > '9') {				ChSet(static_cast<unsigned char>(c));			}		}		break;	case 's':		ChSet(' ');		ChSet('\t');		ChSet('\n');		ChSet('\r');		ChSet('\f');		ChSet('\v');		break;	case 'S':		for (c = 0; c < MAXCHR; c++) {			if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) {				ChSet(static_cast<unsigned char>(c));			}		}	case 'w':		for (c = 0; c < MAXCHR; c++) {			if (iswordc(static_cast<unsigned char>(c))) {				ChSet(static_cast<unsigned char>(c));			}		
}		break;	case 'W':		for (c = 0; c < MAXCHR; c++) {			if (!iswordc(static_cast<unsigned char>(c))) {				ChSet(static_cast<unsigned char>(c));			}		}		break;	default:		result = bsc;	}	return result;}const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {	char *mp=nfa;          /* nfa pointer       */	char *lp;              /* saved pointer     */	char *sp=nfa;          /* another one       */	char *mpMax = mp + MAXNFA - BITBLK - 10;	int tagi = 0;          /* tag stack index   */	int tagc = 1;          /* actual tag count  */	int n;	char mask;             /* xor mask -CCL/NCL */	int c1, c2, prevChar;	if (!pat || !length)		if (sta)			return 0;		else			return badpat("No previous regular expression");	sta = NOP;	const char *p=pat;     /* pattern pointer   */	for (int i=0; i<length; i++, p++) {		if (mp > mpMax)			return badpat("Pattern too long");		lp = mp;		switch (*p) {		case '.':               /* match any char  */			*mp++ = ANY;			break;		case '^':               /* match beginning */			if (p == pat)				*mp++ = BOL;			else {				*mp++ = CHR;				*mp++ = *p;			}			break;		case '$':               /* match endofline */			if (!*(p+1))				*mp++ = EOL;			else {				*mp++ = CHR;				*mp++ = *p;			}			break;		case '[':               /* match char class */			*mp++ = CCL;			prevChar = 0;
research.cxx - 源码说明

本页面展示了「porting scintilla to qt」中的 research.cxx 源码文件，采用 CXX 编程语言编写，共 981 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与Scintilla相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?