📄 lexperl.cxx.svn-base
字号:
// Scintilla source code edit control
/** @file LexPerl.cxx
** Lexer for subset of Perl.
**/
// Copyright 1998-2005 by Neil Hodgson <neilh@scintilla.org>
// Lexical analysis fixes by Kein-Hong Man <mkh@pl.jaring.my>
// The License.txt file describes the conditions under which this software may be distributed.
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <stdarg.h>
#include "Platform.h"
#include "PropSet.h"
#include "Accessor.h"
#include "KeyWords.h"
#include "Scintilla.h"
#include "SciLexer.h"
#define PERLNUM_BINARY 1 // order is significant: 1-4 cannot have a dot
#define PERLNUM_HEX 2
#define PERLNUM_OCTAL 3
#define PERLNUM_FLOAT 4 // actually exponent part
#define PERLNUM_DECIMAL 5 // 1-5 are numbers; 6-7 are strings
#define PERLNUM_VECTOR 6
#define PERLNUM_V_VECTOR 7
#define PERLNUM_BAD 8
#define BACK_NONE 0 // lookback state for bareword disambiguation:
#define BACK_OPERATOR 1 // whitespace/comments are insignificant
#define BACK_KEYWORD 2 // operators/keywords are needed for disambiguation
#define HERE_DELIM_MAX 256
static inline bool isEOLChar(char ch) {
return (ch == '\r') || (ch == '\n');
}
static bool isSingleCharOp(char ch) {
char strCharSet[2];
strCharSet[0] = ch;
strCharSet[1] = '\0';
return (NULL != strstr("rwxoRWXOezsfdlpSbctugkTBMAC", strCharSet));
}
static inline bool isPerlOperator(char ch) {
if (ch == '^' || ch == '&' || ch == '\\' ||
ch == '(' || ch == ')' || ch == '-' || ch == '+' ||
ch == '=' || ch == '|' || ch == '{' || ch == '}' ||
ch == '[' || ch == ']' || ch == ':' || ch == ';' ||
ch == '>' || ch == ',' ||
ch == '?' || ch == '!' || ch == '.' || ch == '~')
return true;
// these chars are already tested before this call
// ch == '%' || ch == '*' || ch == '<' || ch == '/' ||
return false;
}
static bool isPerlKeyword(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) {
char s[100];
unsigned int i, len = end - start;
if (len > 30) { len = 30; }
for (i = 0; i < len; i++, start++) s[i] = styler[start];
s[i] = '\0';
return keywords.InList(s);
}
// Note: as lexer uses chars, UTF-8 bytes are considered as <0 values
// Note: iswordchar() was used in only one place in LexPerl, it is
// unnecessary as '.' is processed as the concatenation operator, so
// only isWordStart() is used in LexPerl
static inline bool isWordStart(char ch) {
return !isascii(ch) || isalnum(ch) || ch == '_';
}
static inline bool isEndVar(char ch) {
return isascii(ch) && !isalnum(ch) && ch != '#' && ch != '$' &&
ch != '_' && ch != '\'';
}
static inline bool isNonQuote(char ch) {
return !isascii(ch) || isalnum(ch) || ch == '_';
}
static inline char actualNumStyle(int numberStyle) {
if (numberStyle == PERLNUM_VECTOR || numberStyle == PERLNUM_V_VECTOR) {
return SCE_PL_STRING;
} else if (numberStyle == PERLNUM_BAD) {
return SCE_PL_ERROR;
}
return SCE_PL_NUMBER;
}
static bool isMatch(Accessor &styler, int lengthDoc, int pos, const char *val) {
if ((pos + static_cast<int>(strlen(val))) >= lengthDoc) {
return false;
}
while (*val) {
if (*val != styler[pos++]) {
return false;
}
val++;
}
return true;
}
static char opposite(char ch) {
if (ch == '(')
return ')';
if (ch == '[')
return ']';
if (ch == '{')
return '}';
if (ch == '<')
return '>';
return ch;
}
static void ColourisePerlDoc(unsigned int startPos, int length, int initStyle,
WordList *keywordlists[], Accessor &styler) {
// Lexer for perl often has to backtrack to start of current style to determine
// which characters are being used as quotes, how deeply nested is the
// start position and what the termination string is for here documents
WordList &keywords = *keywordlists[0];
class HereDocCls {
public:
int State; // 0: '<<' encountered
// 1: collect the delimiter
// 2: here doc text (lines after the delimiter)
char Quote; // the char after '<<'
bool Quoted; // true if Quote in ('\'','"','`')
int DelimiterLength; // strlen(Delimiter)
char *Delimiter; // the Delimiter, 256: sizeof PL_tokenbuf
HereDocCls() {
State = 0;
Quote = 0;
Quoted = false;
DelimiterLength = 0;
Delimiter = new char[HERE_DELIM_MAX];
Delimiter[0] = '\0';
}
~HereDocCls() {
delete []Delimiter;
}
};
HereDocCls HereDoc; // TODO: FIFO for stacked here-docs
class QuoteCls {
public:
int Rep;
int Count;
char Up;
char Down;
QuoteCls() {
this->New(1);
}
void New(int r) {
Rep = r;
Count = 0;
Up = '\0';
Down = '\0';
}
void Open(char u) {
Count++;
Up = u;
Down = opposite(Up);
}
};
QuoteCls Quote;
int state = initStyle;
char numState = PERLNUM_DECIMAL;
int dotCount = 0;
unsigned int lengthDoc = startPos + length;
//int sookedpos = 0; // these have no apparent use, see POD state
//char sooked[100];
//sooked[sookedpos] = '\0';
// If in a long distance lexical state, seek to the beginning to find quote characters
// Perl strings can be multi-line with embedded newlines, so backtrack.
// Perl numbers have additional state during lexing, so backtrack too.
if (state == SCE_PL_HERE_Q || state == SCE_PL_HERE_QQ || state == SCE_PL_HERE_QX) {
while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_PL_HERE_DELIM)) {
startPos--;
}
startPos = styler.LineStart(styler.GetLine(startPos));
state = styler.StyleAt(startPos - 1);
}
if ( state == SCE_PL_STRING_Q
|| state == SCE_PL_STRING_QQ
|| state == SCE_PL_STRING_QX
|| state == SCE_PL_STRING_QR
|| state == SCE_PL_STRING_QW
|| state == SCE_PL_REGEX
|| state == SCE_PL_REGSUBST
|| state == SCE_PL_STRING
|| state == SCE_PL_BACKTICKS
|| state == SCE_PL_CHARACTER
|| state == SCE_PL_NUMBER
|| state == SCE_PL_IDENTIFIER
|| state == SCE_PL_ERROR
) {
while ((startPos > 1) && (styler.StyleAt(startPos - 1) == state)) {
startPos--;
}
state = SCE_PL_DEFAULT;
}
// lookback at start of lexing to set proper state for backflag
// after this, they are updated when elements are lexed
int backflag = BACK_NONE;
unsigned int backPos = startPos;
if (backPos > 0) {
backPos--;
int sty = SCE_PL_DEFAULT;
while ((backPos > 0) && (sty = styler.StyleAt(backPos),
sty == SCE_PL_DEFAULT || sty == SCE_PL_COMMENTLINE))
backPos--;
if (sty == SCE_PL_OPERATOR)
backflag = BACK_OPERATOR;
else if (sty == SCE_PL_WORD)
backflag = BACK_KEYWORD;
}
styler.StartAt(startPos);
char chPrev = styler.SafeGetCharAt(startPos - 1);
if (startPos == 0)
chPrev = '\n';
char chNext = styler[startPos];
styler.StartSegment(startPos);
for (unsigned int i = startPos; i < lengthDoc; i++) {
char ch = chNext;
// if the current character is not consumed due to the completion of an
// earlier style, lexing can be restarted via a simple goto
restartLexer:
chNext = styler.SafeGetCharAt(i + 1);
char chNext2 = styler.SafeGetCharAt(i + 2);
if (styler.IsLeadByte(ch)) {
chNext = styler.SafeGetCharAt(i + 2);
chPrev = ' ';
i += 1;
continue;
}
if ((chPrev == '\r' && ch == '\n')) { // skip on DOS/Windows
styler.ColourTo(i, state);
chPrev = ch;
continue;
}
if (HereDoc.State == 1 && isEOLChar(ch)) {
// Begin of here-doc (the line after the here-doc delimiter):
// Lexically, the here-doc starts from the next line after the >>, but the
// first line of here-doc seem to follow the style of the last EOL sequence
HereDoc.State = 2;
if (HereDoc.Quoted) {
if (state == SCE_PL_HERE_DELIM) {
// Missing quote at end of string! We are stricter than perl.
// Colour here-doc anyway while marking this bit as an error.
state = SCE_PL_ERROR;
}
styler.ColourTo(i - 1, state);
switch (HereDoc.Quote) {
case '\'':
state = SCE_PL_HERE_Q ;
break;
case '"':
state = SCE_PL_HERE_QQ;
break;
case '`':
state = SCE_PL_HERE_QX;
break;
}
} else {
styler.ColourTo(i - 1, state);
switch (HereDoc.Quote) {
case '\\':
state = SCE_PL_HERE_Q ;
break;
default :
state = SCE_PL_HERE_QQ;
}
}
}
if (state == SCE_PL_DEFAULT) {
if ((isascii(ch) && isdigit(ch)) || (isascii(chNext) && isdigit(chNext) &&
(ch == '.' || ch == 'v'))) {
state = SCE_PL_NUMBER;
backflag = BACK_NONE;
numState = PERLNUM_DECIMAL;
dotCount = 0;
if (ch == '0') { // hex,bin,octal
if (chNext == 'x') {
numState = PERLNUM_HEX;
} else if (chNext == 'b') {
numState = PERLNUM_BINARY;
} else if (isascii(chNext) && isdigit(chNext)) {
numState = PERLNUM_OCTAL;
}
if (numState != PERLNUM_DECIMAL) {
i++;
ch = chNext;
chNext = chNext2;
}
} else if (ch == 'v') { // vector
numState = PERLNUM_V_VECTOR;
}
} else if (isWordStart(ch)) {
// if immediately prefixed by '::', always a bareword
state = SCE_PL_WORD;
if (chPrev == ':' && styler.SafeGetCharAt(i - 2) == ':') {
state = SCE_PL_IDENTIFIER;
}
unsigned int kw = i + 1;
// first check for possible quote-like delimiter
if (ch == 's' && !isNonQuote(chNext)) {
state = SCE_PL_REGSUBST;
Quote.New(2);
} else if (ch == 'm' && !isNonQuote(chNext)) {
state = SCE_PL_REGEX;
Quote.New(1);
} else if (ch == 'q' && !isNonQuote(chNext)) {
state = SCE_PL_STRING_Q;
Quote.New(1);
} else if (ch == 'y' && !isNonQuote(chNext)) {
state = SCE_PL_REGSUBST;
Quote.New(2);
} else if (ch == 't' && chNext == 'r' && !isNonQuote(chNext2)) {
state = SCE_PL_REGSUBST;
Quote.New(2);
kw++;
} else if (ch == 'q' && (chNext == 'q' || chNext == 'r' || chNext == 'w' || chNext == 'x') && !isNonQuote(chNext2)) {
if (chNext == 'q') state = SCE_PL_STRING_QQ;
else if (chNext == 'x') state = SCE_PL_STRING_QX;
else if (chNext == 'r') state = SCE_PL_STRING_QR;
else if (chNext == 'w') state = SCE_PL_STRING_QW;
Quote.New(1);
kw++;
} else if (ch == 'x' && (chNext == '=' || // repetition
!isWordStart(chNext) ||
(isdigit(chPrev) && isdigit(chNext)))) {
state = SCE_PL_OPERATOR;
}
// if potentially a keyword, scan forward and grab word, then check
// if it's really one; if yes, disambiguation test is performed
// otherwise it is always a bareword and we skip a lot of scanning
// note: keywords assumed to be limited to [_a-zA-Z] only
if (state == SCE_PL_WORD) {
while (isWordStart(styler.SafeGetCharAt(kw))) kw++;
if (!isPerlKeyword(styler.GetStartSegment(), kw, keywords, styler)) {
state = SCE_PL_IDENTIFIER;
}
}
// if already SCE_PL_IDENTIFIER, then no ambiguity, skip this
// for quote-like delimiters/keywords, attempt to disambiguate
// to select for bareword, change state -> SCE_PL_IDENTIFIER
if (state != SCE_PL_IDENTIFIER && i > 0) {
unsigned int j = i;
bool moreback = false; // true if passed newline/comments
bool brace = false; // true if opening brace found
char ch2;
// first look backwards past whitespace/comments for EOLs
// if BACK_NONE, neither operator nor keyword, so skip test
if (backflag != BACK_NONE) {
while (--j > backPos) {
if (isEOLChar(styler.SafeGetCharAt(j)))
moreback = true;
}
ch2 = styler.SafeGetCharAt(j);
if (ch2 == '{' && !moreback) {
// {bareword: possible variable spec
brace = true;
} else if ((ch2 == '&' && styler.SafeGetCharAt(j - 1) != '&')
// &bareword: subroutine call
|| (ch2 == '>' && styler.SafeGetCharAt(j - 1) == '-')
// ->bareword: part of variable spec
|| (ch2 == 'b' && styler.Match(j - 2, "su"))) {
// sub bareword: subroutine declaration
// (implied BACK_KEYWORD, no keywords end in 'sub'!)
state = SCE_PL_IDENTIFIER;
}
// if status still ambiguous, look forward after word past
// tabs/spaces only; if ch2 isn't one of '[{(,' it can never
// match anything, so skip the whole thing
j = kw;
if (state != SCE_PL_IDENTIFIER
&& (ch2 == '{' || ch2 == '(' || ch2 == '['|| ch2 == ',')
&& kw < lengthDoc) {
while (ch2 = styler.SafeGetCharAt(j),
(ch2 == ' ' || ch2 == '\t') && j < lengthDoc) {
j++;
}
if ((ch2 == '}' && brace)
// {bareword}: variable spec
|| (ch2 == '=' && styler.SafeGetCharAt(j + 1) == '>')) {
// [{(, bareword=>: hash literal
state = SCE_PL_IDENTIFIER;
}
}
}
}
backflag = BACK_NONE;
// an identifier or bareword
if (state == SCE_PL_IDENTIFIER) {
if ((!isWordStart(chNext) && chNext != '\'')
|| (chNext == '.' && chNext2 == '.')) {
// We need that if length of word == 1!
// This test is copied from the SCE_PL_WORD handler.
styler.ColourTo(i, SCE_PL_IDENTIFIER);
state = SCE_PL_DEFAULT;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -