📄 mal_parser.mx
字号:
@' The contents of this file are subject to the MonetDB Public License
@' Version 1.1 (the "License"); you may not use this file except in
@' compliance with the License. You may obtain a copy of the License at
@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html
@'
@' Software distributed under the License is distributed on an "AS IS"
@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
@' License for the specific language governing rights and limitations
@' under the License.
@'
@' The Original Code is the MonetDB Database System.
@'
@' The Initial Developer of the Original Code is CWI.
@' Portions created by CWI are Copyright (C) 1997-2007 CWI.
@' All Rights Reserved.
@a M. L. Kersten
@v 1.1
@-
@{
@+ The Parser Implementation
The parser (and its target language) are designed for speed of analysis,
because parsing is a dominant cost factor in applications interacting with
MonetDB. For the language design it meant that look-ahead and ambiguity
are avoided wherever possible without compromising readability or
ease of debugging.
The syntax layout of a MAL program consists of a module name,
a list of include commands, a list of function/ pattern/ command/ factory
definitions and concludes with the statements to be executed as
the main body of the program. All components are optional.
The program may be decorated with comments, which start with a # and
run till the end of the current line. Comments are retained
in the code block for debugging, but can be removed with an optimizer to reduce space
and interpretation overhead.
@+ The lexical analyzer
The implementation of the lexical analyzer is straightforward:
the input is taken from a client input buffer.
It is assumed thatthis buffer contains the complete MIL structure to be parsed.@h#ifndef _MAL_PARSER_H#define _MAL_PARSER_H#include "mal_import.h"#define MAXERRORS 250#define CURRENT(c) (c->fdin->buf + c->fdin->pos + c->yycur)#define currChar(X) (*CURRENT(X))#define peekChar(X) (*((X)->fdin->buf + (X)->fdin->pos + (X)->yycur+1))#define nextChar(X) X->yycur++#define prevChar(X) if(X->yycur) X->yycur--mal_export void initParser(void); /* needed in src/mal/mal.c */mal_export int parseMAL(Client cntxt, Symbol curPrg);mal_export void echoInput(Client cntxt);mal_export void debugParser(int i);mal_export str parseError(Client cntxt, str msg);mal_export void advance(Client cntxt, int length);mal_export void skipSpace(Client cntxt);mal_export void skipToEnd(Client cntxt);mal_export int idLength(Client cntxt);mal_export int stringLength(Client cntxt);mal_export str idCopy(Client cntxt, int len);mal_export str strCopy(Client cntxt, int len);mal_export int cstToken(Client cntxt, ValPtr val);mal_export int charCst(Client cntxt, ValPtr val);mal_export int operatorLength(Client cntxt);mal_export str operatorCopy(Client cntxt, int length);mal_export int keyphrase(Client cntxt, str kw, int length);mal_export int keyphrase1(Client cntxt, str kw);mal_export int keyphrase2(Client cntxt, str kw);mal_export int MALkeyword(Client cntxt, str kw, int length);mal_export int MALlookahead(Client cntxt, str kw, int length);mal_export str lastline(Client cntxt);mal_export long position(Client cntxt);#endif /* _MAL_PARSER_H */@- lexical utilitiesBefore a line is parsed we check for a request to echo it.This command should be executed at the beginning of a parserequest and each time we encounter EOL.@c#include "mal_config.h"#include "mal_parser.h"#include "mal_resolve.h"#include "mal_linker.h"#include "mal_atom.h" /* for malAtomDefinition(), malAtomArray(), malAtomProperty() */#include "mal_interpreter.h" /* for showErrors() */#include "mal_instruction.h" /* for pushEndInstruction(), 
findVariableLength() */#include "mal_namespace.h"#include "mal_utils.h"#define FATALINPUT MAXERRORS+1#define NL(X) ((X)=='\n' || (X)=='\r')void echoInput(Client cntxt){ if (cntxt->listing & LIST_INPUT) { char *c = CURRENT(cntxt); stream_printf(cntxt->fdout,"#"); while (*c && !NL(*c)) { stream_printf(cntxt->fdout, "%c", *c++); } stream_printf(cntxt->fdout, "\n"); }}INLINE void skipSpace(Client cntxt){ char *s= &currChar(cntxt); for (;;) { switch (*s++) { case ' ': case '\t': case '\n': case '\r': nextChar(cntxt); break; default: return; } }}INLINE void advance(Client cntxt, int length){ cntxt->yycur += length; skipSpace(cntxt);}@-The most recurring situation is to recognize identifiers.This process is split into a few steps to simplify subsequentconstruction and comparison.IdLength searches the end of an identifier without changingthe cursor into the input pool.IdCopy subsequently prepares a GDK string for inclusion in theinstruction datastructures.@cshort opCharacter[256];short idCharacter[256];short idCharacter2[256];void initParser(){ int i; for (i = 0; i < 256; i++){ idCharacter2[i]= isalpha(i) || isdigit(i); idCharacter[i] = isalpha(i); } for (i = 0; i < 256; i++) switch(i){ case '-': case '!': case '\\': case '$': case '%': case '^': case '*': case '~': case '+': case '&': case '|': case '<': case '>': case '=': case '/': case ':': opCharacter[i]=1; } idCharacter[TMPMARKER]=1; idCharacter2[TMPMARKER]=1;}#undef isdigit#define isdigit(X) ((X)>='0' && (X)<='9')int idLength(Client cntxt){ str s,t; skipSpace(cntxt); s = CURRENT(cntxt); t=s; if (!idCharacter[(int) (*s)]) return 0; s++; while (idCharacter2[(int) (*s)] ) s++; return s-t;}@-Simple type identifiers can not be marked with a type variable.@cint typeidLength(Client cntxt){ int l; str s; skipSpace(cntxt); s = CURRENT(cntxt); if (!idCharacter[(int) (*s)]) return 0; l = 1; s++; idCharacter[TMPMARKER] = 0; while (idCharacter[(int) (*s)] || isdigit(*s)) { s++; l++; } idCharacter[TMPMARKER]=1; return l;}str 
idCopy(Client cntxt, int length){ str s= GDKmalloc(length+1); memcpy(s, CURRENT(cntxt),(size_t) length); s[length]=0; advance(cntxt,length); return s;}int MALkeyword(Client cntxt, str kw, int length){ skipSpace(cntxt); if (MALlookahead(cntxt, kw, length)) { advance(cntxt, length); return 1; } return 0;}int MALlookahead(Client cntxt, str kw, int length){ int i; skipSpace(cntxt); /* avoid double test or use lowercase only. */ if (currChar(cntxt) == *kw && strncmp(CURRENT(cntxt), kw, length) == 0 && !idCharacter[(int) (CURRENT(cntxt)[length])] && !isdigit((int) (CURRENT(cntxt)[length])) ) { return 1; } /* check for captialized versions */ for (i = 0; i < length; i++) if (tolower(CURRENT(cntxt)[i]) != kw[i]) return 0; if (!idCharacter[(int) (CURRENT(cntxt)[length])] && !isdigit((int) (CURRENT(cntxt)[length])) ) { return 1; } return 0;}@-Keyphrase testing is limited to a few characters only(check manually). To speed this up we use a pipelined andinline macros.@cINLINE int keyphrase1(Client cntxt, str kw){ skipSpace(cntxt); if (currChar(cntxt) == *kw) { advance(cntxt,1); return 1; } return 0;}INLINE int keyphrase2(Client cntxt, str kw){ skipSpace(cntxt); if (CURRENT(cntxt)[0] == kw[0] && CURRENT(cntxt)[1] == kw[1]) { advance(cntxt,2); return 1; } return 0;}INLINE int keyphrase(Client cntxt, str kw,int length){ skipSpace(cntxt); if( strncmp(CURRENT(cntxt),kw,length)== 0){ advance(cntxt,length); return 1; } return 0;}@-A similar approach is used for string literals.Beware, string lengths returned include thebrackets and escapes. They are eaten away in strCopy.We should provide the C-method to split strings andconcatenate them upon retrieval[todo]@cint stringLength(Client cntxt){ int l=0; int quote =0; str s; skipSpace(cntxt); s = CURRENT(cntxt); if( *s != '"') return 0; s++; while( *s ){ if( quote ){ l++; s++; quote=0; } else { if( *s == '"' ) break; quote= *s == '\\'; l++; s++; } } return l+2;}@-Beware, the idcmp routine uses a short cast to compare multiple bytesat once. 
This may cause problems when the net string length is zero.@cstr strCopy(Client cntxt, int length){ str s; int i; i = length<4 ? 4: length; s = GDKzalloc(i); if (s == 0) GDKfatal("FATAL:strCopy:"); memcpy(s, CURRENT(cntxt) + 1, (size_t) (length - 2) ); mal_unquote(s); return s;}@-And a similar approach is used for operator names.A lookup table is considered, because it generally isfaster then a non-dense switch.@cint operatorLength(Client cntxt){ int l=0; str s; skipSpace(cntxt); for (s = CURRENT(cntxt); *s; s++) { if( opCharacter[(int)(*s)] ) l++; else return l; } return l;}str operatorCopy(Client cntxt, int length){ return idCopy(cntxt,length);}@-For error reporting we may have to find the start of the previous line,which, ofcourse, is easy given the client buffer.The remaining functions are self-explanatory.@cstr lastline(Client cntxt){ str s = CURRENT(cntxt); if (NL(*s)) s++; while (s && s > cntxt->fdin->buf && !NL(*s)) s--; if (NL(*s)) s++; return s;}long position(Client cntxt){ str s = lastline(cntxt); return (long) (CURRENT(cntxt) - s);}#if HAVE_STRTOLL && !HAVE_DECL_STRTOLLextern long long strtoll(const char *, char **, int);#endif@-Upon encountering an error we skip to the nearest semicolon,or comment terminated by a new line@cINLINE void skipToEnd(Client cntxt){ char c; while( (c= *CURRENT(cntxt)) != ';' && c) nextChar(cntxt); if(c) nextChar(cntxt);}@-The lexical analyser for constants is a little more complex.Aside from getting its length, we need an indication of its type.The constant structure is initialized for later use.@cint cstToken(Client cntxt, ValPtr cst){ int i = 0; long long l; int hex=0; str s = CURRENT(cntxt); cst->vtype = TYPE_int; switch(*s){ case '"': cst->vtype= TYPE_str; i= stringLength(cntxt); cst->val.sval =strCopy(cntxt, i); cst->len= strlen(cst->val.sval); return i; case '\'': return charCst(cntxt,cst); case '-': i++; s++; case '0': if( (s[1] == 'x' || s[1] == 'X')){ /* deal with hex */ hex= TRUE; i+=2; s+=2; } case '1': case '2': 
case '3': case '4': case '5': case '6': case '7': case '8': case '9': if( hex) while (isdigit((int)*s) || isalpha((int)*s) ) { if( !((tolower(*s) >= 'a' && tolower(*s) <= 'f') || isdigit((int)*s) ) ) break; i++; s++; } else while (isdigit((int)*s) ) { i++; s++; } if( hex) goto handleInts; case '.': if (*s == '.' && isdigit(*(s+1)) ) { i++; s++; while (isdigit(*s)) { i++; s++; } cst->vtype = TYPE_flt; } if (*s == 'e' || *s == 'E') { i++; s++; if (*s == '-' || *s == '+'){ i++; s++; } cst->vtype = TYPE_dbl; while (isdigit(*s)) { i++; s++; } } if( cst->vtype == TYPE_flt) { int len= i; float *pval= 0; fltFromStr(CURRENT(cntxt), &len, &pval); cst->val.fval= *pval; if( pval) GDKfree(pval); } if( cst->vtype == TYPE_dbl){ int len= i; double *pval= 0; dblFromStr(CURRENT(cntxt), &len, &pval); cst->val.dval= *pval; if( pval) GDKfree(pval); if( cst->val.dval> FLT_MIN && cst->val.dval<= FLT_MAX ){ cst->vtype= TYPE_flt; cst->val.fval = (flt) cst->val.dval; } } if (*s == '@') { cst->vtype = TYPE_oid; errno = 0; cst->val.lval = strtoll(CURRENT(cntxt),NULL,0); if( cst->val.lval <0 || errno== ERANGE ) cst->val.oval= oid_nil; else cst->val.oval= (oid) cst->val.lval; i++; s++; while (isdigit(*s)) { i++; s++; } return i; } if (*s == 'L') { if( cst->vtype == TYPE_int) cst->vtype = TYPE_lng; if( cst->vtype == TYPE_flt) cst->vtype = TYPE_dbl; i++; s++; if (*s == 'L') { i++; s++; } if( cst->vtype == TYPE_dbl ){ int len= i; double *pval= 0; dblFromStr(CURRENT(cntxt), &len, &pval); cst->val.dval= *pval; if( pval) GDKfree(pval); } else { int len= i; lng *pval= 0; lngFromStr(CURRENT(cntxt), &len, &pval); cst->val.lval= *pval; if( pval) GDKfree(pval); } return i; }handleInts: if( cst->vtype == TYPE_int || cst->vtype == TYPE_lng){ l = strtoll(CURRENT(cntxt),NULL,0); if( l> INT_MIN && l<= INT_MAX ){ cst->vtype= TYPE_int; cst->val.ival = (int)l; } else { cst->vtype= TYPE_lng; cst->val.lval = l; } } return i; case 'f': if( strncmp(s,"false",5)==0 && !isalnum((int)*(s+5)) && *(s+5)!= '_'){ cst->vtype 
= TYPE_bit; cst->val.cval[0] = 0; cst->len = 1; return 5; } return 0; case 't': if( strncmp(s,"true",4)==0 && !isalnum((int)*(s+4)) && *(s+4)!= '_'){ cst->vtype = TYPE_bit; cst->val.cval[0] = 1; cst->len = 1; return 4; } return 0; case 'n': if( strncmp(s,"nil",3)==0 && !isalnum((int)*(s+3)) && *(s+3)!= '_'){ cst->vtype = TYPE_void; cst->val.oval = oid_nil; return 3; } } return 0;}#define cstCopy(C,I) idCopy(C,I)@- Type qualifierTypes are recognized as identifiers preceded by a colon.They may be extended with a property listand 'any' types can be marked with an alias.The type qualifier parser returns the encoded type as a short 32-bit integer. The syntax structure is@multitable @columnfractions 0.15 0.8@item typeQualifier @tab : typeName propQualifier @item typeName@tab : scalarType | collectionType | anyType@item scalarType@tab : ':' @sc{ identifier} @item collectionType @tab : ':' @sc{ bat} ['[' col ',' col ']'] @item anyType@tab : ':' @sc{ any} [typeAlias] @item col@tab : scalarType | anyType
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -