⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lexer.cs

📁 csharp compiler
💻 CS
📖 第 1 页 / 共 4 页
字号:
    {
        m_reader = reader;
        
        // Set line number info
        m_stFilenameHint = stFilenameHint;
        m_row = 1;
        m_col = 1;
        m_fStartOfLine = true;
        
        m_fIsErrorMode = false;        
        
        InitPreprocessor(stDefines);
        
        
    }
#endregion

#region Static Construction
    // Fill out keyword hash. We only need one copy for all the lexers
    static Lexer()
    {
        // Table-driven fill: entries alternate (keyword string, Token.Type).
        // The mapping is identical to a one-assignment-per-keyword setup;
        // only the construction style differs.
        object[] table = {
            "return",       Token.Type.cReturn,
            "class",        Token.Type.cClass,
            "interface",    Token.Type.cInterface,
            "struct",       Token.Type.cStruct,
            "enum",         Token.Type.cEnum,
            "delegate",     Token.Type.cDelegate,
            "event",        Token.Type.cEvent,
            "operator",     Token.Type.cOperator,

            "new",          Token.Type.cNew,
            "if",           Token.Type.cIf,
            "switch",       Token.Type.cSwitch,
            "else",         Token.Type.cElse,

            "using",        Token.Type.cUsing,
            "namespace",    Token.Type.cNamespace,

            "out",          Token.Type.cOut,
            "ref",          Token.Type.cRef,
            "params",       Token.Type.cParams,

            "get",          Token.Type.cGet,
            "set",          Token.Type.cSet,
            //"value",      Token.Type.cValue,

            "do",           Token.Type.cDo,
            "while",        Token.Type.cWhile,
            "for",          Token.Type.cFor,
            "foreach",      Token.Type.cForEach,
            "in",           Token.Type.cIn,

            "goto",         Token.Type.cGoto,
            "break",        Token.Type.cBreak,
            "continue",     Token.Type.cContinue,
            "default",      Token.Type.cDefault,
            "case",         Token.Type.cCase,

            "is",           Token.Type.cIs,
            "as",           Token.Type.cAs,

            "try",          Token.Type.cTry,
            "catch",        Token.Type.cCatch,
            "finally",      Token.Type.cFinally,
            "throw",        Token.Type.cThrow,

            // Literal keywords
            "true",         Token.Type.cBool,
            "false",        Token.Type.cBool,
            "null",         Token.Type.cNull,

            // Modifiers
            "public",       Token.Type.cAttrPublic,
            "private",      Token.Type.cAttrPrivate,
            "protected",    Token.Type.cAttrProtected,
            "static",       Token.Type.cAttrStatic,
            "virtual",      Token.Type.cAttrVirtual,
            "abstract",     Token.Type.cAttrAbstract,
            "override",     Token.Type.cAttrOverride,
            "internal",     Token.Type.cAttrInternal,
            "sealed",       Token.Type.cAttrSealed,
            "readonly",     Token.Type.cAttrReadOnly,
            "const",        Token.Type.cAttrConst,

            "typeof",       Token.Type.cTypeOf,

            // Preprocessor directives
            "#if",          Token.Type.cPP_If,
            "#elif",        Token.Type.cPP_ElseIf,
            "#else",        Token.Type.cPP_Else,
            "#endif",       Token.Type.cPP_Endif,
            "#define",      Token.Type.cPP_Define,
            "#undef",       Token.Type.cPP_Undef,
            "#region",      Token.Type.cPP_Region,
            "#endregion",   Token.Type.cPP_EndRegion,
        };

        for (int i = 0; i < table.Length; i += 2)
            m_keywords[(string) table[i]] = table[i + 1];
    }

    // If we find an identifier, we lookup in this table to see
    // if it's actually a keyword. If so, return the keyword (else return the id)
    protected static Hashtable m_keywords = new Hashtable();
#endregion
    
    // Are we in error mode (in which case we always return EOF)
    bool m_fIsErrorMode;
    // Source filename used when building FileRange info for errors
    // (see CalcCurFileRange); the lexer never opens this file itself.
    string m_stFilenameHint;


#region Errors
    // Error codes. Mostly from preprocessor / bad EOF
    internal enum ErrorCode
    {
        cUnmatchedEndRegion,            // #endregion with no matching #region
        cMissingEndifBeforeEOF,         // end-of-file before a required #endif
        cUnterminatedComment,           // multi-line comment missing its '*/'
        cPreProcDirMustBeAtStartOfLine, // '#' directive not first token on the line
        cInvalidPreProcDir,             // unknown directive text after '#'
        cUnterminatedChar,              // character constant not terminated
        cNoNewlineInString,             // raw newline inside a string literal
        cUnexpectedEOF,                 // unexpected end-of-file
        cUnrecognizedEscapeSequence,    // bad '\x' escape in a char/string
        
    }
    
    // Main error hub for lexer. All lexer errors funnel through here and are
    // handed to the driver's standard error log; presumably the log raises
    // the exception (SafeGetNextToken catches LexerException) — behavior
    // lives in Blue.Driver, not visible here.
    internal void ThrowError(LexerException e)
    {
        Blue.Driver.StdErrorLog.ThrowError(e);
    }

    // We have an #endregion, but no matching #region for it.
    // (Error code and message both describe an unmatched #endregion.)
    LexerException E_MissingEndRegion()
    {
        return new LexerException(ErrorCode.cUnmatchedEndRegion, CalcCurFileRange(), "Missing a #region for this #endregion.");
    }
    
    // Hit end-of-file while an #if block was still open.
    LexerException E_MissingEndifBeforeEOF()
    {
        return new LexerException(ErrorCode.cMissingEndifBeforeEOF, CalcCurFileRange(),
            "Expected #endif before end-of-file.");
    }
    
    // A multi-line comment reached end-of-file without a closing '*/'.
    LexerException E_UnterminatedComment()
    {
        return new LexerException(ErrorCode.cUnterminatedComment, CalcCurFileRange(),
            "Must terminate multi-line comment with '*/' before end-of-file.");
    }
        
    // A '#' directive appeared after other (non-whitespace) tokens on its line.
    LexerException E_PreProcDirMustBeAtStartOfLine()
    {
        return new LexerException(ErrorCode.cPreProcDirMustBeAtStartOfLine, CalcCurFileRange(),
            "Preprocessor directives must be the first non-whitespace token in a line.");
    }
    
    // The text after '#' (stHint) is not a recognized preprocessor directive.
    LexerException E_InvalidPreProcDir(string stHint)
    {
        return new LexerException(ErrorCode.cInvalidPreProcDir, CalcCurFileRange(),
            "'" + stHint + "' is not a valid preprocessor directive.");        
    }

    // A character constant was not properly terminated.
    LexerException E_UnterminatedChar()
    {
        return new LexerException(ErrorCode.cUnterminatedChar, CalcCurFileRange(),
            "Unterminated character constant.");
    }
    
    
    // A raw newline appeared inside a string literal.
    LexerException E_NoNewlineInString()
    {
        return new LexerException(ErrorCode.cNoNewlineInString, CalcCurFileRange(),
            "Can not have a newline in a string.");
    }
    
    // End-of-file showed up where the lexer required more input.
    LexerException E_UnexpectedEOF()
    {
        return new LexerException(ErrorCode.cUnexpectedEOF, CalcCurFileRange(),
            "Unexpected EOF.");
    }
    
    // The escape '\<ch>' is not one the lexer recognizes.
    LexerException E_UnrecognizedEscapeSequence(char ch)
    {
        return new LexerException(ErrorCode.cUnrecognizedEscapeSequence, CalcCurFileRange(),
            "Unrecognized escape sequence '\\" + ch + "'.");
    }
    

#endregion
            
#region Data for stream    
    // The lexer is really just a high level wrapper around the TextReader
    protected TextReader m_reader;
    
    // Used to track where in the file we are. Both are 1-based:
    // the constructor starts them at 1 and Read() resets m_col to 1
    // after each '\n'.
    int m_row;
    int m_col;
    
    bool m_fStartOfLine; // are we the first token on a new line?
    
    // Wrappers around the TextReader to track line number info
    int Read()
    {
        // Pull one character and keep the (row, col) cursor in sync.
        int iCh = m_reader.Read();
        if (iCh == '\n')
        {
            // A newline puts us at column 1 of the next row, and the next
            // token will be the first on its line.
            m_row++;
            m_col = 1;
            m_fStartOfLine = true;
        }
        else
        {
            m_col++;
        }
        return iCh;
    }
    
    // Look at the next character without consuming it (no cursor update).
    int Peek()
    {
        return m_reader.Peek();
    }
    
    string ReadLine()
    {
        // Reading a line consumes its '\n', so advance the cursor to the
        // start of the next row.
        string stLine = m_reader.ReadLine();
        m_row++;
        m_col = 1;
        m_fStartOfLine = true;
        return stLine;
    }
    
    // Cache this at the beginning of a lexeme
    protected CursorPos m_StartPos;
    // Build the source span from the cached start of the current lexeme
    // (m_StartPos) up to the current cursor position.
    protected FileRange CalcCurFileRange()
    {
        FileRange range = new FileRange();
        range.Filename = m_stFilenameHint;
        range.RowStart = m_StartPos.row;
        range.ColStart = m_StartPos.col;
        range.RowEnd = m_row;
        range.ColEnd = m_col;
        return range;
    }
#endregion

#region Public Interface Methods
    // Get the next token, consuming a previously peeked token first.
    public Token GetNextToken()
    {
        if (m_tknNext == null)
            return SafeGetNextToken();

        // Hand back (and clear) the token buffered by PeekNextToken.
        Token tknBuffered = m_tknNext;
        m_tknNext = null;
        return tknBuffered;
    }
    
    // Peek at the next token without consuming it; repeated peeks return
    // the same buffered token.
    public Token PeekNextToken()
    {
        if (m_tknNext != null)
            return m_tknNext;

        m_tknNext = SafeGetNextToken();
        return m_tknNext;
    }
    
    // For peeking, we remember the next token.
    protected Token m_tknNext = null;
    
    
    // Safe wrapper around GetNextToken.
    // Catches lexer exceptions and degrades them to EOF tokens; once an
    // error has occurred we stay in error mode and only ever produce EOF.
    private Token SafeGetNextToken()
    {
        if (!m_fIsErrorMode)
        {
            try
            {
                // Do the real work.
                Token t = GetNextToken_PreprocessorFilter();
                if (t != null)
                    return t;
            }
            catch (ManualParser.LexerException)
            {
                m_fIsErrorMode = true;
            }
        }

        // Error mode, a thrown LexerException, or a null token all end here.
        return new Token(Token.Type.cEOF, CalcCurFileRange());
    }
#endregion
   
#region Helper Functions   
    // Helper funcs
    // True for the four whitespace characters the lexer recognizes:
    // space, tab, newline, carriage return.
    public static bool IsWhitespace(int iCh)
    {
        switch (iCh)
        {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                return true;
            default:
                return false;
        }
    }
    
    // True only for the ASCII digits '0'..'9'.
    public static bool IsDigit(int iCh)
    {
        return !(iCh < '0' || iCh > '9');
    }
    
    // Return -1 if not a hex digit, else return 0..15.
    // Accepts both upper- and lower-case letter digits.
    public static int AsHexDigit(int iCh)
    {
        if ('0' <= iCh && iCh <= '9')
            return iCh - '0';
        if ('a' <= iCh && iCh <= 'f')
            return (iCh - 'a') + 10;
        if ('A' <= iCh && iCh <= 'F')
            return (iCh - 'A') + 10;
        return -1;
    }
    
    // An identifier may start with an ASCII letter or an underscore.
    // (ASCII-only by design; matches the original lexer's rule.)
    public static bool IsFirstIdChar(int iCh)
    {
        bool fLower = (iCh >= 'a') && (iCh <= 'z');
        bool fUpper = (iCh >= 'A') && (iCh <= 'Z');
        return fLower || fUpper || (iCh == '_');
    }
    
    // After the first character, digits are also allowed in identifiers.
    public static bool IsIdChar(int iCh)
    {
        if (IsDigit(iCh))
            return true;
        return IsFirstIdChar(iCh);
    }
#endregion

#region Preprocessor Layer
//-----------------------------------------------------------------------------
// The preprocessor works as a middle layer around GetNextTokenWorker()
// It manages a small symbol table (for #define / #undef) as well 
// as conditionals (#if,#elif,#else,#endif) and strips away #region/#endregion
// Most of the errors that can occur in the lexer are in the preprocessor
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Construction. Supply an optional list of predefined symbols
//-----------------------------------------------------------------------------
    protected void InitPreprocessor(string [] stDefines)
    {
        // Fresh symbol table for this lexer's preprocessor state.
        m_tblPreprocSymbols = new Hashtable();
        
        // Always add this as a predefined symbol
        AddSymbol("__BLUE__");
        
        // Then add the caller-supplied symbols, if there are any.
        if (stDefines == null)
            return;
        
        for (int i = 0; i < stDefines.Length; i++)
            AddSymbol(stDefines[i]);
    }

#region Preprocessor Filter
//-----------------------------------------------------------------------------
// When we're skipping over text (in a false branch of an #if), the text
// doesn't have to lex properly. But we still have to recognize nested #if, 
// and the closing #endif, as well as an EOF. 
// So we have a modified lexer to lex stuff in dead code.
// Note that this lexer must preserve the expression after an #elif
// This lexer is also #if..#endif nest aware
//-----------------------------------------------------------------------------
    protected Token.Type GetNextDeadToken()
    {
        int iRowBefore = m_row;
        
        int cIfDepth = 0;
        string st;
        do
        {
        // Does this line start with a preprocessor directive?
        // If so, handle it delicately so that we can read the expression afterwards
            SkipWhiteSpace();
#if true
            int iCh;
            do {
                iCh = Read();
            } while (iCh == '\n');
            
            // Skip past opening whitespace
            while(iCh == ' ' || iCh == '\t')
                iCh = Read();
                
            
            if (iCh == '#')
            {                
                // Note that we don't want to call GetNextTokenWorker() because

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -