// 📄 lex.h

// 📁 一个很好的协议,数据包解码工具,可以分析7号(ISUP,MTP,...), TCP/UDP等各种协议,特别的是还能支持自定义的二进制数据报,可以通过插件无限扩充协议库.
// 💻 H
// 字号:
/*==================================================================
=  文件名  : Lex
=  主要功能: 词法分析器
=  修改日期: 2006.10
=  作者    : shen beide
====================================================================*/

#if !defined(_LEX_H)
#define _LEX_H

#include "PubHeader.h"
#include "Lexbase.h"
#include "ObArray.h"
#include "StringObject.h"

/*
#define _LEX_DEBUG_ON_
*/

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// 基本定义规则说明: 字符(集)或SubRULE + 重复次数定义 
//               
// \.  	 匹配除换行符外的任何字符，“.”等于是字符集[^\n\r](Windows)或[^\n](Unix)的简写。
// \b    匹配一个单词的开始或结尾   匹配一个单词边界，也就是指单词和空格间的位置。
// ^ 	 匹配字符串的开始           作为多行匹配的时候，匹配行的开始 '\r\n'或'\n'
// $ 	 匹配字符串的结尾           作为多行匹配的时候，匹配行的结束。
// \d 	 任何数字
// \c    字母([a-zA-Z]) 
//
// *     重复任意次数            
// +     重复一次或多次 
// ?     重复一次或0次 
// <n>   重复n次 
// <n,m> 重复最少n次，最多m次 
// <n,>  重复最少n次 
//
// ( )   分组操作符。
// [ ]   范围之内, 字符集定义(支持标准的256个ASCII字符), 除]\和第一个字符是^$, 其他的字符不需要转义
// [^]	 范围之外, 除]\之外的字符不需要转义
// |     选择(或)
//
// {name}     引用已定义规则
// {name Id}  引用已定义规则, 并将匹配串以Token中的ParamList[Id]返回
//            如果整个规则集未有自定义的ParamId,则系统默认会将最外层的子分组和引用按顺序返回      
//
// {?[ ]}     遇到范围之内的字符前时停止(范围定义同上), 否则继续, 该定义不支持重复(因为必然为死循环)
// {?=str}    在str前停止, 该定义不支持重复(因为必然为死循环), str必须存在
// {?>str}    在str后停止, 该定义不支持重复, str必须存在
//            尽量采用以上连续分析模式,可以有效减少递归分支数目,提高速度
//
//
// 注: 1. ^$*+?<[()|{} 是具有特殊含义的字符, 
//                     
//                     如果要使用其原始含义,需要在字符前加上转义符\\
//
//        [ ] 中除了首字符可能为^(表示非包含), 以上的特殊含义字符不再具有特殊意义, 但对-(){}需要加上\\
//            可以含\\d \\c \\. 方便表示一个集合
//
//     2. 在*+?<>重复次数定义后, 若再加?, 表示非贪婪模式(重复次数尽可能少)
//
////////////////////////////////////////////////////////////////////////////


//////////////////////////////////////////////////////////////
// Explicit instantiations of the container templates that the classes below
// use as data members: TObArray<LexState> (LexRules), and
// CObjectPool_<MatchPattern> / TObArray<MatchPattern> (CLex).
// They are emitted only when _DLL_PROJECT is defined and are tagged with
// CLASS_EXPORT.
// NOTE(review): presumably this exports the instantiations from the DLL so
// that users of the exported classes can link against them - confirm.
#ifdef _DLL_PROJECT
template class CLASS_EXPORT TObArray<LexState>;
template class CLASS_EXPORT CObjectPool_<MatchPattern>;
template class CLASS_EXPORT TObArray<MatchPattern>;  
#endif

//////////////////////////////////////////////////////////////
// LexRules: holds the rule set used by the lexer.
// Keeps a list of rule entry states, a list of all LexState objects, three
// option flags and the text of the last error. Apart from GetLastError(),
// every member function is only declared here.
// CLASS_EXPORT is applied to the class when _DLL_PROJECT is defined.
class
#ifdef _DLL_PROJECT
    CLASS_EXPORT
#endif
    LexRules
{
public:
    LexRules();
    ~LexRules();

    // Rule registration; the three-argument overload additionally takes a
    // rule name.
    bool AddRule(long Id, char* szRule);
    bool AddRule(long Id, char* szRuleName, char* szRule);
    int  GetRuleNum();

    // Rule lookup by numeric id or by name.
    // NOTE(review): pId (optional, defaults to NULL) presumably receives the
    // id of the rule found by name - confirm against the implementation.
    LexState* FoundRule(long Id);
    LexState* FoundRule(char* szName, long* pId = NULL);

    bool Optimize();

    // Dump helpers: Dump() takes a file name (bOverWrite defaults to true);
    // DumpRule() takes a rule id and a string to fill.
    bool Dump(char* filename, bool bOverWrite = true);
    bool DumpRule(long Id, OTSTR& strRule);

    void Clear();

    // Gives access to the stored last-error string.
    OTSTR& GetLastError()
    {
        return m_strLastError;
    }

private:
    // Internal helpers used while building rules.
    // NOTE(review): the reference-to-pointer parameters suggest the helpers
    // advance the caller's cursor through the rule text - confirm.
    LexState* BuildRule(long Id, char* szRule);
    LexState* NewLexState();

    CharSet*         GetCharSet(char*& szRule);
    RepeatAttribute* GetRepeatAttribute(char*& szRule);
    char             GetChar(const char*& pFrom, bool& bIsEscape);
    long             GetLong(const char*& pFrom);

    bool DumpRule(LexState* pEntry, OTSTR& strRule);
    bool CheckRule(LexState* pEntry, bool ParamList[]);

private:
    // LexState and CLex access the private members directly.
    friend class LexState;
    friend class CLex;

    TObArray<LexState> m_RuleEntryList;
    TObArray<LexState> m_StateObjectList;   // kept to make deletion convenient

    bool m_bHasCustomReturnParam;
    bool m_bEnableLineBoundCheck;
    bool m_bEnableWordBoundCheck;

    OTSTR m_strLastError;
};

//////////////////////////////////////////////////////////////
// LexToken: a token record.
// Stores the token id (the rule id), a name, a location, a length and a table
// of parameter locations. CLex is a friend and accesses the private members
// directly.
// CLASS_EXPORT is applied to the class when _DLL_PROJECT is defined.
class
#ifdef _DLL_PROJECT
    CLASS_EXPORT
#endif
    LexToken : public TObject
{
public:
    // Token ids reserved by the system.
    enum
    {
        TOKEN_INVALID = -1,
        TOKEN_UNMATCH = 0
    };

private:
    long     m_TokenId;        // RuleId
    OTSTR    m_TokenName;
    Location m_TokenLocation;
    long     m_TokenLen;

    int      m_ParamNum;
    // Second dimension: {begin (relative position), len}; initial: -1,-1
    long     m_ParamLocatList[MAX_LEXPARAM_ID + 1][2];

public:
    LexToken();
    LexToken(long TokenId, char* name, long namelen = -1);

    // ---- accessors ----
    long getTokenId()
    {
        return m_TokenId;
    }

    OTSTR& getName()
    {
        return m_TokenName;
    }

    Location& getLocation()
    {
        return m_TokenLocation;
    }

    long getTokenLen()
    {
        return m_TokenLen;
    }

    int getParamNum()
    {
        return m_ParamNum;
    }

    bool getParam(long ParamId, OTSTR& strRet);

    // True when this token's id equals TokenId.
    bool isToken(long TokenId)
    {
        return m_TokenId == TokenId;
    }

    // ---- mutators (defined out of line) ----
    void set(long TokenId, char* name, long namelen = -1);
    void setTokenId(long TokenId);
    void setLocation(long locat, long nline, long ncolumn = -1);
    void setLocation(Location& locat);
    void setTokenLen(long len);

    void operator=(const LexToken& right);
    void clone(const LexToken& right);

protected:
    friend class CLex;
};

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// CLex: the lexical analyzer.
// Works on a character buffer (m_pbuf / m_buflen) with a rule set supplied
// through setRules(), and hands out LexToken objects via GetToken().
// GramParser is a friend. CLASS_EXPORT is applied when _DLL_PROJECT is defined.
class
#ifdef _DLL_PROJECT
    CLASS_EXPORT
#endif
    CLex
{
public:
    enum { UNRETURN_RULE_BEGIN_ID=1000, // not returned as a Token by GetToken: a valid token, but ignored
           INTERNAL_RULE_BEGIN_ID=2000  // internal sub-flow (invoked by other flows), not a lexical-analysis entry point
    };   

    // Result codes of GetToken().
    enum { GET_TOKEN_EOF=0, GET_TOKEN_SUCC=1, GET_TOKEN_FAIL=-1 };

public:
    CLex();
    virtual ~CLex();

    // Default implementations return an empty string.
    // A string literal is a const char array and C++11 removed its implicit
    // conversion to char*, so the cast is written explicitly; this keeps the
    // legacy char* signature that overriding classes must match.
    // Callers must not write through the returned pointer.
    virtual char* GetName() { return const_cast<char*>(""); }
    virtual char* GetPath() { return const_cast<char*>(""); }
    virtual char* GetAbsoluteFilename() { return const_cast<char*>(""); }
    
    void  setRules(LexRules* pLexRules);
    bool  FoundRule(long  Id);
    bool  FoundRule(char* szName,long* pId=NULL);
    
    void  Enable_OverlapMatch(bool bEnableOverlapMatch=true);    // otherwise every GetToken clears MatchPatternList
    void  Enable_ReturnUnMatchedContent(bool bReturnUnMatchedContent=true);  // match and unmatch(TokenType=Other)
    void  Enable_CaseSensitive(bool bCaseSensitive=true);
    void  SetWordBoundMode(int mode);  //  mode=0  the boundary between (0x20 <ch< 0x7F) and other characters counts as a word bound
                                       //      =1  the boundary between (0x00<=ch<=0x20) and other characters counts as a word bound; used for Chinese text

    void  setLocatOffset(Location& locat);
    Location& getLocatOffset();
    
    void  setBuffer(char* pbuf, long buflen=-1);
    char* getBuffer() { return m_pbuf;   }
    long  getBufLen() { return m_buflen; }

    virtual void Reset(bool bResetBuffer);
    
    /////////////////////////////////////////////////////////////////////
    // Params: bJustPreview=false  Get & move out next Token
    //         bJustPreview=true   Just read next Token (not move out)
    virtual int  GetToken(LexToken& Token, bool bJustPreview=false);   
    virtual int  GetToken(LexToken& Token, WORD nExFlag, bool bJustPreview);
    virtual bool GetSpecificToken(LexToken& Token, long TokenId); // 0<TokenId
    
    bool  Fetch_string(char* szLeftOperator, char* szRightOperator, bool bStartWithLeftOperator, // takes paired () [] into account; the current buf must begin with LeftOperator
                       bool bReturnBetween, OTSTR& strReturn);  // bReturnBetween decides whether the returned string includes the Left & Right Operator
                                                                // bStartWithLeftOperator is only effective when szLeftOperator!=NULL
    bool  ForwardLocatTo(long nLocat);

    virtual Location& getLocation() { return m_location; }
    virtual bool  setLocation(Location& locat);

    // Gives access to the stored last-error string.
    OTSTR& GetLastError()   { return m_strLastError;  }
    
private:
    int   GetToken_(LexToken& Token, long RuleId, bool bJustPreview);   

    bool  TryFrom(Location& Locat, long RuleId=-1); // breadth-first
    bool  PriTryFrom(Location& Locat);              // depth-first
    bool  InputChar (long nCurLocat,short eChar,bool& bContinue);
    bool  Transition(long nCurLocat,short eChar,bool bCaseSensitive=true,short eCharReverse=0);

    void  DumpStack(char* filename,bool bOverWrite=true);
    
    bool  isWordBound(unsigned char charLeft,unsigned char charRight);
    short CharReverse(short eChar);

    void  Clear_MatchPatternList(TObArray<MatchPattern>& PatternList);

    void  CreateTokenFromPattern(MatchPattern& Pattern, LexToken& Token);

    ////////////////////////////////////////////
    CObjectPool_<MatchPattern>  m_MatchPatternPool;
    
    MatchPattern* New_MatchPattern(MatchPattern* pClonePattern=NULL);
    void  Delete_MatchPattern(MatchPattern* pPattern);
    
protected:
    enum { CHECK_TOP_STATE=300 };
    
    char*     m_pbuf;
    long      m_buflen;
    Location  m_location;
    
    LexRules* m_pLexRules;
    bool      m_bUserOptimizedMode;
    bool      m_bReturnUnMatchedContent;    // default: true
    bool      m_bEnableOverlapMatch;        // default: false
    bool      m_bCaseSensitive;             // default: true
    int       m_nWordBoundCheckMode;        // default: 1
    Location  m_locationOffset;

    OTSTR     m_strLastError;
    
    TObArray<MatchPattern>  m_MatchPatternList;  

    //////////////////////////////////
    friend class GramParser;
};

/////////////////////////////////////////////////////
/////////////////////////////////////////////////////
// LexPattern: bundles a LexRules object and a CLex object behind a small
// AddRule / Matching interface.
// Apart from Enable_CaseSensitive() and GetLastError(), every member function
// is only declared here.
// CLASS_EXPORT is applied to the class when _DLL_PROJECT is defined.
class
#ifdef _DLL_PROJECT
    CLASS_EXPORT
#endif
    LexPattern
{
public:
    LexPattern(char* szRule = NULL);
    ~LexPattern();

    // bMainAccess distinguishes an entry rule from a non-entry (internal) rule.
    bool AddRule(bool bMainAccess, char* szRule);
    bool AddRule(bool bMainAccess, char* szRuleName, char* szRule);
    void Clear();

    // Stores the case-sensitivity flag.
    void Enable_CaseSensitive(bool bCaseSensitive)
    {
        m_bCaseSensitive = bCaseSensitive;
    }

    // NOTE(review): lpParamArray (optional, defaults to NULL) presumably
    // receives the matched parameter strings - confirm against the
    // implementation.
    bool Matching(char* szString, OtstrArray* lpParamArray = NULL);

    // Gives access to the stored last-error string.
    OTSTR& GetLastError()
    {
        return m_strLastError;
    }

protected:
    LexRules m_Rules;
    CLex     m_Lex;
    bool     m_bCaseSensitive;   // default: true

    OTSTR    m_strLastError;

private:
    long     m_InternalRuleCounter;
};


#endif
// ⌨️ 快捷键说明

// 复制代码 Ctrl + C
// 搜索代码 Ctrl + F
// 全屏模式 F11
// 切换主题 Ctrl + Shift + D
// 显示快捷键 ?
// 增大字号 Ctrl + =
// 减小字号 Ctrl + -