tokenizer.cs

来自「Grammatica is a C# and Java parser gener」· CS 代码 · 共 898 行 · 第 1/2 页
898 行
/* * Tokenizer.cs * * This work is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * This work is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * * As a special exception, the copyright holders of this library give * you permission to link this library with independent modules to * produce an executable, regardless of the license terms of these * independent modules, and to copy and distribute the resulting * executable under terms of your choice, provided that you also meet, * for each linked independent module, the terms and conditions of the * license of that module. An independent module is a module which is * not derived from or based on this library. If you modify this * library, you may extend this exception to your version of the * library, but you are not obligated to do so. If you do not wish to * do so, delete this exception statement from your version. * * Copyright (c) 2003 Per Cederberg. All rights reserved. */using System.Collections;using System.IO;using System.Text;using PerCederberg.Grammatica.Parser.RE;namespace PerCederberg.Grammatica.Parser {    /**     * A character stream tokenizer. This class groups the characters read      * from the stream together into tokens ("words"). The grouping is     * controlled by token patterns that contain either a fixed string to     * search for, or a regular expression. 
If the stream of characters
     * doesn't match any of the token patterns, a parse exception is thrown.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.4
     */
    public class Tokenizer {

        /**
         * The number of characters requested from the input stream in
         * each read operation.
         */
        private const int READ_BLOCK_SIZE = 4096;

        /**
         * The number of consumed characters that must have accumulated
         * at the start of the buffer before they are discarded.
         */
        private const int TRIM_THRESHOLD = 1024;

        /**
         * The token list feature flag.
         */
        private bool useTokenList = false;

        /**
         * The string token matcher. This token matcher is used for all
         * string token patterns. This matcher implements a DFA to
         * provide maximum performance.
         */
        private readonly StringTokenMatcher stringMatcher =
            new StringTokenMatcher();

        /**
         * The list of all regular expression token matchers. These
         * matchers each test matches for a single regular expression.
         */
        private readonly ArrayList regexpMatchers = new ArrayList();

        /**
         * The input stream to read from. When this is set to null, no
         * further input is available.
         */
        private TextReader input = null;

        /**
         * The buffer with previously read characters. Normally characters
         * are appended in blocks to this buffer, and the characters of
         * tokens already returned are removed from its start when more
         * input is read.
         */
        private readonly StringBuilder buffer = new StringBuilder();

        /**
         * The current position in the string buffer.
         */
        private int position = 0;

        /**
         * The line number of the character at the current buffer
         * position. This value is updated when reading past line breaks.
         */
        private int line = 1;

        /**
         * The column number of the character at the current buffer
         * position. This value is updated for every token read.
         */
        private int column = 1;

        /**
         * The end of buffer read flag. This flag is set if the end of
         * the buffer was encountered while matching token patterns.
         */
        private bool endOfBuffer = false;

        /**
         * The previous token in the token list.
         */
        private Token previousToken = null;

        /**
         * Creates a new tokenizer for the specified input stream.
         *
         * @param input          the input stream to read
         */
        public Tokenizer(TextReader input) {
            this.input = input;
        }

        /**
         * Checks if the token list feature is used. The token list
         * feature makes all tokens (including ignored tokens) link to
         * each other in a linked list. By default the token list feature
         * is not used.
         *
         * @return true if the token list feature is used, or
         *         false otherwise
         *
         * @see #SetUseTokenList
         * @see Token#getPreviousToken
         * @see Token#getNextToken
         *
         * @since 1.4
         */
        public bool GetUseTokenList() {
            return useTokenList;
        }

        /**
         * Sets the token list feature flag. The token list feature makes
         * all tokens (including ignored tokens) link to each other in a
         * linked list when active. By default the token list feature is
         * not used.
         *
         * @param useTokenList   the token list feature flag
         *
         * @see #GetUseTokenList
         * @see Token#getPreviousToken
         * @see Token#getNextToken
         *
         * @since 1.4
         */
        public void SetUseTokenList(bool useTokenList) {
            this.useTokenList = useTokenList;
        }

        /**
         * Returns a description of the token pattern with the
         * specified id. The string patterns are searched first, then
         * the regular expression patterns in the order they were added.
         *
         * @param id             the token pattern id
         *
         * @return the token pattern description, or
         *         null if not present
         */
        public string GetPatternDescription(int id) {
            TokenPattern  pattern = stringMatcher.GetPattern(id);

            if (pattern != null) {
                return pattern.ToShortString();
            }
            foreach (RegExpTokenMatcher re in regexpMatchers) {
                if (re.GetPattern().GetId() == id) {
                    return re.GetPattern().ToShortString();
                }
            }
            return null;
        }

        /**
         * Returns the current line number. This number will be the line
         * number of the next token returned.
         *
         * @return the current line number
         */
        public int GetCurrentLine() {
            return line;
        }

        /**
         * Returns the current column number. This number will be the
         * column number of the next token returned.
         *
         * @return the current column number
         */
        public int GetCurrentColumn() {
            return column;
        }

        /**
         * Adds a new token pattern to the tokenizer. The pattern will be
         * added last in the list, so a previously added token pattern is
         * chosen in case two patterns match the same string.
         *
         * @param pattern        the pattern to add
         *
         * @throws ParserCreationException if the pattern couldn't be
         *             added to the tokenizer
         */
        public void AddPattern(TokenPattern pattern) {
            switch (pattern.GetPatternType()) {
            case TokenPattern.PatternType.STRING:
                stringMatcher.AddPattern(pattern);
                break;
            case TokenPattern.PatternType.REGEXP:
                try {
                    regexpMatchers.Add(new RegExpTokenMatcher(pattern));
                } catch (RegExpException e) {
                    throw new ParserCreationException(
                        ParserCreationException.ErrorType.INVALID_TOKEN,
                        pattern.GetName(),
                        "regular expression contains error(s): " +
                        e.Message);
                }
                break;
            default:
                throw new ParserCreationException(
                    ParserCreationException.ErrorType.INVALID_TOKEN,
                    pattern.GetName(),
                    "pattern type " + pattern.GetPatternType() +
                    " is undefined");
            }
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will throw a
         * parse exception if no token matched the input stream, or if
         * a token pattern with the error flag set matched. Any tokens
         * matching a token pattern with the ignore flag set will be
         * silently ignored and the next token will be returned.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        public Token Next() {
            Token  token;

            while (true) {
                token = NextToken();
                if (token == null) {
                    return null;
                }

                // Every token, also error and ignored ones, is linked
                // into the token list when the feature is active
                if (useTokenList) {
                    token.SetPreviousToken(previousToken);
                    previousToken = token;
                }

                if (token.GetPattern().IsError()) {
                    throw new ParseException(
                        ParseException.ErrorType.INVALID_TOKEN,
                        token.GetPattern().GetErrorMessage(),
                        token.GetStartLine(),
                        token.GetStartColumn());
                }
                if (!token.GetPattern().IsIgnore()) {
                    return token;
                }
            }
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will throw a
         * parse exception if no token matched the input stream. In
         * that case the offending character is skipped, so that a
         * subsequent call continues with the next character.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        private Token NextToken() {
            TokenMatcher    m;
            Token           token;
            string          str;
            ParseException  e;
            int             length;

            // Find longest matching string, reading more input for as
            // long as a matcher ran into the end of the buffer
            do {
                if (endOfBuffer) {
                    ReadInput();
                    endOfBuffer = false;
                }
                m = FindMatch();
            } while (endOfBuffer && input != null);

            // Return token results
            if (m != null) {
                length = m.GetMatchedLength();

                // Copy only the matched characters, not the whole buffer
                str = buffer.ToString(position, length);
                token = new Token(m.GetMatchedPattern(), str, line, column);
                position += length;
                line = token.GetEndLine();
                column = token.GetEndColumn() + 1;
                return token;
            } else if (position >= buffer.Length) {
                return null;
            } else {
                e = new ParseException(
                    ParseException.ErrorType.UNEXPECTED_CHAR,
                    buffer[position].ToString(),
                    line,
                    column);

                // Skip the unexpected character before reporting it
                if (buffer[position] == '\n') {
                    line++;
                    column = 1;
                } else {
                    column++;
                }
                position++;
                throw e;
            }
        }

        /**
         * Reads characters from the input stream and appends them to
         * the input buffer. This method is safe to call even though
         * the end of file has been reached. As a side effect, this
         * method may also remove already consumed characters from the
         * start of the buffer. The input stream is closed and set to
         * null once no more characters can be read from it.
         *
         * @throws ParseException if an error was encountered while
         *             reading the input stream
         */
        private void ReadInput() {
            char[]      chars;
            int         length;
            TextReader  failed;

            // Check for end of file
            if (input == null) {
                return;
            }

            // Remove old characters from buffer
            if (position > TRIM_THRESHOLD) {
                buffer.Remove(0, position);
                position = 0;
            }

            // Read characters
            chars = new char[READ_BLOCK_SIZE];
            try {
                length = input.Read(chars, 0, chars.Length);
            } catch (IOException e) {
                failed = input;
                input = null;
                try {
                    failed.Close();
                } catch (IOException) {
                    // Best effort only, the read error is reported below
                }
                throw new ParseException(ParseException.ErrorType.IO,
                                         e.Message,
                                         -1,
                                         -1);
            }

            // Append characters to buffer
            if (length > 0) {
                buffer.Append(chars, 0, length);
            } else {
                // Only a read returning no characters signals the end
                // of the stream. A short read may just mean that less
                // data was available at the moment.
                input.Close();
                input = null;
            }
        }

        /**
         * Finds the longest token match from the current buffer
         * position. This method will return the token matcher for the
         * best match, or null if no match was found. On matches of
         * equal length, the string matcher is preferred over the
         * regular expression matchers, and an earlier regular
         * expression over a later one. As a side effect, this method
         * will also set the end of buffer flag.
         *
         * @return the token matcher with the longest match, or
         *         null if no match was found
         */
        private TokenMatcher FindMatch() {
            TokenMatcher  bestMatch = null;
            int           bestLength = 0;
            string        str = buffer.ToString();

            // Check string matches
            if (stringMatcher.MatchFrom(str, position)) {
                bestMatch = stringMatcher;
                bestLength = bestMatch.GetMatchedLength();
            }
            if (stringMatcher.HasReadEndOfString()) {
                endOfBuffer = true;
            }

            // Check regular expression matches
            foreach (RegExpTokenMatcher re in regexpMatchers) {
                if (re.MatchFrom(str, position)
                 && re.GetMatchedLength() > bestLength) {

                    bestMatch = re;
                    bestLength = bestMatch.GetMatchedLength();
                }
                if (re.HasReadEndOfString()) {
                    endOfBuffer = true;
                }
            }
            return bestMatch;
        }

        /**
         * Returns a string representation of this object. The returned
         * string will contain the details of all the token patterns
         * contained in this tokenizer.
         *
         * @return a detailed string representation
         */
        public override string ToString() {
            StringBuilder  result = new StringBuilder();

            result.Append(stringMatcher);
            foreach (object matcher in regexpMatchers) {
                result.Append(matcher);
            }
            return result.ToString();
        }
    }

    /**
     * A token pattern matcher. This class is the base class for the
     * two types of token matchers that exist. 
The token matcher     * checks for matches with the tokenizer buffer, and maintains the     * state of the last match.     */    internal abstract class TokenMatcher {        /**         * Returns the latest matched token pattern.         *          * @return the latest matched token pattern, or         *         null if no match found         */
tokenizer.cs - 源码说明

本页面展示了「Grammatica is a C# and Java parser generator (compiler compiler). It improves upon similar tools (lik」中的 tokenizer.cs 源码文件，采用 CS 编程语言编写，共 898 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与compiler相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?