📄 tokenizer.cs
字号:
/*
 * Tokenizer.cs
 *
 * This work is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This work is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * As a special exception, the copyright holders of this library give
 * you permission to link this library with independent modules to
 * produce an executable, regardless of the license terms of these
 * independent modules, and to copy and distribute the resulting
 * executable under terms of your choice, provided that you also meet,
 * for each linked independent module, the terms and conditions of the
 * license of that module. An independent module is a module which is
 * not derived from or based on this library. If you modify this
 * library, you may extend this exception to your version of the
 * library, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 *
 * Copyright (c) 2003 Per Cederberg. All rights reserved.
 */

using System.Collections;
using System.IO;
using System.Text;
using PerCederberg.Grammatica.Parser.RE;

namespace PerCederberg.Grammatica.Parser {

    /**
     * A character stream tokenizer. This class groups the characters read
     * from the stream together into tokens ("words"). The grouping is
     * controlled by token patterns that contain either a fixed string to
     * search for, or a regular expression. If the stream of characters
     * doesn't match any of the token patterns, a parse exception is thrown.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.4
     */
    public class Tokenizer {

        /**
         * The token list feature flag.
         */
        private bool useTokenList = false;

        /**
         * The string token matcher. This token matcher is used for all
         * string token patterns. This matcher implements a DFA to
         * provide maximum performance.
         */
        private StringTokenMatcher stringMatcher = new StringTokenMatcher();

        /**
         * The list of all regular expression token matchers. These
         * matchers each test matches for a single regular expression.
         */
        private ArrayList regexpMatchers = new ArrayList();

        /**
         * The input stream to read from. When this is set to null, no
         * further input is available.
         */
        private TextReader input = null;

        /**
         * The buffer with previously read characters. Normally characters
         * are appended in blocks to this buffer, and for every token that
         * is found, its characters are removed from the buffer.
         */
        private StringBuilder buffer = new StringBuilder();

        /**
         * The current position in the string buffer.
         */
        private int position = 0;

        /**
         * The line number of the first character in the buffer. This
         * value will be incremented when reading past line breaks.
         */
        private int line = 1;

        /**
         * The column number of the first character in the buffer. This
         * value will be updated for every character read.
         */
        private int column = 1;

        /**
         * The end of buffer read flag. This flag is set if the end of
         * the buffer was encountered while matching token patterns.
         */
        private bool endOfBuffer = false;

        /**
         * The previous token in the token list.
         */
        private Token previousToken = null;

        /**
         * Creates a new tokenizer for the specified input stream.
         *
         * @param input          the input stream to read
         */
        public Tokenizer(TextReader input) {
            this.input = input;
        }

        /**
         * Checks if the token list feature is used. The token list
         * feature makes all tokens (including ignored tokens) link to
         * each other in a linked list. By default the token list feature
         * is not used.
         *
         * @return true if the token list feature is used, or
         *         false otherwise
         *
         * @see #setUseTokenList
         * @see Token#getPreviousToken
         * @see Token#getNextToken
         *
         * @since 1.4
         */
        public bool GetUseTokenList() {
            return useTokenList;
        }

        /**
         * Sets the token list feature flag. The token list feature makes
         * all tokens (including ignored tokens) link to each other in a
         * linked list when active. By default the token list feature is
         * not used.
         *
         * @param useTokenList   the token list feature flag
         *
         * @see #getUseTokenList
         * @see Token#getPreviousToken
         * @see Token#getNextToken
         *
         * @since 1.4
         */
        public void SetUseTokenList(bool useTokenList) {
            this.useTokenList = useTokenList;
        }

        /**
         * Returns a description of the token pattern with the
         * specified id. The string matcher is consulted first, then
         * each regular expression matcher in the order it was added.
         *
         * @param id             the token pattern id
         *
         * @return the token pattern description, or
         *         null if not present
         */
        public string GetPatternDescription(int id) {
            TokenPattern        pattern;
            RegExpTokenMatcher  re;

            pattern = stringMatcher.GetPattern(id);
            if (pattern != null) {
                return pattern.ToShortString();
            }
            for (int i = 0; i < regexpMatchers.Count; i++) {
                re = (RegExpTokenMatcher) regexpMatchers[i];
                if (re.GetPattern().GetId() == id) {
                    return re.GetPattern().ToShortString();
                }
            }
            return null;
        }

        /**
         * Returns the current line number. This number will be the line
         * number of the next token returned.
         *
         * @return the current line number
         */
        public int GetCurrentLine() {
            return line;
        }

        /**
         * Returns the current column number. This number will be the
         * column number of the next token returned.
         *
         * @return the current column number
         */
        public int GetCurrentColumn() {
            return column;
        }

        /**
         * Adds a new token pattern to the tokenizer. The pattern will be
         * added last in the list, choosing a previous token pattern in
         * case two match the same string.
         *
         * @param pattern        the pattern to add
         *
         * @throws ParserCreationException if the pattern couldn't be
         *             added to the tokenizer
         */
        public void AddPattern(TokenPattern pattern) {
            switch (pattern.GetPatternType()) {
            case TokenPattern.PatternType.STRING:
                stringMatcher.AddPattern(pattern);
                break;
            case TokenPattern.PatternType.REGEXP:
                try {
                    regexpMatchers.Add(new RegExpTokenMatcher(pattern));
                } catch (RegExpException e) {
                    throw new ParserCreationException(
                        ParserCreationException.ErrorType.INVALID_TOKEN,
                        pattern.GetName(),
                        "regular expression contains error(s): " +
                        e.Message);
                }
                break;
            default:
                throw new ParserCreationException(
                    ParserCreationException.ErrorType.INVALID_TOKEN,
                    pattern.GetName(),
                    "pattern type " + pattern.GetPatternType() +
                    " is undefined");
            }
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will throw a
         * parse exception if no token matched the input stream, or if
         * a token pattern with the error flag set matched. Any tokens
         * matching a token pattern with the ignore flag set will be
         * silently ignored and the next token will be returned.
         *
         * When the token list feature is active, every token found
         * (including ignored and error tokens) is linked to the
         * previously found token before it is examined.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        public Token Next() {
            Token  token = null;

            do {
                token = NextToken();
                if (useTokenList && token != null) {
                    token.SetPreviousToken(previousToken);
                    previousToken = token;
                }
                if (token == null) {
                    return null;
                } else if (token.GetPattern().IsError()) {
                    throw new ParseException(
                        ParseException.ErrorType.INVALID_TOKEN,
                        token.GetPattern().GetErrorMessage(),
                        token.GetStartLine(),
                        token.GetStartColumn());
                } else if (token.GetPattern().IsIgnore()) {
                    // Ignored tokens are skipped; loop for the next one
                    token = null;
                }
            } while (token == null);

            return token;
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will throw a
         * parse exception if no token matched the input stream. In
         * that case the offending character is skipped before the
         * exception is thrown, so a subsequent call continues with the
         * character after it.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        private Token NextToken() {
            TokenMatcher    m;
            Token           token;
            string          str;
            ParseException  e;

            // Find longest matching string, reading more input for as
            // long as a matcher ran into the end of the buffer
            do {
                if (endOfBuffer) {
                    ReadInput();
                    endOfBuffer = false;
                }
                m = FindMatch();
            } while (endOfBuffer && input != null);

            // Return token results
            if (m != null) {
                // Extract only the matched characters instead of
                // copying the whole buffer to a string first
                str = buffer.ToString(position, m.GetMatchedLength());
                token = new Token(m.GetMatchedPattern(), str, line, column);
                position += m.GetMatchedLength();
                line = token.GetEndLine();
                column = token.GetEndColumn() + 1;
                return token;
            } else if (position >= buffer.Length) {
                return null;
            } else {
                e = new ParseException(
                    ParseException.ErrorType.UNEXPECTED_CHAR,
                    buffer[position].ToString(),
                    line,
                    column);
                // Step past the unexpected character before throwing
                if (buffer[position] == '\n') {
                    line++;
                    column = 1;
                } else {
                    column++;
                }
                position++;
                throw e;
            }
        }

        /**
         * Reads characters from the input stream and appends them to
         * the input buffer. This method is safe to call even though
         * the end of file has been reached. As a side effect, this
         * method may also remove already consumed characters from the
         * start of the buffer (resetting the buffer position to zero).
         *
         * The input stream is closed and set to null only when a read
         * returns no characters. A read that returns fewer characters
         * than requested is not treated as end of file, since a
         * TextReader may deliver partial blocks while more data is
         * still pending.
         *
         * @throws ParseException if an error was encountered while
         *             reading the input stream
         */
        private void ReadInput() {
            char[]  chars = new char[4096];
            int     length;

            // Check for end of file
            if (input == null) {
                return;
            }

            // Remove old characters from buffer
            if (position > 1024) {
                buffer.Remove(0, position);
                position = 0;
            }

            // Read characters
            try {
                length = input.Read(chars, 0, chars.Length);
            } catch (IOException e) {
                // No further reads are attempted after an I/O failure
                input = null;
                throw new ParseException(ParseException.ErrorType.IO,
                                         e.Message,
                                         -1,
                                         -1);
            }

            // Append characters to buffer
            if (length > 0) {
                buffer.Append(chars, 0, length);
            } else {
                // Only a zero-length read signals the end of the input
                input.Close();
                input = null;
            }
        }

        /**
         * Finds the longest token match from the current buffer
         * position. This method will return the token matcher for the
         * best match, or null if no match was found. A string match is
         * preferred over a regular expression match of the same length,
         * and an earlier regular expression over a later one. As a side
         * effect, this method will also set the end of buffer flag if
         * any matcher read to the end of the buffer.
         *
         * @return the token matcher with the longest match, or
         *         null if no match was found
         */
        private TokenMatcher FindMatch() {
            TokenMatcher        bestMatch = null;
            int                 bestLength = 0;
            RegExpTokenMatcher  re;
            string              str = buffer.ToString();

            // Check string matches
            if (stringMatcher.MatchFrom(str, position)) {
                bestMatch = stringMatcher;
                bestLength = bestMatch.GetMatchedLength();
            }
            if (stringMatcher.HasReadEndOfString()) {
                endOfBuffer = true;
            }

            // Check regular expression matches
            for (int i = 0; i < regexpMatchers.Count; i++) {
                re = (RegExpTokenMatcher) regexpMatchers[i];
                if (re.MatchFrom(str, position)
                 && re.GetMatchedLength() > bestLength) {

                    bestMatch = re;
                    bestLength = bestMatch.GetMatchedLength();
                }
                if (re.HasReadEndOfString()) {
                    endOfBuffer = true;
                }
            }
            return bestMatch;
        }

        /**
         * Returns a string representation of this object. The returned
         * string will contain the details of all the token patterns
         * contained in this tokenizer.
         *
         * @return a detailed string representation
         */
        public override string ToString() {
            // Named "result" to avoid shadowing the input buffer field
            StringBuilder  result = new StringBuilder();

            result.Append(stringMatcher);
            for (int i = 0; i < regexpMatchers.Count; i++) {
                result.Append(regexpMatchers[i]);
            }
            return result.ToString();
        }
    }

    /**
     * A token pattern matcher. This class is the base class for the
     * two types of token matchers that exist. The token matcher
     * checks for matches with the tokenizer buffer, and maintains the
     * state of the last match.
     */
    internal abstract class TokenMatcher {

        /**
         * Returns the latest matched token pattern.
         *
         * @return the latest matched token pattern, or
         *         null if no match found
         */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -