📄 tokenizer.java
字号:
/* * Tokenizer.java * * This work is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * This work is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * * As a special exception, the copyright holders of this library give * you permission to link this library with independent modules to * produce an executable, regardless of the license terms of these * independent modules, and to copy and distribute the resulting * executable under terms of your choice, provided that you also meet, * for each linked independent module, the terms and conditions of the * license of that module. An independent module is a module which is * not derived from or based on this library. If you modify this * library, you may extend this exception to your version of the * library, but you are not obligated to do so. If you do not wish to * do so, delete this exception statement from your version. * * Copyright (c) 2003 Per Cederberg. All rights reserved. */package net.percederberg.grammatica.parser;import java.io.IOException;import java.io.Reader;import java.util.ArrayList;import net.percederberg.grammatica.parser.re.CharBuffer;import net.percederberg.grammatica.parser.re.RegExp;import net.percederberg.grammatica.parser.re.Matcher;import net.percederberg.grammatica.parser.re.RegExpException;/** * A character stream tokenizer. This class groups the characters read * from the stream together into tokens ("words"). 
The grouping is
 * controlled by token patterns that contain either a fixed string to
 * search for, or a regular expression. If the stream of characters
 * doesn't match any of the token patterns, a parse exception is thrown.
 *
 * @author Per Cederberg, <per at percederberg dot net>
 * @version 1.4
 */
public class Tokenizer {

    /**
     * The token list feature flag. When set, next() links every
     * token it reads to the token read before it.
     */
    private boolean useTokenList = false;

    /**
     * The string token matcher. This token matcher is used for all
     * string token patterns. This matcher implements a DFA to
     * provide maximum performance.
     */
    private StringTokenMatcher stringMatcher = new StringTokenMatcher();

    /**
     * The list of all regular expression token matchers. These
     * matchers each test matches for a single regular expression.
     * The elements are RegExpTokenMatcher instances, stored in the
     * order in which their patterns were added.
     */
    private ArrayList regexpMatchers = new ArrayList();

    /**
     * The input stream to read from. When this is set to null, no
     * further input is available.
     */
    private Reader input = null;

    /**
     * The buffer with previously read characters. Characters are
     * appended in blocks to this buffer by readInput(). Characters
     * before the current position are not removed for every token;
     * readInput() discards them only once the position has passed
     * 1024.
     */
    private CharBuffer buffer = new CharBuffer();

    /**
     * The current position in the buffer. This is the index at
     * which the next token match is attempted.
     */
    private int position = 0;

    /**
     * The line number of the character at the current position,
     * i.e. the start line given to the next token. It is updated
     * from the end line of each token found, and incremented when
     * an unmatched line break is skipped.
     */
    private int line = 1;

    /**
     * The column number of the character at the current position,
     * i.e. the start column given to the next token. It is updated
     * from the end column of each token found.
     */
    private int column = 1;

    /**
     * The end of buffer read flag. This flag is set if the end of
     * the buffer was encountered while matching token patterns, in
     * which case more input is read before matching again.
     */
    private boolean endOfBuffer = false;

    /**
     * The previous token in the token list. Only updated while the
     * token list feature is in use.
     */
    private Token previousToken = null;

    /**
     * Creates a new tokenizer for the specified input stream.
* * @param input the input stream to read */ public Tokenizer(Reader input) { this.input = input; } /** * Checks if the token list feature is used. The token list * feature makes all tokens (including ignored tokens) link to * each other in a linked list. By default the token list feature * is not used. * * @return true if the token list feature is used, or * false otherwise * * @see #setUseTokenList * @see Token#getPreviousToken * @see Token#getNextToken * * @since 1.4 */ public boolean getUseTokenList() { return useTokenList; } /** * Sets the token list feature flag. The token list feature makes * all tokens (including ignored tokens) link to each other in a * linked list when active. By default the token list feature is * not used. * * @param useTokenList the token list feature flag * * @see #getUseTokenList * @see Token#getPreviousToken * @see Token#getNextToken * * @since 1.4 */ public void setUseTokenList(boolean useTokenList) { this.useTokenList = useTokenList; } /** * Returns a description of the token pattern with the specified * id. * * @param id the token pattern id * * @return the token pattern description, or * null if not present */ public String getPatternDescription(int id) { TokenPattern pattern; RegExpTokenMatcher re; pattern = stringMatcher.getPattern(id); if (pattern != null) { return pattern.toShortString(); } for (int i = 0; i < regexpMatchers.size(); i++) { re = (RegExpTokenMatcher) regexpMatchers.get(i); if (re.getPattern().getId() == id) { return re.getPattern().toShortString(); } } return null; } /** * Returns the current line number. This number will be the line * number of the next token returned. * * @return the current line number */ public int getCurrentLine() { return line; } /** * Returns the current column number. This number will be the * column number of the next token returned. * * @return the current column number */ public int getCurrentColumn() { return column; } /** * Adds a new token pattern to the tokenizer. 
The pattern will be * added last in the list, choosing a previous token pattern in * case two matches the same string. * * @param pattern the pattern to add * * @throws ParserCreationException if the pattern couldn't be * added to the tokenizer */ public void addPattern(TokenPattern pattern) throws ParserCreationException { switch (pattern.getType()) { case TokenPattern.STRING_TYPE: stringMatcher.addPattern(pattern); break; case TokenPattern.REGEXP_TYPE: try { regexpMatchers.add(new RegExpTokenMatcher(pattern)); } catch (RegExpException e) { throw new ParserCreationException( ParserCreationException.INVALID_TOKEN_ERROR, pattern.getName(), "regular expression contains error(s): " + e.getMessage()); } break; default: throw new ParserCreationException( ParserCreationException.INVALID_TOKEN_ERROR, pattern.getName(), "pattern type " + pattern.getType() + " is undefined"); } } /** * Finds the next token on the stream. This method will return * null when end of file has been reached. It will return a parse * exception if no token matched the input stream, or if a token * pattern with the error flag set matched. Any tokens matching a * token pattern with the ignore flag set will be silently ignored * and the next token will be returned. * * @return the next token found, or * null if end of file was encountered * * @throws ParseException if the input stream couldn't be read or * parsed correctly */ public Token next() throws ParseException { Token token = null; do { token = nextToken(); if (useTokenList && token != null) { token.setPreviousToken(previousToken); previousToken = token; } if (token == null) { return null; } else if (token.getPattern().isError()) { throw new ParseException( ParseException.INVALID_TOKEN_ERROR, token.getPattern().getErrorMessage(), token.getStartLine(), token.getStartColumn()); } else if (token.getPattern().isIgnore()) { token = null; } } while (token == null); return token; } /** * Finds the next token on the stream. 
This method will return * null when end of file has been reached. It will return a parse * exception if no token matched the input stream. * * @return the next token found, or * null if end of file was encountered * * @throws ParseException if the input stream couldn't be read or * parsed correctly */ private Token nextToken() throws ParseException { TokenMatcher m; Token token; String str; ParseException e; // Find longest matching string do { if (endOfBuffer) { readInput(); endOfBuffer = false; } m = findMatch(); } while (endOfBuffer && input != null); // Return token results if (m != null) { str = buffer.substring(position, position + m.getMatchedLength()); token = new Token(m.getMatchedPattern(), str, line, column); position += m.getMatchedLength(); line = token.getEndLine(); column = token.getEndColumn() + 1; return token; } else if (position >= buffer.length()) { return null; } else { e = new ParseException( ParseException.UNEXPECTED_CHAR_ERROR, String.valueOf(buffer.charAt(position)), line, column); if (buffer.charAt(position) == '\n') { line++; column = 1; } else { column++; } position++; throw e; } } /** * Reads characters from the input stream and appends them to the * input buffer. This method is safe to call even though the end * of file has been reached. 
As a side effect, this method may * also remove * * @throws ParseException if an error was encountered while * reading the input stream */ private void readInput() throws ParseException { char chars[] = new char[4096]; int length; // Check for end of file if (input == null) { return; } // Remove old characters from buffer if (position > 1024) { buffer.delete(0, position); position = 0; } // Read characters try { length = input.read(chars); } catch (IOException e) { input = null; throw new ParseException(ParseException.IO_ERROR, e.getMessage(), -1, -1); } // Append characters to buffer if (length > 0) { buffer.append(chars, 0, length); } if (length < chars.length) { try { input.close(); } catch (IOException e) { // Do nothing } input = null; } } /** * Finds the longest token match from the current buffer position. * This method will return the token matcher for the best match, * or null if no match was found. As a side effect, this method * will also set the end of buffer flag. * * @return the token mathcher with the longest match, or * null if no match was found */ private TokenMatcher findMatch() { TokenMatcher bestMatch = null; int bestLength = 0; RegExpTokenMatcher re; // Check string matches if (stringMatcher.matchFrom(position)) { bestMatch = stringMatcher; bestLength = bestMatch.getMatchedLength(); } if (stringMatcher.hasReadEndOfString()) { endOfBuffer = true; } // Check regular expression matches for (int i = 0; i < regexpMatchers.size(); i++) { re = (RegExpTokenMatcher) regexpMatchers.get(i); if (re.matchFrom(position) && re.getMatchedLength() > bestLength) { bestMatch = re; bestLength = bestMatch.getMatchedLength(); } if (re.hasReadEndOfString()) { endOfBuffer = true; } } return bestMatch; } /** * Returns a string representation of this object. The returned * string will contain the details of all the token patterns * contained in this tokenizer. 
*
     * @return a detailed string representation
     */
    public String toString() {
        StringBuffer text = new StringBuffer();
        int index = 0;

        // String patterns first, then each regular expression
        // matcher in the order it was added
        text.append(stringMatcher);
        while (index < regexpMatchers.size()) {
            text.append(regexpMatchers.get(index));
            index++;
        }
        return text.toString();
    }

    /**
     * A token pattern matcher. This class is the base class for the
     * two types of token matchers that exist. The token matcher
     * checks for matches with the tokenizer buffer, and maintains the
     * state of the last match.
     */
    private abstract class TokenMatcher {

        /**
         * Returns the latest matched token pattern.
         *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -