// tokenizer.java
// (code-viewer header: font size control)
package org.ofbiz.rules.parse.tokens;
import java.io.*;
/**
* <p><b>Title:</b> Tokenizer
* <p><b>Description:</b> None
* <p>Copyright (c) 1999 Steven J. Metsker.
* <p>Copyright (c) 2001 The Open For Business Project - www.ofbiz.org
*
* <p>Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* <p>The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* <p>THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* <br>
* A tokenizer divides a string into tokens. This class is
* highly customizable with regard to exactly how this division
* occurs, but it also has defaults that are suitable for many
* languages. This class assumes that the character values read
* from the string lie in the range 0-255. For example, the
* Unicode value of a capital A is 65, so
* <code> System.out.println((char)65); </code> prints out a
* capital A.
*
* <p>
* The behavior of a tokenizer depends on its character state
* table. This table is an array of 256 <code>TokenizerState
* </code> states. The state table decides which state to
* enter upon reading a character from the input
* string.
*
* <p>
* For example, by default, upon reading an 'A', a tokenizer
* will enter a "word" state. This means the tokenizer will
* ask a <code>WordState</code> object to consume the 'A',
* along with the characters after the 'A' that form a word.
* The state's responsibility is to consume characters and
* return a complete token.
*
* <p>
* The default table sets a SymbolState for every character
* from 0 to 255, and then overrides this with:
*
* <blockquote><pre>
*     From    To      State
*       0     ' '     whitespaceState
*      'a'    'z'     wordState
*      'A'    'Z'     wordState
*     0xc0    0xff    wordState
*      '0'    '9'     numberState
*      '-'    '-'     numberState
*      '.'    '.'     numberState
*      '"'    '"'     quoteState
*     '\''    '\''    quoteState
*      '/'    '/'     slashState
* </pre></blockquote>
*
* In addition to allowing modification of the state table,
* this class makes each of the states above available. Some
* of these states are customizable. For example, wordState
* allows customization of what characters can be part of a
* word, after the first character.
*
* @author Steven J. Metsker
* @version 1.0
*/
public class Tokenizer {

    /**
     * The reader to read characters from. It stays <code>null</code>
     * until {@link #setReader} or one of the <code>setString</code>
     * methods has been called.
     */
    protected PushbackReader reader;

    /**
     * The default number of characters that might be in a symbol;
     * used as the pushback buffer size by {@link #setString(String)}.
     */
    protected static final int DEFAULT_SYMBOL_MAX = 4;

    /**
     * The state lookup table, indexed by character value.
     */
    protected TokenizerState[] characterState =
        new TokenizerState[256];

    /**
     * The default states that actually consume text and
     * produce a token
     */
    protected NumberState numberState = new NumberState();
    protected QuoteState quoteState = new QuoteState();
    protected SlashState slashState = new SlashState();
    protected SymbolState symbolState = new SymbolState();
    protected WhitespaceState whitespaceState =
        new WhitespaceState();
    protected WordState wordState = new WordState();

    /**
     * Constructs a tokenizer with a default state table: every
     * character from 0 to 255 maps to the symbol state, and the
     * ranges below then override that default. No reader is set;
     * call {@link #setReader} or <code>setString</code> before
     * asking for tokens.
     */
    public Tokenizer() {
        setCharacterState(0, 255, symbolState()); // the default

        setCharacterState(0, ' ', whitespaceState());
        setCharacterState('a', 'z', wordState());
        setCharacterState('A', 'Z', wordState());
        setCharacterState(0xc0, 0xff, wordState());
        setCharacterState('0', '9', numberState());
        setCharacterState('-', '-', numberState());
        setCharacterState('.', '.', numberState());
        setCharacterState('"', '"', quoteState());
        setCharacterState('\'', '\'', quoteState());
        setCharacterState('/', '/', slashState());
    }

    /**
     * Constructs a tokenizer with the default state table that
     * reads from the supplied string.
     *
     * @param s the string to read from
     */
    public Tokenizer(String s) {
        this();
        setString(s);
    }

    /**
     * Return the reader this tokenizer will read from.
     *
     * @return the reader this tokenizer will read from, or
     *         <code>null</code> if none has been set yet
     */
    public PushbackReader getReader() {
        return reader;
    }

    /**
     * Return the next token. One character is read from the
     * reader and the state registered for that character in the
     * state table is asked to build the token.
     *
     * @return the next token, or <code>Token.EOF</code> when the
     *         character read is negative (end of input) or lies
     *         outside the state table
     *
     * @throws IOException if there is any problem reading
     * @throws NullPointerException if no reader has been set
     */
    public Token nextToken() throws IOException {
        int c = reader.read();

        /* There was a defect here, that resulted from the fact
         * that unreading a -1 results in the next read having a
         * value of (int)(char)-1, which is 65535. This may be
         * a defect in PushbackReader. The upper-bound check below
         * makes such a value (and any other character beyond the
         * table) come back as EOF instead of indexing out of
         * bounds. */
        if (c >= 0 && c < characterState.length) {
            return characterState[c].nextToken(reader, c, this);
        }
        return Token.EOF;
    }

    /**
     * Return the state this tokenizer uses to build numbers.
     *
     * @return the state this tokenizer uses to build numbers
     */
    public NumberState numberState() {
        return numberState;
    }

    /**
     * Return the state this tokenizer uses to build quoted
     * strings.
     *
     * @return the state this tokenizer uses to build quoted
     *         strings
     */
    public QuoteState quoteState() {
        return quoteState;
    }

    /**
     * Change the state the tokenizer will enter upon reading
     * any character between "from" and "to" (both inclusive).
     * Positions that fall outside the state table are ignored,
     * and nothing happens when "from" is greater than "to".
     *
     * @param from the "from" character
     *
     * @param to the "to" character
     *
     * @param state the state to enter upon reading a
     *              character between "from" and "to"
     */
    public void setCharacterState(
        int from, int to, TokenizerState state) {

        // Clamp the range to the table before looping. Filtering
        // inside an unclamped loop never terminates when "to" is
        // Integer.MAX_VALUE (the counter overflows and stays
        // <= to) and wastes time on huge out-of-table ranges.
        int first = Math.max(from, 0);
        int last = Math.min(to, characterState.length - 1);

        for (int i = first; i <= last; i++) {
            characterState[i] = state;
        }
    }

    /**
     * Set the reader to read from.
     *
     * @param r the reader to read from
     */
    public void setReader(PushbackReader r) {
        this.reader = r;
    }

    /**
     * Set the string to read from, using a pushback buffer of
     * {@link #DEFAULT_SYMBOL_MAX} characters.
     *
     * @param s the string to read from
     */
    public void setString(String s) {
        setString(s, DEFAULT_SYMBOL_MAX);
    }

    /**
     * Set the string to read from. Any previously set reader is
     * replaced by a new one over the given string.
     *
     * @param s the string to read from
     *
     * @param symbolMax the maximum length of a symbol, which
     *                  establishes the size of pushback buffer
     *                  we need
     */
    public void setString(String s, int symbolMax) {
        setReader(
            new PushbackReader(new StringReader(s), symbolMax));
    }

    /**
     * Return the state this tokenizer uses to recognize
     * (and ignore) comments.
     *
     * @return the state this tokenizer uses to recognize
     *         (and ignore) comments
     */
    public SlashState slashState() {
        return slashState;
    }

    /**
     * Return the state this tokenizer uses to recognize
     * symbols.
     *
     * @return the state this tokenizer uses to recognize
     *         symbols
     */
    public SymbolState symbolState() {
        return symbolState;
    }

    /**
     * Return the state this tokenizer uses to recognize (and
     * ignore) whitespace.
     *
     * @return the state this tokenizer uses to recognize
     *         whitespace
     */
    public WhitespaceState whitespaceState() {
        return whitespaceState;
    }

    /**
     * Return the state this tokenizer uses to build words.
     *
     * @return the state this tokenizer uses to build words
     */
    public WordState wordState() {
        return wordState;
    }
}
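// (code-viewer footer, not part of the source) Keyboard shortcuts:
// Copy code
// Ctrl + C
// Search code
// Ctrl + F
// Full-screen mode
// F11
// Toggle theme
// Ctrl + Shift + D
// Show shortcuts
// ?
// Increase font size
// Ctrl + =
// Decrease font size
// Ctrl + -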