📄 tokenizerimpl.java
字号:
/** * Portions Copyright 2001 Sun Microsystems, Inc. * Portions Copyright 1999-2001 Language Technologies Institute, * Carnegie Mellon University. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. */package com.sun.speech.freetts.en;import com.sun.speech.freetts.Token;import com.sun.speech.freetts.Tokenizer;import java.io.Reader;import java.io.IOException;/** * Implements the tokenizer interface. Breaks an input sequence of * characters into a set of tokens. */public class TokenizerImpl implements Tokenizer { /** A constant indicating that the end of the stream has been read. */ public static final int EOF = -1; /** A string containing the default whitespace characters. */ public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r"; /** A string containing the default single characters. */ public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]"; /** A string containing the default pre-punctuation characters. */ public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({["; /** A string containing the default post-punctuation characters. */ public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS = "\"'`.,:;!?(){}[]"; // the line number private int lineNumber = 0; // the input text (from the Utterance) to tokenize private String inputText = null; // the file to read input text from, if using file mode private Reader reader = null; // the token position - doesn't seem really necessary at this point // private int tokenPosition = 0; // the current character, whether its from the file or the input text private int currentChar = 0; // the current char position for the input text (not the file) // this is called "file_pos" in flite private int currentPosition = 0; // the delimiting symbols of this tokenizer private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS; private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS; private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS; private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS; // The error description private String errorDescription = null; // a place to store the current token private Token token; private Token lastToken = null; // for timing private long duration = 0; /** * Constructs a Tokenizer. */ public TokenizerImpl() { } /** * Creates a tokenizer that will return tokens from * the given string. * * @param string the string to tokenize */ public TokenizerImpl(String string) { setInputText(string); } /** * Creates a tokenizer that will return tokens from * the given file. * * @param file where to read the input from */ public TokenizerImpl(Reader file) { setInputReader(file); } /** * Sets the whitespace symbols of this Tokenizer to the given symbols. * * @param symbols the whitespace symbols */ public void setWhitespaceSymbols(String symbols) { whitespaceSymbols = symbols; } /** * Sets the single character symbols of this Tokenizer to the given * symbols. * * @param symbols the single character symbols */ public void setSingleCharSymbols(String symbols) { singleCharSymbols = symbols; } /** * Sets the prepunctuation symbols of this Tokenizer to the given * symbols. * * @param symbols the prepunctuation symbols */ public void setPrepunctuationSymbols(String symbols) { prepunctuationSymbols = symbols; } /** * Sets the postpunctuation symbols of this Tokenizer to the given * symbols. * * @param symbols the postpunctuation symbols */ public void setPostpunctuationSymbols(String symbols) { postpunctuationSymbols = symbols; } /** * Sets the text to tokenize. * * @param inputString the string to tokenize */ public void setInputText(String inputString) { inputText = inputString; currentPosition = 0; if (inputText != null) { getNextChar(); } } /** * Sets the input reader * * @param reader the input source */ public void setInputReader(Reader reader) { this.reader = reader; getNextChar(); } /** * Returns the next token. * * @return the next token if it exists, * <code>null</code> if no more tokens */ public Token getNextToken() { lastToken = token; token = new Token(); // Skip whitespace token.setWhitespace(getTokenOfCharClass(whitespaceSymbols)); // quoted strings currently ignored // get prepunctuation token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols)); // get the symbol itself if (singleCharSymbols.indexOf(currentChar) != -1) { token.setWord(String.valueOf((char) currentChar)); getNextChar(); } else { token.setWord(getTokenNotOfCharClass(whitespaceSymbols)); } token.setPosition(currentPosition); token.setLineNumber(lineNumber); // This'll have token *plus* postpunctuation // Get postpunctuation removeTokenPostpunctuation(); return token; } /** * Returns <code>true</code> if there are more tokens, * <code>false</code> otherwise. * * @return <code>true</code> if there are more tokens * <code>false</code> otherwise */ public boolean hasMoreTokens() { int nextChar = currentChar; return (nextChar != EOF); } /** * Advances the currentPosition pointer by 1 (if not exceeding * length of inputText, and returns the character pointed by * currentPosition. * * @return the next character EOF if no more characters exist */ private int getNextChar() { if (reader != null) { try { int readVal = reader.read(); if (readVal == -1) { currentChar = EOF; } else { currentChar = (char) readVal; } } catch (IOException ioe) { currentChar = EOF; errorDescription = ioe.getMessage(); } } else if (inputText != null) { if (currentPosition < inputText.length()) { currentChar = (int) inputText.charAt(currentPosition); } else { currentChar = EOF; } } if (currentChar != EOF) { currentPosition++; } if (currentChar == '\n') { lineNumber++; } return currentChar; } /** * Starting from the current position of the input text, * returns the subsequent characters of type charClass, * and not of type singleCharSymbols. * * @param charClass the type of characters to look for * @param buffer the place to append characters of type charClass * * @return a string of characters starting from the current position * of the input text, until it encounters a character not * in the string charClass * */ private String getTokenOfCharClass(String charClass) { return getTokenByCharClass(charClass, true); } /** * Starting from the current position of the input text/file, * returns the subsequent characters, not of type singleCharSymbols, * and ended at characters of type endingCharClass. E.g., if the current * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass * "abc". Then this method will return to "xxxx". * * @param endingCharClass the type of characters to look for * * @return a string of characters from the current position until * it encounters characters in endingCharClass * */ private String getTokenNotOfCharClass(String endingCharClass) { return getTokenByCharClass(endingCharClass, false); } /** * Provides a `compressed' method from getTokenOfCharClass() and * getTokenNotOfCharClass(). * If parameter containThisCharClass is <code>true</code>, * then a string from the * current position to the last character in charClass is returned. * If containThisCharClass is <code>false</code>, then a string * before the first * occurrence of a character in containThisCharClass is returned. * * @param charClass the string of characters you want included or * excluded in your return * @param containThisCharClass determines if you want characters * in charClass in the returned string or not * * @return a string of characters from the current position until * it encounters characters in endingCharClass */ private String getTokenByCharClass(String charClass, boolean containThisCharClass) { StringBuffer buffer = new StringBuffer(); // if we want the returned string to contain chars in charClass, then // containThisCharClass is TRUE and // (charClass.indexOf(currentChar) != 1) == containThisCharClass) // returns true; if we want it to stop at characters of charClass, // then containThisCharClass is FALSE, and the condition returns // false. while ((charClass.indexOf(currentChar) != -1) == containThisCharClass && singleCharSymbols.indexOf(currentChar) == -1 && currentChar != EOF) { buffer.append((char) currentChar); getNextChar(); } return buffer.toString(); } /** * Removes the postpunctuation characters from the current token. * Copies those postpunctuation characters to the class * variable 'postpunctuation'. */ private void removeTokenPostpunctuation() { if (token != null) { String tokenWord = token.getWord(); int tokenLength = tokenWord.length(); int position = tokenLength - 1; while (position > 0 && postpunctuationSymbols.indexOf ((int)tokenWord.charAt(position)) != -1) { position--; } if (tokenLength - 1 != position) { // Copy postpunctuation from token token.setPostpunctuation( tokenWord.substring(position+1)); // truncate token at postpunctuation token.setWord(tokenWord.substring(0, position+1)); } else { token.setPostpunctuation(""); } } } /** * Returns <code>true</code> if there were errors while reading tokens * * @return <code>true</code> if there were errors; * <code>false</code> otherwise */ public boolean hasErrors() { return errorDescription != null; } /** * if hasErrors returns <code>true</code>, this will return a * description of the error encountered, otherwise * it will return <code>null</code> * * @return a description of the last error that occurred. */ public String getErrorDescription() { return errorDescription; } /** * Determines if the current token should start a new sentence. * * @return <code>true</code> if a new sentence should be started */ public boolean isBreak() { String tokenWhiteSpace = token.getWhitespace(); String lastTokenPostpunctuation = null; if (lastToken != null) { lastTokenPostpunctuation = lastToken.getPostpunctuation(); } if (lastToken == null || token == null) { return false; } else if (tokenWhiteSpace.indexOf('\n') != tokenWhiteSpace.lastIndexOf('\n')) { return true; } else if (lastTokenPostpunctuation.indexOf(':') != -1 || lastTokenPostpunctuation.indexOf('?') != -1 || lastTokenPostpunctuation.indexOf('!') != -1) { return true; } else if (lastTokenPostpunctuation.indexOf('.') != -1 && tokenWhiteSpace.length() > 1 && Character.isUpperCase(token.getWord().charAt(0))) { return true; } else { String lastWord = lastToken.getWord(); int lastWordLength = lastWord.length(); if (lastTokenPostpunctuation.indexOf('.') != -1 && /* next word starts with a capital */ Character.isUpperCase(token.getWord().charAt(0)) && /* last word isn't an abbreviation */ !(Character.isUpperCase (lastWord.charAt(lastWordLength - 1)) || (lastWordLength < 4 && Character.isUpperCase(lastWord.charAt(0))))) { return true; } } return false; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -