tokenizerimpl.java

来自「这是java 开发的的免费语音播放插件,很值得学习参考!!!!!!!!!!!!1」· Java 代码 · 共 430 行
JAVA
430 行
/** * Portions Copyright 2001 Sun Microsystems, Inc. * Portions Copyright 1999-2001 Language Technologies Institute,  * Carnegie Mellon University. * All Rights Reserved.  Use is subject to license terms. *  * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL  * WARRANTIES. */package com.sun.speech.freetts.en;import com.sun.speech.freetts.Token;import com.sun.speech.freetts.Tokenizer;import java.io.Reader;import java.io.IOException;/** * Implements the tokenizer interface. Breaks an input sequence of * characters into a set of tokens. */public class TokenizerImpl implements Tokenizer {            /** A constant indicating that the end of the stream has been read. */    public static final int EOF = -1;        /** A string containing the default whitespace characters. */    public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";        /** A string containing the default single characters. */    public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";        /** A string containing the default pre-punctuation characters. */    public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";        /** A string containing the default post-punctuation characters. */    public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS 	= "\"'`.,:;!?(){}[]";    // the line number    private int lineNumber = 0;        // the input text (from the Utterance) to tokenize    private String inputText = null;    // the file to read input text from, if using file mode    private Reader reader = null;    // the token position - doesn't seem really necessary at this point    // private int tokenPosition = 0;    // the current character, whether its from the file or the input text    private int currentChar = 0;        // the current char position for the input text (not the file)    // this is called "file_pos" in flite    private int currentPosition = 0;            // the delimiting symbols of this tokenizer    private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;    private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;    private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;    private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;    // The error description    private String errorDescription = null;        // a place to store the current token    private Token token;    private Token lastToken = null;    // for timing    private long duration = 0;            /**     * Constructs a Tokenizer.     */    public TokenizerImpl() {    }    /**     * Creates a tokenizer that will return tokens from     * the given string.     *     * @param string the string to tokenize     */    public TokenizerImpl(String string) {	setInputText(string);    }    /**     * Creates a tokenizer that will return tokens from     * the given file.     *     * @param file where to read the input from     */    public TokenizerImpl(Reader file) {	setInputReader(file);    }    /**     * Sets the whitespace symbols of this Tokenizer to the given symbols.     *     * @param symbols the whitespace symbols     */    public void setWhitespaceSymbols(String symbols) {	whitespaceSymbols = symbols;    }            /**     * Sets the single character symbols of this Tokenizer to the given     * symbols.     *     * @param symbols the single character symbols     */    public void setSingleCharSymbols(String symbols) {	singleCharSymbols = symbols;    }            /**     * Sets the prepunctuation symbols of this Tokenizer to the given     * symbols.     *     * @param symbols the prepunctuation symbols     */    public void setPrepunctuationSymbols(String symbols) {	prepunctuationSymbols = symbols;    }            /**     * Sets the postpunctuation symbols of this Tokenizer to the given     * symbols.     *     * @param symbols the postpunctuation symbols     */    public void setPostpunctuationSymbols(String symbols) {	postpunctuationSymbols = symbols;    }        /**     * Sets the text to tokenize.      *     * @param  inputString  the string to tokenize     */    public void setInputText(String inputString) {	inputText = inputString;	currentPosition = 0;		if (inputText != null) {	    getNextChar();	}    }    /**     * Sets the input reader     *     * @param  reader the input source     */    public void setInputReader(Reader reader) {	this.reader = reader;	getNextChar();    }        /**     * Returns the next token.     *     * @return  the next token if it exists,     *          <code>null</code> if no more tokens     */    public Token getNextToken() {	lastToken = token;	token = new Token();		// Skip whitespace	token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));		// quoted strings currently ignored		// get prepunctuation	token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));		// get the symbol itself	if (singleCharSymbols.indexOf(currentChar) != -1) {	    token.setWord(String.valueOf((char) currentChar));	    getNextChar();	} else {	    token.setWord(getTokenNotOfCharClass(whitespaceSymbols));	}	token.setPosition(currentPosition);	token.setLineNumber(lineNumber);		// This'll have token *plus* postpunctuation	// Get postpunctuation	removeTokenPostpunctuation();		return token;    }        /**     * Returns <code>true</code> if there are more tokens,     * 		<code>false</code> otherwise.     *     * @return <code>true</code> if there are more tokens     *         <code>false</code> otherwise     */    public boolean hasMoreTokens() {	int nextChar = currentChar;	return (nextChar != EOF);    }        /**     * Advances the currentPosition pointer by 1 (if not exceeding     * length of inputText, and returns the character pointed by     * currentPosition.     *     * @return the next character EOF if no more characters exist     */    private int getNextChar() {	if (reader != null) {	    try {	        int readVal  = reader.read();		if (readVal == -1) {		    currentChar = EOF;		} else {		    currentChar = (char) readVal;		}	    } catch (IOException ioe) {		currentChar = EOF;		errorDescription = ioe.getMessage();	    }	} else if (inputText != null) {	    if (currentPosition < inputText.length()) {		currentChar = (int) inputText.charAt(currentPosition);	    } else {		currentChar = EOF;	    }	}	if (currentChar != EOF) {	    currentPosition++;	}	if (currentChar == '\n') {	    lineNumber++;	}	return currentChar;    }        /**     * Starting from the current position of the input text,     * returns the subsequent characters of type charClass,     * and not of type singleCharSymbols.     *     * @param  charClass  the type of characters to look for     * @param  buffer  the place to append characters of type charClass     *     * @return  a string of characters starting from the current position     *          of the input text, until it encounters a character not     *          in the string charClass     *     */    private String getTokenOfCharClass(String charClass) {	return getTokenByCharClass(charClass, true);    }    /**     * Starting from the current position of the input text/file,     * returns the subsequent characters, not of type singleCharSymbols,     * and ended at characters of type endingCharClass.  E.g., if the current     * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass     * "abc". Then this method will return to "xxxx".     *     * @param  endingCharClass  the type of characters to look for     *     * @return  a string of characters from the current position until     *          it encounters characters in endingCharClass     *     */    private String getTokenNotOfCharClass(String endingCharClass) {	return getTokenByCharClass(endingCharClass, false);    }        /**     * Provides a `compressed' method from getTokenOfCharClass() and      * getTokenNotOfCharClass().     * If parameter containThisCharClass is <code>true</code>,      * then a string from the     * current position to the last character in charClass is returned.     * If containThisCharClass is <code>false</code>, then a string      * before the first     * occurrence of a character in containThisCharClass is returned.     *     * @param  charClass  the string of characters you want included or     *                    excluded in your return     * @param  containThisCharClass  determines if you want characters     *                in charClass in the returned string or not     *     * @return  a string of characters from the current position until     *          it encounters characters in endingCharClass     */    private String getTokenByCharClass(String charClass,                                        boolean containThisCharClass) {		StringBuffer buffer = new StringBuffer();		// if we want the returned string to contain chars in charClass, then	// containThisCharClass is TRUE and	// (charClass.indexOf(currentChar) != 1) == containThisCharClass)	// returns true; if we want it to stop at characters of charClass,	// then containThisCharClass is FALSE, and the condition returns	// false.	while ((charClass.indexOf(currentChar) != -1)	       == containThisCharClass  &&	       singleCharSymbols.indexOf(currentChar) == -1 &&	       currentChar != EOF) {	    buffer.append((char) currentChar);	    getNextChar();	}	return buffer.toString();    }        /**     * Removes the postpunctuation characters from the current token.     * Copies those postpunctuation characters to the class     * variable 'postpunctuation'.     */    private void removeTokenPostpunctuation() {	if (token != null) {	    String tokenWord = token.getWord();	    int tokenLength = tokenWord.length();	    int position = tokenLength - 1;	    	    while (position > 0 &&		   postpunctuationSymbols.indexOf		   ((int)tokenWord.charAt(position)) != -1) {		position--;	    }	    	    if (tokenLength - 1 != position) {		// Copy postpunctuation from token		token.setPostpunctuation( tokenWord.substring(position+1));				// truncate token at postpunctuation		token.setWord(tokenWord.substring(0, position+1));	    } else {		token.setPostpunctuation("");	    }	}    }    /**     * Returns <code>true</code> if there were errors while reading tokens     *     * @return <code>true</code> if there were errors;     * 		<code>false</code> otherwise     */    public boolean hasErrors() {	return errorDescription != null;    }    /**     * if hasErrors returns <code>true</code>, this will return a      * description of the error encountered, otherwise     * it will return <code>null</code>     *     * @return a description of the last error that occurred.     */    public String getErrorDescription() {	return errorDescription;    }    /**     * Determines if the current token should start a new sentence.     *     * @return <code>true</code> if a new sentence should be started     */    public boolean isBreak() {	String tokenWhiteSpace = token.getWhitespace();	String lastTokenPostpunctuation = null;	if (lastToken != null) {	    lastTokenPostpunctuation = lastToken.getPostpunctuation();	}		if (lastToken == null || token == null) {	    return false;	} else if (tokenWhiteSpace.indexOf('\n') !=		   tokenWhiteSpace.lastIndexOf('\n')) {	    return true;	} else if (lastTokenPostpunctuation.indexOf(':') != -1 ||		   lastTokenPostpunctuation.indexOf('?') != -1 ||		   lastTokenPostpunctuation.indexOf('!') != -1) {	    return true;    	} else if (lastTokenPostpunctuation.indexOf('.') != -1 &&		   tokenWhiteSpace.length() > 1 &&		   Character.isUpperCase(token.getWord().charAt(0))) {	    return true;    	} else {	    String lastWord = lastToken.getWord();	    int lastWordLength = lastWord.length();	    if (lastTokenPostpunctuation.indexOf('.') != -1 &&		/* next word starts with a capital */		Character.isUpperCase(token.getWord().charAt(0)) &&		/* last word isn't an abbreviation */		!(Character.isUpperCase		  (lastWord.charAt(lastWordLength - 1)) ||		  (lastWordLength < 4 &&		   Character.isUpperCase(lastWord.charAt(0))))) {		return true;	    }	}	return false;    }}
tokenizerimpl.java - 源码说明

本页面展示了「这是java 开发的的免费语音播放插件,很值得学习参考!!!!!!!!!!!!111」中的 tokenizerimpl.java 源码文件，采用 Java 编程语言编写，共 430 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?