📄 lexiconimpl.java
字号:
/** * Portions Copyright 2001 Sun Microsystems, Inc. * Portions Copyright 1999-2001 Language Technologies Institute, * Carnegie Mellon University. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. */package com.sun.speech.freetts.lexicon;import com.sun.speech.freetts.util.Utilities;import com.sun.speech.freetts.util.BulkTimer;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.InputStream;import java.io.InputStreamReader;import java.io.IOException;import java.nio.channels.FileChannel;import java.nio.ByteBuffer;import java.nio.MappedByteBuffer;import java.net.MalformedURLException;import java.net.URL;import java.util.ArrayList;import java.util.Collections;import java.util.HashMap;import java.util.LinkedHashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;import java.util.StringTokenizer;/** * Provides an implementation of a Lexicon. * * <p>This implementation will either read from a straight ASCII file * or a binary file. When reading from an ASCII file, you can specify * when the input line is tokenized: load, lookup, or never. If you * specify 'load', the entire file will be parsed when it is loaded. * If you specify 'lookup', the file will be loaded, but the parsing * for each line will be delayed until it is referenced and the parsed * form will be saved away. If you specify 'never', the lines will * parsed each time they are referenced. The default is 'never'. To * specify the load type, set the system property as follows: * * <pre> * -Dcom.sun.speech.freetts.lexicon.LexTokenize=load * </pre> * * <p>If a binary file is used, you can also specify whether the new * IO package is used. The new IO package is new for JDK1.4, and can * greatly improve the speed of loading files. To enable new IO, use * the following system property (it is enabled by default): * * <pre> * -Dcom.sun.speech.freetts.useNewIO=true * </pre> * * <p>The implementation also allows users to define their own addenda * that will be used in addition to the system addenda. If the user * defines their own addenda, it values will be added to the system * addenda, overriding any existing elements in the system addenda. * To define a user addenda, the user needs to set the following * property: * * <pre> * -Dcom.sun.speeech.freetts.lexicon.userAddenda=<URLToUserAddenda> * </pre> * * Where <URLToUserAddenda> is a URL pointing to an ASCII file * containing addenda entries. * * <p>[[[TODO: support multiple homographs with the same part of speech.]]] */abstract public class LexiconImpl implements Lexicon { /** * If true, the phone string is replaced with the phone array in * the hashmap when the phone array is loaded. The side effects * of this are quicker lookups, but more memory usage and a longer * startup time. */ protected boolean tokenizeOnLoad = false; /** * If true, the phone string is replaced with the phone array in * the hashmap when the phone array is first looked up. The side effects * Set by cmufilelex.tokenize=lookup. */ protected boolean tokenizeOnLookup = false; /** * Magic number for binary Lexicon files. */ private final static int MAGIC = 0xBABB1E; /** * Current binary file version. */ private final static int VERSION = 1; /** * URL for the compiled form. */ private URL compiledURL; /** * URL for the addenda. */ private URL addendaURL; /** * URL for the letter to sound rules. */ private URL letterToSoundURL; /** * The addenda. */ private Map addenda; /** * The compiled lexicon. */ private Map compiled; /** * The LetterToSound rules. */ private LetterToSound letterToSound = null; /** * Parts of Speech. */ private ArrayList partsOfSpeech = new ArrayList(); /** * A static directory of compiledURL URL objects and associated * already-loaded compiled Map objects. This is used to share * the immutable compiled lexicons between lexicon instances. * As the addenda can be changed using <code>addAddendum()</code> * and <code>removeAddendum</code>, each lexicon instance has its * own addenda. */ private static Map loadedCompiledLexicons; /** * Loaded State of the lexicon */ private boolean loaded = false; /** * Type of lexicon to load */ private boolean binary = false; /** * No phones for this word. */ final static private String[] NO_PHONES = new String[0]; /** * Temporary place holder. */ private char charBuffer[] = new char[128]; /** * Use the new IO package? */ private boolean useNewIO = Utilities.getProperty("com.sun.speech.freetts.useNewIO", "true").equals("true"); /** * Create a new LexiconImpl by reading from the given URLS. * * @param compiledURL a URL pointing to the compiled lexicon * @param addendaURL a URL pointing to lexicon addenda * @param letterToSoundURL a LetterToSound to use if a word cannot * be found in the compiled form or the addenda * @param binary if <code>true</code>, the input streams are binary; * otherwise, they are text. */ public LexiconImpl(URL compiledURL, URL addendaURL, URL letterToSoundURL, boolean binary) { this(); setLexiconParameters(compiledURL, addendaURL, letterToSoundURL, binary); } /** * Class constructor for an empty Lexicon. */ public LexiconImpl() { // Find out when to convert the phone string into an array. // String tokenize = Utilities.getProperty("com.sun.speech.freetts.lexicon.LexTokenize", "never"); tokenizeOnLoad = tokenize.equals("load"); tokenizeOnLookup = tokenize.equals("lookup"); } /** * Sets the lexicon parameters * @param compiledURL a URL pointing to the compiled lexicon * @param addendaURL a URL pointing to lexicon addenda * @param letterToSoundURL a URL pointing to the LetterToSound to use * @param binary if <code>true</code>, the input streams are binary; * otherwise, they are text. */ protected void setLexiconParameters(URL compiledURL, URL addendaURL, URL letterToSoundURL, boolean binary) { this.compiledURL = compiledURL; this.addendaURL = addendaURL; this.letterToSoundURL = letterToSoundURL; this.binary = binary; } /** * Determines if this lexicon is loaded. * * @return <code>true</code> if the lexicon is loaded */ public boolean isLoaded() { return loaded; } /** * Loads the data for this lexicon. If the * * @throws IOException if errors occur during loading */ public void load() throws IOException { BulkTimer.LOAD.start("Lexicon"); if (compiledURL == null) { throw new IOException("Can't load lexicon"); } if (addendaURL == null) { throw new IOException("Can't load lexicon addenda " ); } if (loadedCompiledLexicons == null) { loadedCompiledLexicons = new HashMap(); } if (!loadedCompiledLexicons.containsKey(compiledURL)) { InputStream compiledIS = Utilities.getInputStream(compiledURL); if (compiledIS == null) { throw new IOException("Can't load lexicon from " + compiledURL); } Map newCompiled = createLexicon(compiledIS, binary, 65000); loadedCompiledLexicons.put(compiledURL, newCompiled); compiledIS.close(); } assert loadedCompiledLexicons.containsKey(compiledURL); compiled = Collections.unmodifiableMap((Map)loadedCompiledLexicons.get(compiledURL)); InputStream addendaIS = Utilities.getInputStream(addendaURL); if (addendaIS == null) { throw new IOException("Can't load lexicon addenda from " + addendaURL); } // [[[TODO: what is the best way to derive the estimated sizes?]]] // addenda = createLexicon(addendaIS, binary, 50); addendaIS.close(); /* Load the user-defined addenda and override any existing * entries in the system addenda. */ String userAddenda = Utilities.getProperty( "com.sun.speech.freetts.lexicon.userAddenda", null); if (userAddenda != null) { try { URL userAddendaURL = new URL(userAddenda); InputStream userAddendaIS = Utilities.getInputStream( userAddendaURL); if (userAddendaIS == null) { throw new IOException("Can't load user addenda from " + userAddenda); } Map tmpAddenda = createLexicon(userAddendaIS, false, 50); userAddendaIS.close(); for (Iterator keys = tmpAddenda.keySet().iterator(); keys.hasNext();) { Object key = keys.next(); addenda.put(key, tmpAddenda.get(key)); } } catch (MalformedURLException e) { throw new IOException("User addenda URL is malformed: " + userAddenda); } } loaded = true; BulkTimer.LOAD.stop("Lexicon"); letterToSound = new LetterToSoundImpl(letterToSoundURL, binary); } /** * Reads the given input stream as lexicon data and returns the * results in a <code>Map</code>. * * @param is the input stream * @param binary if <code>true</code>, the data is binary * @param estimatedSize the estimated size of the lexicon * * @throws IOException if errors are encountered while reading the data */ protected Map createLexicon(InputStream is, boolean binary, int estimatedSize) throws IOException { if (binary) { if (useNewIO && is instanceof FileInputStream) { FileInputStream fis = (FileInputStream) is; return loadMappedBinaryLexicon(fis, estimatedSize); } else { DataInputStream dis = new DataInputStream( new BufferedInputStream(is)); return loadBinaryLexicon(dis, estimatedSize); } } else { return loadTextLexicon(is, estimatedSize); } } /** * Reads the given input stream as text lexicon data and returns the * results in a <code>Map</code>. * * @param is the input stream * @param estimatedSize the estimated number of entries of the lexicon * * @throws IOException if errors are encountered while reading the data */ protected Map loadTextLexicon(InputStream is, int estimatedSize) throws IOException { Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line; line = reader.readLine(); while (line != null) { if (!line.startsWith("***")) { parseAndAdd(lexicon, line); } line = reader.readLine(); } return lexicon; } /** * Creates a word from the given input line and add it to the lexicon. * * @param lexicon the lexicon * @param line the input text */ protected void parseAndAdd(Map lexicon, String line) { StringTokenizer tokenizer = new StringTokenizer(line,"\t"); String phones = null; String wordAndPos = tokenizer.nextToken(); String pos = wordAndPos.substring(wordAndPos.length() - 1); if (!partsOfSpeech.contains(pos)) { partsOfSpeech.add(pos); } if (tokenizer.hasMoreTokens()) { phones = tokenizer.nextToken(); } if ((phones != null) && (tokenizeOnLoad)) { lexicon.put(wordAndPos, getPhones(phones)); } else if (phones == null) { lexicon.put(wordAndPos, NO_PHONES); } else { lexicon.put(wordAndPos, phones); } } /** * Gets the phone list for a given word. If a phone list cannot * be found, returns <code>null</code>. The format is lexicon * dependent. If the part of speech does not matter, pass in * <code>null</code>. * * @param word the word to find * @param partOfSpeech the part of speech * * @return the list of phones for word or <code>null</code> */ public String[] getPhones(String word, String partOfSpeech) { return getPhones(word, partOfSpeech, true); } /** * Gets the phone list for a given word. If a phone list cannot * be found, <code>null</code> is returned. The * <code>partOfSpeech</code> is implementation dependent, but * <code>null</code> always matches. * * @param word the word to find * @param partOfSpeech the part of speech or <code>null</code> * @param useLTS whether to use the letter-to-sound rules when * the word is not in the lexicon. * * @return the list of phones for word or null */ public String[] getPhones (String word, String partOfSpeech, boolean useLTS){ String[] phones = null; phones = getPhones(addenda, word, partOfSpeech); if (phones == null) { phones = getPhones(compiled, word, partOfSpeech); } if(useLTS){ if (phones == null && letterToSound != null) { phones = letterToSound.getPhones(word, partOfSpeech); } } if(phones != null){ String[] copy = new String[phones.length];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -