unicodereader.java

来自「具有不同语法高亮的编辑器实例」· Java 代码 · 共 311 行
JAVA
311 行
/*
 * 09/23/2004
 *
 * UnicodeReader.java - A reader for Unicode input streams that is capable of
 *                      discerning which particular encoding is being used via
 *                      the BOM.
 * Copyright (C) 2004 Robert Futrell
 * email@address.com
 * www.website.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.fife.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.Reader;


/**
 * A reader capable of identifying Unicode streams by their BOMs.  This class
 * will recognize the following encodings:
 * <ul>
 *   <li>UTF-8</li>
 *   <li>UTF-16LE</li>
 *   <li>UTF-16BE</li>
 *   <li>UTF-32LE</li>
 *   <li>UTF-32BE</li>
 * </ul>
 * If the stream is not found to be any of the above, then a default encoding
 * is used for reading.  The user can specify this default encoding, or a system
 * default will be used.<p>
 *
 * The byte sequence <code>FF FE 00 00</code> is both the UTF-32LE BOM and a
 * UTF-16LE BOM followed by a NUL character.  This class resolves the
 * ambiguity in favor of UTF-32LE, so the 4-byte BOMs are always tested before
 * the 2-byte ones.<p>
 *
 * For optimum performance, it is recommended that you wrap all instances of
 * <code>UnicodeReader</code> with a <code>java.io.BufferedReader</code>.<p>
 *
 * This class is mostly ripped off from the workaround in the description of
 * Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
public class UnicodeReader extends Reader {

	/**
	 * The size, in bytes, of the longest BOM we look for (the UTF-32 ones).
	 * This is also the size of the push-back buffer used while sniffing.
	 */
	private static final int BOM_SIZE = 4;

	/** The UTF-8 byte order mark. */
	private static final byte[] UTF8_BOM =
			{ (byte)0xEF, (byte)0xBB, (byte)0xBF };

	/** The UTF-16 big-endian byte order mark. */
	private static final byte[] UTF16BE_BOM =
			{ (byte)0xFE, (byte)0xFF };

	/** The UTF-16 little-endian byte order mark. */
	private static final byte[] UTF16LE_BOM =
			{ (byte)0xFF, (byte)0xFE };

	/** The UTF-32 big-endian byte order mark. */
	private static final byte[] UTF32BE_BOM =
			{ (byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF };

	/** The UTF-32 little-endian byte order mark. */
	private static final byte[] UTF32LE_BOM =
			{ (byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00 };

	/**
	 * The input stream from which we're really reading.
	 */
	private InputStreamReader internalIn = null;

	/**
	 * The encoding being used.  We keep our own instead of using the string
	 * returned by <code>java.io.InputStreamReader</code> since that class does
	 * not return user-friendly names.
	 */
	private String encoding;


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *                     BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *                     directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *                     checkRead method denies read access to the file.
	 */
	public UnicodeReader(String file) throws IOException,
							FileNotFoundException, SecurityException {
		this(new File(file));
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *                     BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *                     directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *                     checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file) throws IOException, FileNotFoundException,
									SecurityException {
		this(file, null);
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a specified default encoding is
	 * used.<p>
	 * The file stream is opened by this constructor, so it is also closed
	 * by this constructor if initialization fails (for example, because
	 * <code>defaultEncoding</code> is not a supported encoding).
	 *
	 * @param file The file from which you want to read.
	 * @param defaultEncoding The encoding to use if no BOM is found.  If this
	 *                        value is <code>null</code>, a system default is
	 *                        used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *                     BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *                     directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *                     checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file, String defaultEncoding)
								throws IOException,
								FileNotFoundException,
								SecurityException {
		FileInputStream fileIn = new FileInputStream(file);
		boolean initialized = false;
		try {
			init(fileIn, defaultEncoding);
			initialized = true;
		} finally {
			if (!initialized) {
				// Nobody else has a reference to this stream, so release it
				// here; otherwise the file handle would leak.
				try {
					fileIn.close();
				} catch (IOException ignored) {
					// Deliberately ignored so the exception that made
					// init() fail is the one that reaches the caller.
				}
			}
		}
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the stream;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param in The input stream from which to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *                     BOM.
	 */
	public UnicodeReader(InputStream in) throws IOException {
		this(in, null);
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the stream;
	 * if there is no recognized BOM, then <code>defaultEncoding</code> is
	 * used.<p>
	 * The caller keeps ownership of <code>in</code> if this constructor
	 * throws; the stream is not closed here on failure.
	 *
	 * @param in The input stream from which to read.
	 * @param defaultEncoding The encoding to use if no recognized BOM is
	 *                        found.  If this value is <code>null</code>, a
	 *                        system default is used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *                     BOM.
	 */
	public UnicodeReader(InputStream in, String defaultEncoding)
									throws IOException {
		init(in, defaultEncoding);
	}


	/**
	 * Closes this reader (and therefore the underlying input stream).
	 *
	 * @throws IOException If an I/O error occurs while closing.
	 */
	public void close() throws IOException {
		internalIn.close();
	}


	/**
	 * Returns the encoding being used to read this input stream (i.e., the
	 * encoding of the file).  If a BOM was recognized, then the specific
	 * Unicode type is returned; otherwise, either the default encoding passed
	 * into the constructor or the system default is returned.
	 *
	 * @return The encoding of the stream.
	 */
	public String getEncoding() {
		return encoding;
	}


	/**
	 * Reads ahead up to four bytes and checks them for a BOM.  Only the BOM
	 * bytes are consumed; any extra bytes that were read are pushed back so
	 * that they are decoded as content.<p>
	 * Note that this method is invoked from the constructors.
	 *
	 * @param in The input stream from which to read.
	 * @param defaultEncoding The encoding to use if no BOM was recognized.  If
	 *                        this value is <code>null</code>, then a system
	 *                        default is used.
	 * @throws IOException If an error occurs when trying to read a BOM, or
	 *                     if the encoding to use is not supported.
	 */
	protected void init(InputStream in, String defaultEncoding)
											throws IOException {

		PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);

		byte[] bom = new byte[BOM_SIZE];
		int n = readBomBytes(tempIn, bom);
		int bomLength;

		if (startsWith(bom, n, UTF8_BOM)) {
			encoding = "UTF-8";
			bomLength = UTF8_BOM.length;
		}

		// The 4-byte UTF-32 BOMs must be tested before the 2-byte UTF-16
		// ones: the UTF-32LE BOM begins with the UTF-16LE BOM, so testing
		// UTF-16LE first would make UTF-32LE undetectable.
		else if (startsWith(bom, n, UTF32BE_BOM)) {
			encoding = "UTF-32BE";
			bomLength = UTF32BE_BOM.length;
		}

		else if (startsWith(bom, n, UTF32LE_BOM)) {
			encoding = "UTF-32LE";
			bomLength = UTF32LE_BOM.length;
		}

		else if (startsWith(bom, n, UTF16BE_BOM)) {
			encoding = "UTF-16BE";
			bomLength = UTF16BE_BOM.length;
		}

		else if (startsWith(bom, n, UTF16LE_BOM)) {
			encoding = "UTF-16LE";
			bomLength = UTF16LE_BOM.length;
		}

		else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEncoding;
			bomLength = 0;
		}

		// Give back everything we read beyond the BOM itself.
		if (n > bomLength) {
			tempIn.unread(bom, bomLength, n - bomLength);
		}

		// Use given encoding
		if (encoding == null) {
			internalIn = new InputStreamReader(tempIn);
			encoding = internalIn.getEncoding(); // Get the default.
		}
		else {
			internalIn = new InputStreamReader(tempIn, encoding);
		}

	}


	/**
	 * Read characters into a portion of an array. This method will block until
	 * some input is available, an I/O error occurs, or the end of the stream
	 * is reached.
	 *
	 * @param cbuf The buffer into which to read.
	 * @param off The offset at which to start storing characters.
	 * @param len The maximum number of characters to read.
	 *
	 * @return The number of characters read, or <code>-1</code> if the end
	 *         of the stream has been reached.
	 * @throws IOException If an I/O error occurs.
	 */
	public int read(char[] cbuf, int off, int len) throws IOException {
		return internalIn.read(cbuf, off, len);
	}


	/**
	 * Fills <code>buf</code> from <code>in</code>, reading repeatedly until
	 * the buffer is full or the end of the stream is reached.  A single
	 * <code>read</code> call may legitimately return fewer bytes than
	 * requested, which would otherwise cause a BOM to be missed.
	 *
	 * @param in The stream to read from.
	 * @param buf The buffer to fill.
	 * @return The number of bytes actually stored in <code>buf</code>; this
	 *         is between <code>0</code> and <code>buf.length</code>.
	 * @throws IOException If an I/O error occurs.
	 */
	private static int readBomBytes(InputStream in, byte[] buf)
											throws IOException {
		int total = 0;
		while (total < buf.length) {
			int count = in.read(buf, total, buf.length - total);
			if (count <= 0) {
				// End of stream (or a stream that makes no progress).
				break;
			}
			total += count;
		}
		return total;
	}


	/**
	 * Returns whether the first <code>available</code> bytes of
	 * <code>data</code> begin with <code>prefix</code>.  Bytes at or beyond
	 * <code>available</code> were never read and are not considered.
	 *
	 * @param data The bytes that were read from the stream.
	 * @param available The number of valid bytes in <code>data</code>.
	 * @param prefix The BOM to look for.
	 * @return Whether <code>data</code> starts with <code>prefix</code>.
	 */
	private static boolean startsWith(byte[] data, int available,
										byte[] prefix) {
		if (available < prefix.length) {
			return false;
		}
		for (int i = 0; i < prefix.length; i++) {
			if (data[i] != prefix[i]) {
				return false;
			}
		}
		return true;
	}

}
unicodereader.java - 源码说明

本页面展示了「具有不同语法高亮的编辑器实例」中的 unicodereader.java 源码文件，采用 Java 编程语言编写，共 311 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与编辑器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?