📄 charsettoolkit.java

📁 easyweb的使用
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.easyjf.web.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Collection;

/**
 * <p>
 * Utility class to guess the encoding of a given byte array. The guess is
 * unfortunately not 100% reliable, especially for 8-bit charsets: short of
 * statistical analysis, it is not possible to know which 8-bit charset is
 * used. In that case we infer that the charset encountered is the same as the
 * default charset.
 * </p>
 * 
 * <p>
 * On the other hand, Unicode files encoded in UTF-16 (little or big endian) or
 * UTF-8 files with a Byte Order Mark (BOM) are easy to detect. For UTF-8 files
 * with no BOM, the encoding is easy to guess if the buffer is large enough.
 * </p>
 * 
 * <p>
 * Tested against a complicated UTF-8 file, Sun's implementation does not render
 * bad UTF-8 constructs as expected by the specification. But with a buffer
 * large enough, the method guessEncoding() did behave correctly and recognized
 * the UTF-8 charset.
 * </p>
 * 
 * <p>
 * A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
 * </p>
 * 
 * <p>
 * Usage:
 * </p>
 * 
 * <pre>
 * // guess the encoding 
 * Charset guessedCharset = com.easyjf.web.tools.CharsetToolkit.guessEncoding(
 * 		file, 4096);
 * 
 * // create a reader with the charset we've just discovered 
 * FileInputStream fis = new FileInputStream(file);
 * InputStreamReader isr = new InputStreamReader(fis, guessedCharset);
 * BufferedReader br = new BufferedReader(isr);
 * 
 * // read the file content 
 * String line;
 * while ((line = br.readLine()) != null) {
 * 	System.out.println(line);
 * }
 * </pre>
 * 
 * <p>
 * Date: 18 juil. 2002
 * </p>
 * 
 * @author Guillaume LAFORGE
 */
public class CharsetToolkit {
	/** Bytes whose encoding is to be guessed; assigned by the constructors. */
	private byte[] buffer;

	/**
	 * Fallback charset. Assigned by the constructors and by
	 * setDefaultCharset(), which substitutes getDefaultSystemCharset() when
	 * given null; exposed through getDefaultCharset().
	 */
	private Charset defaultCharset;

	/**
	 * Flag stored by setEnforce8Bit() and read back by getEnforce8Bit(). Per
	 * the setter's documentation, when set the default charset is preferred
	 * over US-ASCII. Off by default.
	 */
	private boolean enforce8Bit = false;

	/**
	 * Constructor of the <code>com.easyjf.web.tools.CharsetToolkit</code>
	 * utility class.
	 * 
	 * @param buffer
	 *            the byte buffer of which we want to know the encoding.
	 */
	public CharsetToolkit(byte[] buffer) {
		// No caller-supplied fallback: take whatever
		// getDefaultSystemCharset() reports.
		this.defaultCharset = getDefaultSystemCharset();
		// The array reference is stored as given; no copy is made.
		this.buffer = buffer;
	}

	/**
	 * Constructor of the <code>com.easyjf.web.tools.CharsetToolkit</code>
	 * utility class.
	 * 
	 * @param buffer
	 *            the byte buffer of which we want to know the encoding.
	 * @param defaultCharset
	 *            the default Charset to use in case an 8-bit charset is
	 *            recognized.
	 */
	public CharsetToolkit(byte[] buffer, Charset defaultCharset) {
		// The array reference is stored as given; no copy is made.
		this.buffer = buffer;
		// Resolve the fallback inline instead of going through the public,
		// overridable setDefaultCharset(): a subclass override would otherwise
		// be invoked on a partially constructed instance. The rule is the same
		// as the setter's: null means "use getDefaultSystemCharset()".
		this.defaultCharset = (defaultCharset != null)
				? defaultCharset
				: getDefaultSystemCharset();
	}

	/**
	 * Defines the default <code>Charset</code> used in case the buffer
	 * represents an 8-bit <code>Charset</code>.
	 * 
	 * @param defaultCharset
	 *            the default <code>Charset</code> to be returned by
	 *            <code>guessEncoding()</code> if an 8-bit
	 *            <code>Charset</code> is encountered.
	 */
	public void setDefaultCharset(Charset defaultCharset) {
		// A null argument means "no preference": fall back to whatever
		// getDefaultSystemCharset() supplies; otherwise keep the given charset.
		this.defaultCharset = (defaultCharset == null)
				? getDefaultSystemCharset()
				: defaultCharset;
	}

	/**
	 * If US-ASCII is recognized, enforce to return the default encoding, rather
	 * than US-ASCII. It might be a file without any special character in the
	 * range 128-255, but that may be or become a file encoded with the default
	 * <code>charset</code> rather than US-ASCII.
	 * 
	 * @param enforce
	 *            a boolean specifying the use or not of US-ASCII.
	 */
	public void setEnforce8Bit(boolean enforce) {
		// Only records the flag; nothing else is updated here.
		enforce8Bit = enforce;
	}

	/**
	 * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII
	 * encoding.
	 * 
	 * @return a boolean representing the flag of use of US-ASCII.
	 */
	public boolean getEnforce8Bit() {
		// Value last stored by setEnforce8Bit(); false if it was never called.
		return enforce8Bit;
	}

	/**
	 * Retrieves the default <code>Charset</code>.
	 * 
	 * @return the default <code>Charset</code> currently set on this instance.
	 */
	public Charset getDefaultCharset() {
		// Hands back the stored fallback charset unchanged.
		return this.defaultCharset;
	}

	/**
	 * <p>
	 * Guess the encoding of the provided buffer.
	 * </p>
	 * <p>
	 * If a Byte Order Mark (BOM) is encountered at the beginning of the buffer,
	 * we immediately return the charset implied by this BOM. Otherwise, the
	 * file would not be a human readable text file.
	 * </p>
	 * 
	 * <p>
	 * If there is no BOM, this method tries to discern whether the file is
	 * UTF-8 or not. If it is not UTF-8, we assume the encoding is the default
	 * system encoding (of course, it might be any 8-bit charset, but usually,
	 * an 8-bit charset is the default one).
	 * </p>
	 * 
	 * <p>
	 * It is possible to discern UTF-8 thanks to the pattern of characters with
	 * a multi-byte sequence.
	 * </p>
	 * 
	 * <pre>
	 *       
	 *       UCS-4 range (hex.)        UTF-8 octet sequence (binary) 
	 *       0000 0000-0000 007F       0xxxxxxx 
	 *       0000 0080-0000 07FF       110xxxxx 10xxxxxx 
	 *       0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx 
	 *       0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
	 *       0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
	 *       0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
	 * </pre>
	 * 
	 * <p>
	 * With UTF-8, 0xFE and 0xFF never appear.
	 * </p>
	 * 
	 * @return the Charset recognized.
	 */
	public Charset guessEncoding() {
		// if the file has a Byte Order Marker, we can assume the file is in
		// UTF-xx
		// otherwise, the file would not be human readable
		if (hasUTF8Bom(buffer))
			return Charset.forName("UTF-8");
		if (hasUTF16LEBom(buffer))
			return Charset.forName("UTF-16LE");
		if (hasUTF16BEBom(buffer))
			return Charset.forName("UTF-16BE");

		// if a byte has its most significant bit set, the file is in UTF-8 or
		// in the default encoding
		// otherwise, the file is in US-ASCII
		boolean highOrderBit = false;

		// if the file is in UTF-8, high order bytes must have a certain value,
		// in order to be valid
		// if it's not the case, we can assume the encoding is the default
		// encoding of the system
		boolean validU8Char = true;

		// TODO the buffer is not read up to the end, but up to length - 6

		int length = buffer.length;
		int i = 0;
		while (i < length - 6) {
			byte b0 = buffer[i];
			byte b1 = buffer[i + 1];
			byte b2 = buffer[i + 2];
			byte b3 = buffer[i + 3];
			byte b4 = buffer[i + 4];
			byte b5 = buffer[i + 5];
			if (b0 < 0) {
				// a high order bit was encountered, thus the encoding is not
				// US-ASCII
				// it may be either an 8-bit encoding or UTF-8
				highOrderBit = true;
				// a two-byte sequence was encountered
				if (isTwoBytesSequence(b0)) {
					// there must be one continuation byte of the form 10xxxxxx,
					// otherwise the following character is not a valid UTF-8
					// construct
					if (!isContinuationChar(b1))
						validU8Char = false;
					else
						i++;
				}
				// a three-byte sequence was encountered
				else if (isThreeBytesSequence(b0)) {
					// there must be two continuation bytes of the form
					// 10xxxxxx,
					// otherwise the following character is not a valid UTF-8
					// construct
					if (!(isContinuationChar(b1) && isContinuationChar(b2)))
						validU8Char = false;
					else
						i += 2;
				}
				// a four-byte sequence was encountered
				else if (isFourBytesSequence(b0)) {
					// there must be three continuation bytes of the form
					// 10xxxxxx,
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -