📄 charsettoolkit.java
字号:
package com.easyjf.web.tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Collection;
/**
* <p>
* Utility class to guess the encoding of a given byte array. The guess is
* unfortunately not 100% reliable, especially for 8-bit charsets: short of
* statistical analysis, it is not possible to know which 8-bit charset is
* used. In that case we infer that the charset encountered is the same as the
* default charset.
* </p>
*
* <p>
* On the other hand, Unicode files encoded in UTF-16 (little or big endian) or
* UTF-8 files with a Byte Order Marker are easy to detect. For UTF-8 files
* with no BOM, if the buffer is wide enough, it's easy to guess.
* </p>
*
* <p>
* Tested against a complicated UTF-8 file, Sun's implementation does not render
* bad UTF-8 constructs as expected by the specification. But with a buffer wide
* enough, the method guessEncoding() did behave correctly and recognized the
* UTF-8 charset.
* </p>
*
* <p>
* A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
* </p>
*
* <p>
* Usage:
* </p>
*
* <pre>
* // guess the encoding
* Charset guessedCharset = CharsetToolkit.guessEncoding(
* file, 4096);
*
* // create a reader with the charset we've just discovered
* FileInputStream fis = new FileInputStream(file);
* InputStreamReader isr = new InputStreamReader(fis, guessedCharset);
* BufferedReader br = new BufferedReader(isr);
*
* // read the file content
* String line;
* while ((line = br.readLine()) != null) {
* System.out.println(line);
* }
* </pre>
*
* <p>
* Date: 18 juil. 2002
* </p>
*
* @author Guillaume LAFORGE
*/
public class CharsetToolkit {
// Bytes whose encoding is to be guessed; the constructors store the caller's
// array reference as-is (no copy is made).
private byte[] buffer;
// Fallback charset; per setDefaultCharset()'s documentation it is what
// guessEncoding() returns when an 8-bit charset is encountered.
private Charset defaultCharset;
// Flag toggled through setEnforce8Bit(); documented there as "return the
// default encoding rather than US-ASCII". Off by default.
private boolean enforce8Bit = false;
/**
* Constructor of the <code>CharsetToolkit</code> utility class. The default
* charset is initialised from <code>getDefaultSystemCharset()</code>.
*
* @param buffer
* the byte buffer of which we want to know the encoding.
*/
public CharsetToolkit(byte[] buffer) {
this.buffer = buffer;
this.defaultCharset = getDefaultSystemCharset();
}
/**
* Constructor of the <code>CharsetToolkit</code> utility class, with an
* explicit default charset.
*
* @param buffer
* the byte buffer of which we want to know the encoding.
* @param defaultCharset
* the default Charset to use in case an 8-bit charset is
* recognized.
*/
public CharsetToolkit(byte[] buffer, Charset defaultCharset) {
    // Keep a reference to the bytes to analyse; no copy is made.
    this.buffer = buffer;
    // Go through the setter so a null argument is resolved the same way as
    // for any later call to setDefaultCharset().
    this.setDefaultCharset(defaultCharset);
}
/**
* Defines the default <code>Charset</code> used in case the buffer
* represents an 8-bit <code>Charset</code>.
*
* @param defaultCharset
* the default <code>Charset</code> to be returned by
* <code>guessEncoding()</code> if an 8-bit
* <code>Charset</code> is encountered.
*/
public void setDefaultCharset(Charset defaultCharset) {
    // A null argument means "no preference": fall back to the charset
    // reported by getDefaultSystemCharset().
    this.defaultCharset = (defaultCharset == null)
            ? getDefaultSystemCharset()
            : defaultCharset;
}
/**
* If US-ASCII is recognized, force the default encoding to be returned rather
* than US-ASCII. The file may simply contain no character in the range
* 128-255 yet, but still be (or later become) a file encoded with the default
* <code>charset</code> rather than US-ASCII.
*
* @param enforce
* a boolean specifying the use or not of US-ASCII.
*/
public void setEnforce8Bit(boolean enforce) {
    // Record the caller's preference; it is only stored here.
    enforce8Bit = enforce;
}
/**
* Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII
* encoding.
*
* @return a boolean representing the flag of use of US-ASCII.
*/
public boolean getEnforce8Bit() {
    // Plain accessor for the flag stored by setEnforce8Bit().
    final boolean enforced = enforce8Bit;
    return enforced;
}
/**
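* Retrieves the default <code>Charset</code>, i.e. the one set at construction
* time or through <code>setDefaultCharset()</code>.
*
* @return the current default <code>Charset</code>.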
*/
public Charset getDefaultCharset() {
    // Plain accessor for the fallback charset held by this instance.
    return this.defaultCharset;
}
/**
* <p>
* Guess the encoding of the provided buffer.
* </p>
* <p>
* If Byte Order Markers are encountered at the beginning of the buffer, we
* immediately return the charset implied by this BOM. Otherwise, the file
* would not be a human readable text file.
* </p>
*
* <p>
* If there is no BOM, this method tries to discern whether the file is
* UTF-8 or not. If it is not UTF-8, we assume the encoding is the default
* system encoding (of course, it might be any 8-bit charset, but usually,
* an 8-bit charset is the default one).
* </p>
*
* <p>
* It is possible to discern UTF-8 thanks to the pattern of characters with
* a multi-byte sequence.
* </p>
*
* <pre>
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* </pre>
*
* <p>
* With UTF-8, 0xFE and 0xFF never appear.
* </p>
*
* @return the Charset recognized.
*/
public Charset guessEncoding() {
// if the file has a Byte Order Marker, we can assume the file is in
// UTF-xx
// otherwise, the file would not be human readable
if (hasUTF8Bom(buffer))
return Charset.forName("UTF-8");
if (hasUTF16LEBom(buffer))
return Charset.forName("UTF-16LE");
if (hasUTF16BEBom(buffer))
return Charset.forName("UTF-16BE");
// if a byte has its most significant bit set, the file is in UTF-8 or
// in the default encoding
// otherwise, the file is in US-ASCII
boolean highOrderBit = false;
// if the file is in UTF-8, high order bytes must have a certain value,
// in order to be valid
// if it's not the case, we can assume the encoding is the default
// encoding of the system
boolean validU8Char = true;
// TODO the buffer is not read up to the end, but up to length -
int length = buffer.length;
int i = 0;
while (i < length - 6) {
byte b0 = buffer[i];
byte b1 = buffer[i + 1];
byte b2 = buffer[i + 2];
byte b3 = buffer[i + 3];
byte b4 = buffer[i + 4];
byte b5 = buffer[i + 5];
if (b0 < 0) {
// a high order bit was encountered, thus the encoding is not
// US-ASCII
// it may be either an 8-bit encoding or UTF-
highOrderBit = true;
// a two-bytes sequence was encoutered
if (isTwoBytesSequence(b0)) {
// there must be one continuation byte of the form 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8
// construct
if (!isContinuationChar(b1))
validU8Char = false;
else
i++;
}
// a three-bytes sequence was encoutered
else if (isThreeBytesSequence(b0)) {
// there must be two continuation bytes of the form
// 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8
// construct
if (!(isContinuationChar(b1) && isContinuationChar(b2)))
validU8Char = false;
else
i += 2;
}
// a four-bytes sequence was encoutered
else if (isFourBytesSequence(b0)) {
// there must be three continuation bytes of the form
// 10xxxxxx,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -