charsettoolkit.java
来自「Groovy动态语言 运行在JVM中的动态语言 可以方便的处理业务逻辑变化大的业」· Java 代码 · 共 426 行 · 第 1/2 页
JAVA
426 行
if (!isContinuationChar(b1))
validU8Char = false;
else
i++;
}
// a three-bytes sequence was encoutered
else if (isThreeBytesSequence(b0)) {
// there must be two continuation bytes of the form 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8 construct
if (!(isContinuationChar(b1) && isContinuationChar(b2)))
validU8Char = false;
else
i += 2;
}
// a four-bytes sequence was encoutered
else if (isFourBytesSequence(b0)) {
// there must be three continuation bytes of the form 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8 construct
if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
validU8Char = false;
else
i += 3;
}
// a five-bytes sequence was encoutered
else if (isFiveBytesSequence(b0)) {
// there must be four continuation bytes of the form 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8 construct
if (!(isContinuationChar(b1)
&& isContinuationChar(b2)
&& isContinuationChar(b3)
&& isContinuationChar(b4)))
validU8Char = false;
else
i += 4;
}
// a six-bytes sequence was encoutered
else if (isSixBytesSequence(b0)) {
// there must be five continuation bytes of the form 10xxxxxx,
// otherwise the following characteris is not a valid UTF-8 construct
if (!(isContinuationChar(b1)
&& isContinuationChar(b2)
&& isContinuationChar(b3)
&& isContinuationChar(b4)
&& isContinuationChar(b5)))
validU8Char = false;
else
i += 5;
}
else
validU8Char = false;
}
if (!validU8Char)
break;
i++;
}
// if no byte with an high order bit set, the encoding is US-ASCII
// (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
if (!highOrderBit) {
// returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
if (this.enforce8Bit)
return this.defaultCharset;
else
return Charset.forName("US-ASCII");
}
// if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
// otherwise the file would not be human readable
if (validU8Char)
return Charset.forName("UTF-8");
// finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
return this.defaultCharset;
}
/**
* If the byte has the form 10xxxxxx, then it's a continuation byte of a multi-byte character.
*
* @param b a byte.
* @return true if it's a continuation char.
*/
private static boolean isContinuationChar(byte b) {
    // Keep only the two most significant bits: a continuation byte carries
    // the marker 10 there (0x80..0xBF, i.e. -128..-65 as a signed byte).
    final int topTwoBits = b & 0xC0;
    return topTwoBits == 0x80;
}
/**
* If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a two-bytes sequence.
*/
private static boolean isTwoBytesSequence(byte b) {
    // Keep the three most significant bits: a two-byte lead carries the
    // marker 110 there (0xC0..0xDF, i.e. -64..-33 as a signed byte).
    final int topThreeBits = b & 0xE0;
    return topThreeBits == 0xC0;
}
/**
* If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a three-bytes sequence.
*/
private static boolean isThreeBytesSequence(byte b) {
    // Keep the four most significant bits: a three-byte lead carries the
    // marker 1110 there (0xE0..0xEF, i.e. -32..-17 as a signed byte).
    final int topFourBits = b & 0xF0;
    return topFourBits == 0xE0;
}
/**
* If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a four-bytes sequence.
*/
private static boolean isFourBytesSequence(byte b) {
    // Keep the five most significant bits: a four-byte lead carries the
    // marker 11110 there (0xF0..0xF7, i.e. -16..-9 as a signed byte).
    final int topFiveBits = b & 0xF8;
    return topFiveBits == 0xF0;
}
/**
* If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a five-bytes sequence.
*/
private static boolean isFiveBytesSequence(byte b) {
    // Keep the six most significant bits: a five-byte lead carries the
    // marker 111110 there (0xF8..0xFB, i.e. -8..-5 as a signed byte).
    final int topSixBits = b & 0xFC;
    return topSixBits == 0xF8;
}
/**
* If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a six-bytes sequence.
*/
private static boolean isSixBytesSequence(byte b) {
    // Keep the seven most significant bits: a six-byte lead carries the
    // marker 1111110 there (0xFC..0xFD, i.e. -4..-3 as a signed byte).
    final int topSevenBits = b & 0xFE;
    return topSevenBits == 0xFC;
}
/**
* Retrieve the default charset of the system.
*
* @return the default <code>Charset</code>.
*/
public static Charset getDefaultSystemCharset() {
    // The "file.encoding" property may be absent, cleared at runtime, or set to a
    // name this JVM does not support. Charset.forName rejects each of those with
    // an IllegalArgumentException (or one of its subclasses), so instead of
    // letting that escape we fall back to the JVM's own default charset.
    String configured = System.getProperty("file.encoding");
    if (configured != null) {
        try {
            return Charset.forName(configured);
        }
        catch (IllegalArgumentException ignored) {
            // illegal or unsupported charset name: use the fallback below
        }
    }
    return Charset.defaultCharset();
}
/**
* Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
*
* @return true if the buffer has a BOM for UTF8.
*/
public boolean hasUTF8Bom() {
    // A buffer shorter than three entries cannot hold the marker; the
    // short-circuit keeps the index accesses below in bounds.
    // -17, -69, -65 are the signed-byte values of EF BB BF.
    return buffer.length >= 3
            && buffer[0] == -17
            && buffer[1] == -69
            && buffer[2] == -65;
}
/**
* Has a Byte Order Marker for UTF-16 Little Endian
* (ucs-2le, ucs-4le, and ucs-16le).
*
* @return true if the buffer has a BOM for UTF-16 Little Endian.
*/
public boolean hasUTF16LEBom() {
    // Too short to contain a two-entry marker: the short-circuit answers
    // false without touching the buffer contents.
    // -1, -2 are the signed-byte values of FF FE.
    return buffer.length >= 2
            && buffer[0] == -1
            && buffer[1] == -2;
}
/**
* Has a Byte Order Marker for UTF-16 Big Endian
* (utf-16 and ucs-2).
*
* @return true if the buffer has a BOM for UTF-16 Big Endian.
*/
public boolean hasUTF16BEBom() {
    // Too short to contain a two-entry marker: the short-circuit answers
    // false without touching the buffer contents.
    // -2, -1 are the signed-byte values of FE FF.
    return buffer.length >= 2
            && buffer[0] == -2
            && buffer[1] == -1;
}
/**
* Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
* specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
* method <code>guessEncoding()</code>.
*
* @return a <code>BufferedReader</code>
* @throws FileNotFoundException if the file is not found.
*/
public BufferedReader getReader() throws FileNotFoundException {
    // Build the reader chain step by step: raw bytes -> decoded chars -> line-counting reader.
    // The stream is opened before getCharset() is evaluated, as in a nested constructor call.
    FileInputStream rawInput = new FileInputStream(file);
    InputStreamReader decoded = new InputStreamReader(rawInput, getCharset());
    LineNumberReader lineReader = new LineNumberReader(decoded);

    boolean startsWithBom = hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom();
    if (!startsWithBom) {
        return lineReader;
    }

    // Consume one character so that callers do not see the byte order marker.
    try {
        lineReader.read();
    }
    catch (IOException ignored) {
        // should never happen, as a file with no content
        // but with a BOM has at least one char
    }
    return lineReader;
}
/**
* Retrieves all the available <code>Charset</code>s on the platform,
* among which the default <code>charset</code>.
*
* @return an array of <code>Charset</code>s.
*/
public static Charset[] getAvailableCharsets() {
    // Charset.availableCharsets() maps canonical names to charsets; only the values are needed.
    Collection installed = Charset.availableCharsets().values();
    // An exactly-sized destination array is filled in place by toArray.
    Charset[] result = new Charset[installed.size()];
    installed.toArray(result);
    return result;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?