📄 charsettoolkit.java
字号:
else i++; } // a three-bytes sequence was encoutered else if (isThreeBytesSequence(b0)) { // there must be two continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2))) validU8Char = false; else i += 2; } // a four-bytes sequence was encoutered else if (isFourBytesSequence(b0)) { // there must be three continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) validU8Char = false; else i += 3; } // a five-bytes sequence was encoutered else if (isFiveBytesSequence(b0)) { // there must be four continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4))) validU8Char = false; else i += 4; } // a six-bytes sequence was encoutered else if (isSixBytesSequence(b0)) { // there must be five continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5))) validU8Char = false; else i += 5; } else validU8Char = false; } if (!validU8Char) break; i++; } // if no byte with an high order bit set, the encoding is US-ASCII // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) if (!highOrderBit) { // returns the default charset rather than US-ASCII if the enforce8Bit flag is set. if (this.enforce8Bit) return this.defaultCharset; else return Charset.forName("US-ASCII"); } // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, // otherwise the file would not be human readable if (validU8Char) return Charset.forName("UTF-8"); // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding return this.defaultCharset; } /** * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; * * @param b a byte. * @return true if it's a continuation char. */ private static boolean isContinuationChar(byte b) { return -128 <= b && b <= -65; } /** * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a two-bytes sequence. */ private static boolean isTwoBytesSequence(byte b) { return -64 <= b && b <= -33; } /** * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a three-bytes sequence. */ private static boolean isThreeBytesSequence(byte b) { return -32 <= b && b <= -17; } /** * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a four-bytes sequence. */ private static boolean isFourBytesSequence(byte b) { return -16 <= b && b <= -9; } /** * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a five-bytes sequence. */ private static boolean isFiveBytesSequence(byte b) { return -8 <= b && b <= -5; } /** * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a six-bytes sequence. */ private static boolean isSixBytesSequence(byte b) { return -4 <= b && b <= -3; } /** * Retrieve the default charset of the system. * * @return the default <code>Charset</code>. */ public static Charset getDefaultSystemCharset() { return Charset.forName(System.getProperty("file.encoding")); } /** * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). * * @return true if the buffer has a BOM for UTF8. */ public boolean hasUTF8Bom() { if (buffer.length >= 3) return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65); else return false; } /** * Has a Byte Order Marker for UTF-16 Low Endian * (ucs-2le, ucs-4le, and ucs-16le). * * @return true if the buffer has a BOM for UTF-16 Low Endian. */ public boolean hasUTF16LEBom() { if (buffer.length >= 2) return (buffer[0] == -1 && buffer[1] == -2); else return false; } /** * Has a Byte Order Marker for UTF-16 Big Endian * (utf-16 and ucs-2). * * @return true if the buffer has a BOM for UTF-16 Big Endian. */ public boolean hasUTF16BEBom() { if (buffer.length >= 2) return (buffer[0] == -2 && buffer[1] == -1); else return false; } /** * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code> * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the * method <code>guessEncoding()</code>. * * @return a <code>BufferedReader</code> * @throws FileNotFoundException if the file is not found. */ public BufferedReader getReader() throws FileNotFoundException { LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset())); if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) { try { reader.read(); } catch (IOException e) { // should never happen, as a file with no content // but with a BOM has at least one char } } return reader; } /** * Retrieves all the available <code>Charset</code>s on the platform, * among which the default <code>charset</code>. * * @return an array of <code>Charset</code>s. */ public static Charset[] getAvailableCharsets() { Collection collection = Charset.availableCharsets().values(); return (Charset[]) collection.toArray(new Charset[collection.size()]); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -