charsettoolkit.java

来自「Groovy动态语言 运行在JVM中的动态语言 可以方便的处理业务逻辑变化大的业」· Java 代码 · 共 426 行 · 第 1/2 页

JAVA
426
字号
                    if (!isContinuationChar(b1))
                        validU8Char = false;
                    else
                        i++;
                }
                // a three-bytes sequence was encoutered
                else if (isThreeBytesSequence(b0)) {
                    // there must be two continuation bytes of the form 10xxxxxx,
                    // otherwise the following characteris is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2)))
                        validU8Char = false;
                    else
                        i += 2;
                }
                // a four-bytes sequence was encoutered
                else if (isFourBytesSequence(b0)) {
                    // there must be three continuation bytes of the form 10xxxxxx,
                    // otherwise the following characteris is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
                        validU8Char = false;
                    else
                        i += 3;
                }
                // a five-bytes sequence was encoutered
                else if (isFiveBytesSequence(b0)) {
                    // there must be four continuation bytes of the form 10xxxxxx,
                    // otherwise the following characteris is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1)
                        && isContinuationChar(b2)
                        && isContinuationChar(b3)
                        && isContinuationChar(b4)))
                        validU8Char = false;
                    else
                        i += 4;
                }
                // a six-bytes sequence was encoutered
                else if (isSixBytesSequence(b0)) {
                    // there must be five continuation bytes of the form 10xxxxxx,
                    // otherwise the following characteris is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1)
                        && isContinuationChar(b2)
                        && isContinuationChar(b3)
                        && isContinuationChar(b4)
                        && isContinuationChar(b5)))
                        validU8Char = false;
                    else
                        i += 5;
                }
                else
                    validU8Char = false;
            }
            if (!validU8Char)
                break;
            i++;
        }
        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit)
                return this.defaultCharset;
            else
                return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char)
            return Charset.forName("UTF-8");
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        if (buffer.length >= 3)
            return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65);
        else
            return false;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Low Endian
     * (ucs-2le, ucs-4le, and ucs-16le).
     *
     * @return true if the buffer has a BOM for UTF-16 Low Endian.
     */
    public boolean hasUTF16LEBom() {
        if (buffer.length >= 2)
            return (buffer[0] == -1 && buffer[1] == -2);
        else
            return false;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian
     * (utf-16 and ucs-2).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        if (buffer.length >= 2)
            return (buffer[0] == -2 && buffer[1] == -1);
        else
            return false;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                reader.read();
            }
            catch (IOException e) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection collection = Charset.availableCharsets().values();
        return (Charset[]) collection.toArray(new Charset[collection.size()]);
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?