📄 charsettoolkit.java

📁 大名鼎鼎的 Java 动态脚本语言，已经通过了 Sun 的认证。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                    else                        i++;                }                // a three-bytes sequence was encoutered                else if (isThreeBytesSequence(b0)) {                    // there must be two continuation bytes of the form 10xxxxxx,                    // otherwise the following characteris is not a valid UTF-8 construct                    if (!(isContinuationChar(b1) && isContinuationChar(b2)))                        validU8Char = false;                    else                        i += 2;                }                // a four-bytes sequence was encoutered                else if (isFourBytesSequence(b0)) {                    // there must be three continuation bytes of the form 10xxxxxx,                    // otherwise the following characteris is not a valid UTF-8 construct                    if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))                        validU8Char = false;                    else                        i += 3;                }                // a five-bytes sequence was encoutered                else if (isFiveBytesSequence(b0)) {                    // there must be four continuation bytes of the form 10xxxxxx,                    // otherwise the following characteris is not a valid UTF-8 construct                    if (!(isContinuationChar(b1)                        && isContinuationChar(b2)                        && isContinuationChar(b3)                        && isContinuationChar(b4)))                        validU8Char = false;                    else                        i += 4;                }                // a six-bytes sequence was encoutered                else if (isSixBytesSequence(b0)) {                    // there must be five continuation bytes of the form 10xxxxxx,                    // otherwise the following characteris is not a valid UTF-8 construct                    if (!(isContinuationChar(b1)                        && isContinuationChar(b2)   
                     && isContinuationChar(b3)                        && isContinuationChar(b4)                        && isContinuationChar(b5)))                        validU8Char = false;                    else                        i += 5;                }                else                    validU8Char = false;            }            if (!validU8Char)                break;            i++;        }        // if no byte with an high order bit set, the encoding is US-ASCII        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)        if (!highOrderBit) {            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.            if (this.enforce8Bit)                return this.defaultCharset;            else                return Charset.forName("US-ASCII");        }        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,        // otherwise the file would not be human readable        if (validU8Char)            return Charset.forName("UTF-8");        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding        return this.defaultCharset;    }    /**     * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;     *     * @param b a byte.     * @return true if it's a continuation char.     */    private static boolean isContinuationChar(byte b) {        return -128 <= b && b <= -65;    }    /**     * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.     *     * @param b a byte.     * @return true if it's the first byte of a two-bytes sequence.     */    private static boolean isTwoBytesSequence(byte b) {        return -64 <= b && b <= -33;    }    /**     * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.     *     * @param b a byte.     
* @return true if it's the first byte of a three-bytes sequence.     */    private static boolean isThreeBytesSequence(byte b) {        return -32 <= b && b <= -17;    }    /**     * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.     *     * @param b a byte.     * @return true if it's the first byte of a four-bytes sequence.     */    private static boolean isFourBytesSequence(byte b) {        return -16 <= b && b <= -9;    }    /**     * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.     *     * @param b a byte.     * @return true if it's the first byte of a five-bytes sequence.     */    private static boolean isFiveBytesSequence(byte b) {        return -8 <= b && b <= -5;    }    /**     * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.     *     * @param b a byte.     * @return true if it's the first byte of a six-bytes sequence.     */    private static boolean isSixBytesSequence(byte b) {        return -4 <= b && b <= -3;    }    /**     * Retrieve the default charset of the system.     *     * @return the default <code>Charset</code>.     */    public static Charset getDefaultSystemCharset() {        return Charset.forName(System.getProperty("file.encoding"));    }    /**     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).     *     * @return true if the buffer has a BOM for UTF8.     */    public boolean hasUTF8Bom() {        if (buffer.length >= 3)            return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65);        else            return false;    }    /**     * Has a Byte Order Marker for UTF-16 Low Endian     * (ucs-2le, ucs-4le, and ucs-16le).     *     * @return true if the buffer has a BOM for UTF-16 Low Endian.     
*/
    public boolean hasUTF16LEBom() {
        // Too short to hold the two BOM bytes.
        if (buffer.length < 2) {
            return false;
        }
        // Little-endian mark: FF FE.
        return buffer[0] == (byte) 0xFF && buffer[1] == (byte) 0xFE;
    }

    /**
     * Tells whether the buffer starts with the UTF-16 Big Endian Byte Order Marker,
     * the bytes FE FF (utf-16 and ucs-2).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // Too short to hold the two BOM bytes.
        if (buffer.length < 2) {
            return false;
        }
        // Big-endian mark: FE FF.
        return buffer[0] == (byte) 0xFE && buffer[1] == (byte) 0xFF;
    }

    /**
     * Opens a <code>LineNumberReader</code> (returned as a <code>BufferedReader</code>) on the
     * <code>file</code> field, decoding with the charset returned by <code>getCharset()</code>.
     * When the buffer starts with a UTF-8 or UTF-16 BOM, one character is read and discarded
     * before the reader is handed back, so the caller does not see the marker.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        // The file is opened first and the charset resolved second, as in a single nested expression.
        FileInputStream rawStream = new FileInputStream(file);
        InputStreamReader decodingReader = new InputStreamReader(rawStream, getCharset());
        LineNumberReader lineReader = new LineNumberReader(decodingReader);

        boolean startsWithBom = hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom();
        if (!startsWithBom) {
            return lineReader;
        }

        try {
            // Consume the single decoded BOM character.
            lineReader.read();
        }
        catch (IOException ignored) {
            // Deliberately ignored: should never happen, as a file with no content
            // but with a BOM has at least one char. The reader is returned as is.
        }
        return lineReader;
    }

    /**
     * Lists every <code>Charset</code> reported by <code>Charset.availableCharsets()</code>,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection allCharsets = Charset.availableCharsets().values();
        Charset[] target = new Charset[allCharsets.size()];
        // toArray fills and returns the supplied array when it is large enough.
        return (Charset[]) allCharsets.toArray(target);
    }
}
上一页 12
💿 文件大小 1630 K
👤 上传用户 hjf
📂 所属分类 Java编程
🏷️ 相关标签

#java #sun #动态 #脚本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -