charsettoolkit.java

来自「Groovy动态语言 运行在JVM中的动态语言 可以方便的处理业务逻辑变化大的业」· Java 代码 · 共 426 行 · 第 1/2 页

JAVA
426
字号
/*
 * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $
 *
 * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
 *
 * Redistribution and use of this software and associated documentation
 * ("Software"), with or without modification, are permitted provided that the
 * following conditions are met:
 *  1. Redistributions of source code must retain copyright statements and
 * notices. Redistributions must also contain a copy of this document.
 *  2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *  3. The name "groovy" must not be used to endorse or promote products
 * derived from this Software without prior written permission of The Codehaus.
 * For written permission, please contact info@codehaus.org.
 *  4. Products derived from this Software may not be called "groovy" nor may
 * "groovy" appear in their names without prior written permission of The
 * Codehaus. "groovy" is a registered trademark of The Codehaus.
 *  5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
 *
 * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

package groovy.util;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
 *
 * // create a reader with the correct charset
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    private byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     *
     * @param file of which we want to know the encoding.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        // the charset is guessed lazily, on the first call to getCharset()
        this.charset = null;
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[4096];
            // A single read() call may legally return fewer bytes than requested even
            // though more data is available, so keep reading until the buffer is full
            // or the end of the stream is reached. Otherwise the guess could be made
            // on a needlessly short sample of the file.
            int total = 0;
            while (total < bytes.length) {
                int bytesRead = input.read(bytes, total, bytes.length - total);
                if (bytesRead == -1) {
                    break;
                }
                total += bytesRead;
            }
            if (total < bytes.length) {
                // shrink the buffer to the bytes actually read (possibly zero for an empty file)
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                this.buffer = bytesToGuess;
            }
            else {
                this.buffer = bytes;
            }
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort: the sample has already been read (or the read failure is
                // already propagating), a failure to close must not mask that outcome
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        // a null argument resets the default to whatever getDefaultSystemCharset() returns
        this.defaultCharset = (defaultCharset == null)
                ? getDefaultSystemCharset()
                : defaultCharset;
    }

    /**
     * Returns the charset guessed for the buffer. The guess is computed by
     * <code>guessEncoding()</code> the first time it is needed and the result
     * is kept in the <code>charset</code> field for later calls.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset != null) {
            return this.charset;
        }
        this.charset = guessEncoding();
        return this.charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        // plain flag store; the value is handed back by getEnforce8Bit()
        enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        // hands back the flag exactly as last stored (it is initialised to true)
        return enforce8Bit;
    }

    /**
     * Retrieves the default Charset
     */
    public Charset getDefaultCharset() {
        // the value set by the constructor or by the last call to setDefaultCharset()
        return this.defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.</p>
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom())
            return Charset.forName("UTF-8");
        if (hasUTF16LEBom())
            return Charset.forName("UTF-16LE");
        if (hasUTF16BEBom())
            return Charset.forName("UTF-16BE");

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        // TODO the buffer is not read up to the end, but up to length - 6

        int length = buffer.length;
        int i = 0;
        while (i < length - 6) {
            byte b0 = buffer[i];
            byte b1 = buffer[i + 1];
            byte b2 = buffer[i + 2];
            byte b3 = buffer[i + 3];
            byte b4 = buffer[i + 4];
            byte b5 = buffer[i + 5];
            if (b0 < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;
                // a two-bytes sequence was encoutered
                if (isTwoBytesSequence(b0)) {
                    // there must be one continuation byte of the form 10xxxxxx,
                    // otherwise the following characteris is not a valid UTF-8 construct

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?