📄 charsettoolkit.java
字号:
/* * $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $ * * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved. * * Redistribution and use of this software and associated documentation * ("Software"), with or without modification, are permitted provided that the * following conditions are met: * 1. Redistributions of source code must retain copyright statements and * notices. Redistributions must also contain a copy of this document. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name "groovy" must not be used to endorse or promote products * derived from this Software without prior written permission of The Codehaus. * For written permission, please contact info@codehaus.org. * 4. Products derived from this Software may not be called "groovy" nor may * "groovy" appear in their names without prior written permission of The * Codehaus. "groovy" is a registered trademark of The Codehaus. * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/ * * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */package groovy.util;import java.io.*;import java.nio.charset.Charset;import java.util.*;/** * <p>Utility class to guess the encoding of a given text file.</p> * * <p>Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer * is wide enough, the charset should also be discovered.</p> * * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p> * * <p>Usage:</p> * <pre> * // guess the encoding * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096); * * // create a reader with the correct charset * CharsetToolkit toolkit = new CharsetToolkit(file); * BufferedReader reader = toolkit.getReader(); * * // read the file content * String line; * while ((line = br.readLine())!= null) * { * System.out.println(line); * } * </pre> * * @author Guillaume Laforge */public class CharsetToolkit { private byte[] buffer; private Charset defaultCharset; private Charset charset; private boolean enforce8Bit = true; private File file; /** * Constructor of the <code>CharsetToolkit</code> utility class. * * @param file of which we want to know the encoding. */ public CharsetToolkit(File file) throws IOException { this.file = file; InputStream input = new FileInputStream(file); byte[] bytes = new byte[4096]; int bytesRead = input.read(bytes); if (bytesRead == -1) { this.buffer = new byte[0]; } else if (bytesRead < 4096) { byte[] bytesToGuess = new byte[bytesRead]; System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead); this.buffer = bytesToGuess; } else { this.buffer = bytes; } this.defaultCharset = getDefaultSystemCharset(); this.charset = null; } /** * Defines the default <code>Charset</code> used in case the buffer represents * an 8-bit <code>Charset</code>. * * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code> * if an 8-bit <code>Charset</code> is encountered. */ public void setDefaultCharset(Charset defaultCharset) { if (defaultCharset != null) this.defaultCharset = defaultCharset; else this.defaultCharset = getDefaultSystemCharset(); } public Charset getCharset() { if (this.charset == null) this.charset = guessEncoding(); return charset; } /** * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. * It might be a file without any special character in the range 128-255, but that may be or become * a file encoded with the default <code>charset</code> rather than US-ASCII. * * @param enforce a boolean specifying the use or not of US-ASCII. */ public void setEnforce8Bit(boolean enforce) { this.enforce8Bit = enforce; } /** * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding. * * @return a boolean representing the flag of use of US-ASCII. */ public boolean getEnforce8Bit() { return this.enforce8Bit; } /** * Retrieves the default Charset * @return */ public Charset getDefaultCharset() { return defaultCharset; } /** * <p>Guess the encoding of the provided buffer.</p> * If Byte Order Markers are encountered at the beginning of the buffer, we immidiately * return the charset implied by this BOM. Otherwise, the file would not be a human * readable text file.</p> * * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not. * If it is not UTF-8, we assume the encoding is the default system encoding * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p> * * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p> * <pre> * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * </pre> * <p>With UTF-8, 0xFE and 0xFF never appear.</p> * * @return the Charset recognized. */ private Charset guessEncoding() { // if the file has a Byte Order Marker, we can assume the file is in UTF-xx // otherwise, the file would not be human readable if (hasUTF8Bom()) return Charset.forName("UTF-8"); if (hasUTF16LEBom()) return Charset.forName("UTF-16LE"); if (hasUTF16BEBom()) return Charset.forName("UTF-16BE"); // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding // otherwise, the file is in US-ASCII boolean highOrderBit = false; // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid // if it's not the case, we can assume the encoding is the default encoding of the system boolean validU8Char = true; // TODO the buffer is not read up to the end, but up to length - 6 int length = buffer.length; int i = 0; while (i < length - 6) { byte b0 = buffer[i]; byte b1 = buffer[i + 1]; byte b2 = buffer[i + 2]; byte b3 = buffer[i + 3]; byte b4 = buffer[i + 4]; byte b5 = buffer[i + 5]; if (b0 < 0) { // a high order bit was encountered, thus the encoding is not US-ASCII // it may be either an 8-bit encoding or UTF-8 highOrderBit = true; // a two-bytes sequence was encoutered if (isTwoBytesSequence(b0)) { // there must be one continuation byte of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!isContinuationChar(b1)) validU8Char = false;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -