📄 charsettoolkit.java
字号:
// otherwise the following character is not a valid UTF-8
// construct
if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
validU8Char = false;
else
i += 3;
}
// a five-byte sequence was encountered
else if (isFiveBytesSequence(b0)) {
// there must be four continuation bytes of the form
// 10xxxxxx,
// otherwise the following character is not a valid UTF-8
// construct
if (!(isContinuationChar(b1) && isContinuationChar(b2)
&& isContinuationChar(b3) && isContinuationChar(b4)))
validU8Char = false;
else
i += 4;
}
// a six-byte sequence was encountered
else if (isSixBytesSequence(b0)) {
// there must be five continuation bytes of the form
// 10xxxxxx,
// otherwise the following character is not a valid UTF-8
// construct
if (!(isContinuationChar(b1) && isContinuationChar(b2)
&& isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5)))
validU8Char = false;
else
i += 5;
} else
validU8Char = false;
}
if (!validU8Char)
break;
i++;
}
// if no byte with an high order bit set, the encoding is US-ASCII
// (it might have been UTF-7, but this encoding is usually internally
// used only by mail systems)
if (!highOrderBit) {
// returns the default charset rather than US-ASCII if the
// enforce8Bit flag is set.
if (this.enforce8Bit)
return this.defaultCharset;
else
return Charset.forName("US-ASCII");
}
// if no invalid UTF-8 were encountered, we can assume the encoding is
// UTF-8,
// otherwise the file would not be human readable
if (validU8Char)
return Charset.forName("UTF-8");
// finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is
// the default encoding
return this.defaultCharset;
}
/**
 * Guesses the charset of a file from its leading bytes, using the value of
 * {@link #getDefaultSystemCharset()} as the toolkit's default charset.
 *
 * @param f
 *            the file to examine.
 * @param bufferLength
 *            the number of leading bytes to sample; the buffer handed to the
 *            toolkit always has exactly this length.
 * @return the charset returned by the toolkit's <code>guessEncoding()</code>.
 * @throws FileNotFoundException
 *             if the file cannot be opened for reading.
 * @throws IOException
 *             if reading from or closing the file fails.
 */
public static Charset guessEncoding(File f, int bufferLength)
        throws FileNotFoundException, IOException {
    byte[] buffer = new byte[bufferLength];
    FileInputStream fis = new FileInputStream(f);
    try {
        // A single read() may deliver fewer bytes than requested even when
        // more are available, so keep reading until the buffer is full or
        // the end of the file is reached. Bytes past the end of a short
        // file stay at their initial value of zero.
        int filled = 0;
        while (filled < buffer.length) {
            int count = fis.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
    } finally {
        // Release the file handle even when reading fails.
        fis.close();
    }
    CharsetToolkit toolkit = new CharsetToolkit(buffer);
    toolkit.setDefaultCharset(getDefaultSystemCharset());
    return toolkit.guessEncoding();
}
/**
 * Guesses the charset of a file from its leading bytes, using the supplied
 * charset as the toolkit's default charset.
 *
 * @param f
 *            the file to examine.
 * @param bufferLength
 *            the number of leading bytes to sample; the buffer handed to the
 *            toolkit always has exactly this length.
 * @param defaultCharset
 *            the charset passed to the toolkit's
 *            <code>setDefaultCharset</code>.
 * @return the charset returned by the toolkit's <code>guessEncoding()</code>.
 * @throws FileNotFoundException
 *             if the file cannot be opened for reading.
 * @throws IOException
 *             if reading from or closing the file fails.
 */
public static Charset guessEncoding(File f, int bufferLength,
        Charset defaultCharset) throws FileNotFoundException, IOException {
    byte[] buffer = new byte[bufferLength];
    FileInputStream fis = new FileInputStream(f);
    try {
        // A single read() may deliver fewer bytes than requested even when
        // more are available, so keep reading until the buffer is full or
        // the end of the file is reached. Bytes past the end of a short
        // file stay at their initial value of zero.
        int filled = 0;
        while (filled < buffer.length) {
            int count = fis.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
    } finally {
        // Release the file handle even when reading fails.
        fis.close();
    }
    CharsetToolkit toolkit = new CharsetToolkit(buffer);
    toolkit.setDefaultCharset(defaultCharset);
    return toolkit.guessEncoding();
}
/**
 * Tells whether a byte is a continuation byte of a multi-byte character,
 * i.e. whether it has the bit pattern 10xxxxxx (0x80 to 0xBF).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's a continuation byte.
 */
private static boolean isContinuationChar(byte b) {
    // Keep only the two high-order bits and compare them with the 10 prefix.
    final int prefix = b & 0xC0;
    return prefix == 0x80;
}
/**
 * Tells whether a byte is the lead byte of a two-byte sequence, i.e.
 * whether it has the bit pattern 110xxxxx (0xC0 to 0xDF).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's the first byte of a two-byte sequence.
 */
private static boolean isTwoBytesSequence(byte b) {
    // Keep only the three high-order bits and compare them with 110.
    final int prefix = b & 0xE0;
    return prefix == 0xC0;
}
/**
 * Tells whether a byte is the lead byte of a three-byte sequence, i.e.
 * whether it has the bit pattern 1110xxxx (0xE0 to 0xEF).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's the first byte of a three-byte sequence.
 */
private static boolean isThreeBytesSequence(byte b) {
    // Keep only the four high-order bits and compare them with 1110.
    final int prefix = b & 0xF0;
    return prefix == 0xE0;
}
/**
 * Tells whether a byte is the lead byte of a four-byte sequence, i.e.
 * whether it has the bit pattern 11110xxx (0xF0 to 0xF7).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's the first byte of a four-byte sequence.
 */
private static boolean isFourBytesSequence(byte b) {
    // Keep only the five high-order bits and compare them with 11110.
    final int prefix = b & 0xF8;
    return prefix == 0xF0;
}
/**
 * Tells whether a byte is the lead byte of a five-byte sequence, i.e.
 * whether it has the bit pattern 111110xx (0xF8 to 0xFB).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's the first byte of a five-byte sequence.
 */
private static boolean isFiveBytesSequence(byte b) {
    // Keep only the six high-order bits and compare them with 111110.
    final int prefix = b & 0xFC;
    return prefix == 0xF8;
}
/**
 * Tells whether a byte is the lead byte of a six-byte sequence, i.e.
 * whether it has the bit pattern 1111110x (0xFC or 0xFD).
 *
 * @param b
 *            the byte to examine.
 * @return true if it's the first byte of a six-byte sequence.
 */
private static boolean isSixBytesSequence(byte b) {
    // Keep only the seven high-order bits and compare them with 1111110.
    final int prefix = b & 0xFE;
    return prefix == 0xFC;
}
/**
 * Retrieve the default charset of the system.
 * <p>
 * The charset named by the <code>file.encoding</code> system property is
 * used when that property is set and names a supported charset; otherwise
 * the JVM's own default charset is returned instead of throwing.
 *
 * @return the default <code>Charset</code>, never null.
 */
public static Charset getDefaultSystemCharset() {
    String name = System.getProperty("file.encoding");
    if (name != null) {
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ignored) {
            // Illegal or unsupported charset name in the property: fall
            // through to the JVM default rather than failing the caller.
        }
    }
    return Charset.defaultCharset();
}
/**
 * Tells whether a buffer starts with the UTF-8 Byte Order Marker
 * (0xEF 0xBB 0xBF), as written by Microsoft's Notepad and other editors.
 *
 * @param bom
 *            a buffer; must not be null.
 * @return true if the buffer has a BOM for UTF-8, false otherwise,
 *         including when the buffer holds fewer than three bytes.
 */
private static boolean hasUTF8Bom(byte[] bom) {
    // A buffer too short to hold the marker cannot start with it.
    if (bom.length < 3) {
        return false;
    }
    return bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF;
}
/**
 * Tells whether a buffer starts with the UTF-16 little-endian Byte Order
 * Marker (0xFF 0xFE).
 *
 * @param bom
 *            a buffer; must not be null.
 * @return true if the buffer has a BOM for UTF-16 little-endian, false
 *         otherwise, including when the buffer holds fewer than two bytes.
 */
private static boolean hasUTF16LEBom(byte[] bom) {
    // A buffer too short to hold the marker cannot start with it.
    if (bom.length < 2) {
        return false;
    }
    return bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE;
}
/**
 * Tells whether a buffer starts with the UTF-16 big-endian Byte Order
 * Marker (0xFE 0xFF).
 *
 * @param bom
 *            a buffer; must not be null.
 * @return true if the buffer has a BOM for UTF-16 big-endian, false
 *         otherwise, including when the buffer holds fewer than two bytes.
 */
private static boolean hasUTF16BEBom(byte[] bom) {
    // A buffer too short to hold the marker cannot start with it.
    if (bom.length < 2) {
        return false;
    }
    return bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF;
}
/**
 * Retrieves all the available <code>Charset</code>s on the platform,
 * among which the default <code>charset</code>.
 *
 * @return a newly allocated array holding every charset reported by
 *         <code>Charset.availableCharsets()</code>.
 */
public static Charset[] getAvailableCharsets() {
    // Typed collection: no raw type and no unchecked cast on the result.
    Collection<Charset> charsets = Charset.availableCharsets().values();
    return charsets.toArray(new Charset[charsets.size()]);
}
/**
 * Creates the directory named by the given path, including any missing
 * parent directories.
 *
 * @param path
 *            the path of the directory to create.
 * @return the result of <code>File.mkdirs()</code> for that path: true
 *         only if the directory was created by this call.
 */
public static boolean createDirectory(String path) {
    final File target = new File(path);
    return target.mkdirs();
}
/**
 * Command-line entry point: guesses the charset of the file named by the
 * first argument, prints it to standard error, and writes a UTF-8 copy of
 * the file, with every line terminated by CR LF, into a "decoded"
 * sub-directory next to the source file. Nothing is written when the
 * destination path would contain "\decoded\decoded\", i.e. when the source
 * already sits in a "decoded" directory.
 *
 * @param args
 *            command-line arguments; <code>args[0]</code> is the path of
 *            the file to convert.
 * @throws FileNotFoundException
 *             if the source or destination file cannot be opened.
 * @throws IOException
 *             if reading or writing fails.
 */
public static void main(String[] args) throws FileNotFoundException,
        IOException {
    if (args.length < 1) {
        System.err.println("Usage: CharsetToolkit <file>");
        return;
    }
    File file = new File(args[0]);
    Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
    System.err.println("Charset found: " + guessedCharset.displayName());

    // getParent() is null for a bare file name; resolve it against the
    // working directory so the path does not start with the text "null".
    String parent = file.getParent();
    if (parent == null) {
        parent = file.getAbsoluteFile().getParent();
    }
    // NOTE(review): the backslash separators are Windows-specific; they are
    // kept as-is so existing behaviour on Windows is unchanged.
    String dstfile = parent + "\\decoded\\";
    if (dstfile.lastIndexOf("\\decoded\\decoded\\") != -1) {
        // The source already lives in a "decoded" directory: do not
        // decode it a second time.
        return;
    }
    createDirectory(dstfile);

    BufferedReader reader = null;
    BufferedWriter writer = null;
    try {
        reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), guessedCharset));
        writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(dstfile + file.getName()), "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            writer.write(line);
            writer.write("\r\n");
        }
        writer.flush();
    } finally {
        // Close both streams even when reading or writing fails.
        try {
            if (writer != null) {
                writer.close();
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -