📄 charsettoolkit.java

📁 easyweb的使用
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
					// otherwise the following characteris is not a valid UTF-8
					// construct
					if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
						validU8Char = false;
					else
						i += 3;
				}
				// a five-bytes sequence was encoutered
				else if (isFiveBytesSequence(b0)) {
					// there must be four continuation bytes of the form
					// 10xxxxxx,
					// otherwise the following characteris is not a valid UTF-8
					// construct
					if (!(isContinuationChar(b1) && isContinuationChar(b2)
							&& isContinuationChar(b3) && isContinuationChar(b4)))
						validU8Char = false;
					else
						i += 4;
				}
				// a six-bytes sequence was encoutered
				else if (isSixBytesSequence(b0)) {
					// there must be five continuation bytes of the form
					// 10xxxxxx,
					// otherwise the following characteris is not a valid UTF-8
					// construct
					if (!(isContinuationChar(b1) && isContinuationChar(b2)
							&& isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5)))
						validU8Char = false;
					else
						i += 5;
				} else
					validU8Char = false;
			}
			if (!validU8Char)
				break;
			i++;
		}
		// if no byte with an high order bit set, the encoding is US-ASCII
		// (it might have been UTF-7, but this encoding is usually internally
		// used only by mail systems)
		if (!highOrderBit) {
			// returns the default charset rather than US-ASCII if the
			// enforce8Bit flag is set.
			if (this.enforce8Bit)
				return this.defaultCharset;
			else
				return Charset.forName("US-ASCII");
		}
		// if no invalid UTF-8 were encountered, we can assume the encoding is
		// UTF-8,
		// otherwise the file would not be human readable
		if (validU8Char)
			return Charset.forName("UTF-8");
		// finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is
		// the default encoding
		return this.defaultCharset;
	}

	/**
	 * Guesses the encoding of a file from its leading bytes, using the system
	 * default charset as the fallback charset of the detector.
	 * 
	 * @param f
	 *            the file to inspect.
	 * @param bufferLength
	 *            the number of leading bytes to read from the file; the
	 *            buffer handed to the detector always has this length, and
	 *            positions past the end of a shorter file stay zero.
	 * @return the charset guessed by the toolkit.
	 * @throws FileNotFoundException
	 *             if the file cannot be opened for reading.
	 * @throws IOException
	 *             if reading the file fails.
	 */
	public static Charset guessEncoding(File f, int bufferLength)
			throws FileNotFoundException, IOException {
		byte[] buffer = new byte[bufferLength];
		FileInputStream fis = new FileInputStream(f);
		try {
			// A single read() is allowed to return fewer bytes than requested,
			// so keep reading until the buffer is full or the file ends.
			int filled = 0;
			while (filled < buffer.length) {
				int count = fis.read(buffer, filled, buffer.length - filled);
				if (count < 0)
					break;
				filled += count;
			}
		} finally {
			// release the file handle even when reading fails
			fis.close();
		}
		CharsetToolkit toolkit = new CharsetToolkit(buffer);
		toolkit.setDefaultCharset(getDefaultSystemCharset());
		return toolkit.guessEncoding();
	}

	/**
	 * Guesses the encoding of a file from its leading bytes, using the given
	 * charset as the fallback charset of the detector.
	 * 
	 * @param f
	 *            the file to inspect.
	 * @param bufferLength
	 *            the number of leading bytes to read from the file; the
	 *            buffer handed to the detector always has this length, and
	 *            positions past the end of a shorter file stay zero.
	 * @param defaultCharset
	 *            the charset set on the toolkit as its default charset.
	 * @return the charset guessed by the toolkit.
	 * @throws FileNotFoundException
	 *             if the file cannot be opened for reading.
	 * @throws IOException
	 *             if reading the file fails.
	 */
	public static Charset guessEncoding(File f, int bufferLength,
			Charset defaultCharset) throws FileNotFoundException, IOException {
		byte[] buffer = new byte[bufferLength];
		FileInputStream fis = new FileInputStream(f);
		try {
			// A single read() is allowed to return fewer bytes than requested,
			// so keep reading until the buffer is full or the file ends.
			int filled = 0;
			while (filled < buffer.length) {
				int count = fis.read(buffer, filled, buffer.length - filled);
				if (count < 0)
					break;
				filled += count;
			}
		} finally {
			// release the file handle even when reading fails
			fis.close();
		}
		CharsetToolkit toolkit = new CharsetToolkit(buffer);
		toolkit.setDefaultCharset(defaultCharset);
		return toolkit.guessEncoding();
	}

	/**
	 * Tells whether a byte is a UTF-8 continuation byte, i.e. has the bit
	 * pattern 10xxxxxx (0x80 to 0xBF).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's a continuation char.
	 */
	private static boolean isContinuationChar(byte b) {
		// keep the two high-order bits and compare them with "10"
		return (b & 0xC0) == 0x80;
	}

	/**
	 * Tells whether a byte is the lead byte of a two-bytes UTF-8 sequence,
	 * i.e. has the bit pattern 110xxxxx (0xC0 to 0xDF).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's the first byte of a two-bytes sequence.
	 */
	private static boolean isTwoBytesSequence(byte b) {
		// keep the three high-order bits and compare them with "110"
		return (b & 0xE0) == 0xC0;
	}

	/**
	 * Tells whether a byte is the lead byte of a three-bytes UTF-8 sequence,
	 * i.e. has the bit pattern 1110xxxx (0xE0 to 0xEF).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's the first byte of a three-bytes sequence.
	 */
	private static boolean isThreeBytesSequence(byte b) {
		// keep the four high-order bits and compare them with "1110"
		return (b & 0xF0) == 0xE0;
	}

	/**
	 * Tells whether a byte is the lead byte of a four-bytes UTF-8 sequence,
	 * i.e. has the bit pattern 11110xxx (0xF0 to 0xF7).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's the first byte of a four-bytes sequence.
	 */
	private static boolean isFourBytesSequence(byte b) {
		// keep the five high-order bits and compare them with "11110"
		return (b & 0xF8) == 0xF0;
	}

	/**
	 * Tells whether a byte is the lead byte of a five-bytes sequence, i.e.
	 * has the bit pattern 111110xx (0xF8 to 0xFB).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's the first byte of a five-bytes sequence.
	 */
	private static boolean isFiveBytesSequence(byte b) {
		// keep the six high-order bits and compare them with "111110"
		return (b & 0xFC) == 0xF8;
	}

	/**
	 * Tells whether a byte is the lead byte of a six-bytes sequence, i.e. has
	 * the bit pattern 1111110x (0xFC or 0xFD).
	 * 
	 * @param b
	 *            a byte.
	 * @return true if it's the first byte of a six-bytes sequence.
	 */
	private static boolean isSixBytesSequence(byte b) {
		// keep the seven high-order bits and compare them with "1111110"
		return (b & 0xFE) == 0xFC;
	}

	/**
	 * Retrieve the default charset of the system, as named by the
	 * <code>file.encoding</code> system property.
	 * <p>
	 * If the property is not set, or names a charset that is illegal or not
	 * supported by this JVM, the JVM's own default charset is returned instead
	 * of letting <code>Charset.forName</code> throw.
	 * 
	 * @return the default <code>Charset</code>, never <code>null</code>.
	 */
	public static Charset getDefaultSystemCharset() {
		String name = System.getProperty("file.encoding");
		if (name == null)
			return Charset.defaultCharset();
		try {
			return Charset.forName(name);
		} catch (IllegalArgumentException e) {
			// IllegalCharsetNameException and UnsupportedCharsetException are
			// both IllegalArgumentExceptions: the property value is unusable.
			return Charset.defaultCharset();
		}
	}

	/**
	 * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other
	 * editors), i.e. starts with the bytes EF BB BF.
	 * 
	 * @param bom
	 *            a buffer.
	 * @return true if the buffer has a BOM for UTF8; false otherwise,
	 *         including when the buffer holds fewer than three bytes.
	 */
	private static boolean hasUTF8Bom(byte[] bom) {
		// guard the length first so short buffers do not raise
		// ArrayIndexOutOfBoundsException
		return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
	}

	/**
	 * Has a Byte Order Marker for UTF-16 Little Endian (ucs-2le, ucs-4le, and
	 * ucs-16le), i.e. starts with the bytes FF FE.
	 * 
	 * @param bom
	 *            a buffer.
	 * @return true if the buffer has a BOM for UTF-16 Little Endian; false
	 *         otherwise, including when the buffer holds fewer than two bytes.
	 */
	private static boolean hasUTF16LEBom(byte[] bom) {
		// guard the length first so short buffers do not raise
		// ArrayIndexOutOfBoundsException
		return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
	}

	/**
	 * Has a Byte Order Marker for UTF-16 Big Endian (utf-16 and ucs-2), i.e.
	 * starts with the bytes FE FF.
	 * 
	 * @param bom
	 *            a buffer.
	 * @return true if the buffer has a BOM for UTF-16 Big Endian; false
	 *         otherwise, including when the buffer holds fewer than two bytes.
	 */
	private static boolean hasUTF16BEBom(byte[] bom) {
		// guard the length first so short buffers do not raise
		// ArrayIndexOutOfBoundsException
		return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
	}

	/**
	 * Retrieves all the available <code>Charset</code>s on the platform,
	 * among which the default <code>charset</code>.
	 * 
	 * @return an array of <code>Charset</code>s, one per entry of
	 *         <code>Charset.availableCharsets()</code>.
	 */
	public static Charset[] getAvailableCharsets() {
		// typed collection: no raw type and no unchecked cast of the array
		Collection<Charset> charsets = Charset.availableCharsets().values();
		return charsets.toArray(new Charset[charsets.size()]);
	}

	/**
	 * Creates the directory named by the given path, including any missing
	 * parent directories.
	 * 
	 * @param path
	 *            the path of the directory to create.
	 * @return the result of <code>File.mkdirs()</code>: true only if the
	 *         directory was created by this call, so false is also returned
	 *         when the directory already exists.
	 */
	public static boolean createDirectory(String path) {
		return new File(path).mkdirs();
	}

	/**
	 * Command-line entry point: guesses the encoding of the file named by
	 * <code>args[0]</code> from its first 4096 bytes, prints the guess on
	 * standard error, and writes a UTF-8 copy of the file into a
	 * <code>decoded</code> sub-directory located next to the original. Each
	 * line of the copy is terminated with CR LF.
	 * <p>
	 * Files are re-written even when the guessed charset is already UTF-8.
	 * A file whose destination path would contain two consecutive
	 * <code>decoded</code> directories is skipped, so that output of a
	 * previous run is not decoded again.
	 * 
	 * @param args
	 *            command-line arguments; the first one is the path of the
	 *            file to convert.
	 * @throws FileNotFoundException
	 *             if the source or destination file cannot be opened.
	 * @throws IOException
	 *             if reading or writing fails.
	 */
	public static void main(String[] args) throws FileNotFoundException,
			IOException {
		if (args.length < 1) {
			System.err.println("Usage: CharsetToolkit <file>");
			return;
		}
		File file = new File(args[0]);
		Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
		System.err.println("Charset found: " + guessedCharset.displayName());

		// File(File, String) accepts a null parent, so a bare file name maps
		// to a "decoded" directory relative to the working directory instead
		// of a path starting with the literal text "null".
		File dstDir = new File(file.getParentFile(), "decoded");
		String dstPath = dstDir.getPath() + File.separator;
		String nestedMarker = File.separator + "decoded" + File.separator
				+ "decoded" + File.separator;
		if (dstPath.lastIndexOf(nestedMarker) != -1)
			return;

		File target = new File(dstDir, file.getName());
		createDirectory(dstDir.getPath());

		// The source is only opened once we know it will be copied, and both
		// streams are closed even if the copy fails half-way.
		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(file), guessedCharset));
		try {
			BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(target), "UTF-8"));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					bw.write(line);
					bw.write("\r\n");
				}
				bw.flush();
			} finally {
				bw.close();
			}
		} finally {
			br.close();
		}
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -