📄 text.java
字号:
} // end of the preceding member (its start is outside this excerpt)

  static {
    // register this comparator
    WritableComparator.define(Text.class, new Comparator());
  }

  /// STATIC UTILITIES FROM HERE DOWN

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If the input is malformed,
   * replace by a default value.
   *
   * @param utf8 the bytes to decode
   * @return the decoded string
   * @throws CharacterCodingException declared by the underlying decoder
   */
  public static String decode(byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }

  /**
   * Converts a slice of the provided byte array to a String using the
   * UTF-8 encoding, replacing malformed input (delegates with
   * <code>replace == true</code>).
   *
   * @param utf8 the array holding the bytes to decode
   * @param start offset of the first byte to decode
   * @param length number of bytes to decode
   * @return the decoded string
   * @throws CharacterCodingException declared by the underlying decoder
   */
  public static String decode(byte[] utf8, int start, int length)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   *
   * @param utf8 the array holding the bytes to decode
   * @param start offset of the first byte to decode
   * @param length number of bytes to decode
   * @param replace whether malformed input is replaced instead of reported
   * @return the decoded string
   * @throws CharacterCodingException if <code>replace</code> is false and
   *         the input cannot be decoded
   */
  public static String decode(byte[] utf8, int start, int length,
                              boolean replace)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }

  /**
   * Decodes the buffer with the shared DECODER while holding its monitor.
   * When <code>replace</code> is true the decoder is temporarily switched
   * to REPLACE and is always switched back to REPORT afterwards, even if
   * decoding throws, so later strict callers still get errors reported.
   */
  private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
    synchronized (DECODER) {
      if (replace) {
        DECODER.onMalformedInput(CodingErrorAction.REPLACE);
        DECODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
      }
      try {
        return DECODER.decode(utf8).toString();
      } finally {
        // set decoder back to its default value: REPORT
        if (replace) {
          DECODER.onMalformedInput(CodingErrorAction.REPORT);
          DECODER.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
      }
    }
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If the input is malformed,
   * invalid chars are replaced by a default value.
   *
   * @param string the text to encode
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   * @throws CharacterCodingException declared by the underlying encoder
   */
  public static ByteBuffer encode(String string)
    throws CharacterCodingException {
    return encode(string, true);
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   *
   * <p>The shared ENCODER is always switched back to REPORT before this
   * method exits, including when it exits with an exception (for example
   * a NullPointerException for a null <code>string</code>).
   *
   * @param string the text to encode
   * @param replace whether malformed input is replaced instead of reported
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   * @throws CharacterCodingException if <code>replace</code> is false and
   *         the input cannot be encoded
   */
  public static ByteBuffer encode(String string, boolean replace)
    throws CharacterCodingException {
    synchronized (ENCODER) {
      if (replace) {
        ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
        ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
      }
      try {
        return ENCODER.encode(CharBuffer.wrap(string.toCharArray()));
      } finally {
        // set encoder back to REPORT even when encoding failed
        if (replace) {
          ENCODER.onMalformedInput(CodingErrorAction.REPORT);
          ENCODER.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
      }
    }
  }

  /**
   * Read a UTF8 encoded string from in.
   * The string is expected as a vint byte count followed by that many bytes.
   *
   * @param in the input to read from
   * @return the decoded string
   * @throws IOException if reading or decoding fails
   */
  public static String readString(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    byte [] bytes = new byte[length];
    in.readFully(bytes, 0, length);
    return decode(bytes);
  }

  /**
   * Write a UTF8 encoded string to out.
   * The string is written as a vint byte count followed by the bytes.
   *
   * @param out the output to write to
   * @param s the string to write
   * @return the number of encoded bytes written (excluding the vint prefix)
   * @throws IOException if encoding or writing fails
   */
  public static int writeString(DataOutput out, String s) throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  ////// states for validateUTF8

  /** Expecting the first byte of a character. */
  private static final int LEAD_BYTE = 0;

  /** Expecting the first trail byte, which has lead-byte-specific limits. */
  private static final int TRAIL_BYTE_1 = 1;

  /** Expecting any subsequent trail byte. */
  private static final int TRAIL_BYTE = 2;

  /**
   * Check if a byte array contains valid utf-8
   * @param utf8 byte array
   * @exception MalformedInputException if the byte array contains invalid utf-8
   */
  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }

  /**
   * Check to see if a byte array is valid utf-8.
   *
   * <p>Only the bytes inside the range are examined; a multi-byte sequence
   * that is cut short by the end of the range is not reported.
   *
   * @param utf8 the array of bytes
   * @param start the offset of the first byte in the array
   * @param len the length of the byte sequence
   * @throws MalformedInputException if the byte array contains invalid bytes;
   *         the exception carries the index of the offending byte
   */
  public static void validateUTF8(byte[] utf8, int start, int len)
    throws MalformedInputException {
    int count = start;
    int leadByte = 0;
    int length = 0;
    int state = LEAD_BYTE;
    while (count < start+len) {
      int aByte = ((int) utf8[count] & 0xFF);

      switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        length = bytesFromUTF8[aByte];

        switch (length) {
        case 0: // check for ASCII
          if (leadByte > 0x7F)
            throw new MalformedInputException(count);
          break;
        case 1:
          if (leadByte < 0xC2 || leadByte > 0xDF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 2:
          if (leadByte < 0xE0 || leadByte > 0xEF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 3:
          if (leadByte < 0xF0 || leadByte > 0xF4)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        default:
          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
          // or if < 0 we got a trail byte in the lead byte position
          throw new MalformedInputException(count);
        } // switch (length)
        break;

      case TRAIL_BYTE_1:
        if (leadByte == 0xF0 && aByte < 0x90)
          throw new MalformedInputException(count);
        if (leadByte == 0xF4 && aByte > 0x8F)
          throw new MalformedInputException(count);
        if (leadByte == 0xE0 && aByte < 0xA0)
          throw new MalformedInputException(count);
        if (leadByte == 0xED && aByte > 0x9F)
          throw new MalformedInputException(count);
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF)
          throw new MalformedInputException(count);
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      } // switch (state)
      count++;
    }
  }

  /**
   * Magic numbers for UTF-8. These are the number of bytes
   * that <em>follow</em> a given lead byte. Trailing bytes
   * have the value -1. The values 4 and 5 are presented in
   * this table, even though valid UTF-8 cannot include the
   * five and six byte sequences.
   */
  static final int[] bytesFromUTF8 =
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

  /**
   * Returns the next code point at the current position in
   * the buffer. The buffer's position will be incremented.
   * Any mark set on this buffer will be changed by this method!
   *
   * @param bytes the buffer to read from
   * @return the decoded value, or -1 if the byte at the current position is
   *         a trail byte (in which case the position is left unchanged)
   */
  public static int bytesToCodePoint(ByteBuffer bytes) {
    // peek at the lead byte without consuming it
    bytes.mark();
    byte b = bytes.get();
    bytes.reset();
    int extraBytesToRead = bytesFromUTF8[(int)(b & 0xFF)];
    if (extraBytesToRead < 0) return -1; // trailing byte!
    int ch = 0;

    // every case deliberately falls through: one byte is folded in per case
    switch (extraBytesToRead) {
    case 5: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 4: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 3: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 2: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 1: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 0: ch += (int)(bytes.get() & 0xFF);
    }
    // remove the accumulated lead/trail marker bits for this sequence length
    ch -= offsetsFromUTF8[extraBytesToRead];

    return ch;
  }

  /**
   * Values subtracted by {@link #bytesToCodePoint(ByteBuffer)} after the raw
   * bytes have been accumulated, indexed by the number of extra bytes.
   */
  static final int offsetsFromUTF8[] =
  { 0x00000000, 0x00003080,
    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

  /**
   * For the given string, returns the number of UTF-8 bytes
   * required to encode the string.
   *
   * <p>A valid surrogate pair counts as 4 bytes; an unpaired surrogate
   * counts as 3 bytes. The string is walked by index, so a U+FFFF
   * character inside the string is counted like any other character
   * instead of being mistaken for an end-of-text sentinel.
   *
   * @param string text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(String string) {
    final int end = string.length();
    int size = 0;
    for (int i = 0; i < end; i++) {
      char ch = string.charAt(i);
      if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // high surrogate: look ahead for its low-surrogate partner
        boolean paired = false;
        if (i + 1 < end) {
          char trail = string.charAt(i + 1);
          paired = (trail > 0xDBFF) && (trail < 0xE000);
        }
        if (paired) {
          // valid pair: one supplementary code point, consume both chars
          size += 4;
          i++;
        } else {
          // invalid pair: count the lone surrogate, re-examine the next char
          size += 3;
        }
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
    }
    return size;
  }
} // end of enclosing class (its header is outside this excerpt)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -