📄 utf8.java
字号:
for (int i=b1.length; --i>=0; ) { if (b1[i] != b2[i+startIndex]) return false; } return true; } public static boolean NO_NEW = false; // Helper function. private static int addToTable_helper(byte[] b, int hash, int[] chain, int index) { if (NO_NEW) { Assert.UNREACHABLE("Trying to add Utf8 "+fromUtf8(b)); } if (++size == table.length) growTable_helper(); if (!checkUtf8(b)) { fromUtf8(b); // fromUtf8 has more informative error messages. Assert.UNREACHABLE(); } table[size] = new Utf8(b, hash); chain[index] = size+1; if (TRACE) System.out.println("allocated new Utf8: "+table[size]); return size; } // Helper function. private static void growTable_helper() { Utf8[] newtable = new Utf8[size<<1]; System.arraycopy(table, 0, newtable, 0, size); table = newtable; } /** Private constructor. Use the get() method to create a Utf8 object. */ private Utf8(byte[] data, int hash) { this.data = data; this.hash = hash; if (DEBUG) cache = fromUtf8(data); } private byte[] data; private int hash; public static int hashCode(byte[] data) { int h = 4999; int i=data.length; while(--i>=0) { h = 2999*h + data[i]; } return h; } public static int hashCode(byte[] data, int startIndex, int endIndex) { int h = 4999; int i=endIndex; while(--i>=startIndex) { h = 2999*h + data[i]; } return h; } public int hashCode() { return hash; } public static final boolean USE_CACHE = true; public static final boolean DEBUG = true; private String cache; public String toString() { if (USE_CACHE) { if (cache != null) return cache; return cache = fromUtf8(data); } else { return fromUtf8(data); } } public void dump(DataOutput out) throws IOException { Assert._assert(data.length <= Character.MAX_VALUE); out.writeChar(data.length); out.write(data); } public void debugWrite() { Debug.write(data, data.length); } //// Utf8 conversion routines /** * Strictly check the format of the utf8/pseudo-utf8 byte array in * fromUtf8. 
*/ static final boolean STRICTLY_CHECK_FORMAT = false; /** * Set fromUtf8 to not throw an exception when given a normal utf8 * byte array. */ static final boolean ALLOW_NORMAL_UTF8 = false; /** * Set fromUtf8 to not throw an exception when given a pseudo utf8 * byte array. */ static final boolean ALLOW_PSEUDO_UTF8 = true; /** * Set toUtf8 to write in pseudo-utf8 (rather than normal utf8). */ static final boolean WRITE_PSEUDO_UTF8 = true; /** * Convert the given sequence of (pseudo-)utf8 formatted bytes * into a String. * * The acceptable input formats are controlled by the * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 * flags. * * @param utf8 (pseudo-)utf8 byte array * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 * @return unicode string */ public static String fromUtf8(byte[] utf8) throws UTFDataFormatError { char[] result = new char[utf8.length]; int result_index = 0; for (int i=0, n=utf8.length; i<n; ) { byte b = utf8[i++]; if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) if (b == 0) throw new UTFDataFormatError("0 byte encountered at location "+(i-1)); if (b >= 0) { // < 0x80 unsigned // in the range '\001' to '\177' result[result_index++] = (char)b; continue; } try { byte nb = utf8[i++]; if (b < -32) { // < 0xe0 unsigned // '\000' or in the range '\200' to '\u07FF' char c = result[result_index++] = (char)(((b & 0x1f) << 6) | (nb & 0x3f)); if (STRICTLY_CHECK_FORMAT) { if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) throw new UTFDataFormatError("invalid marker bits for double byte char at location "+(i-2)); if (c < '\200') { if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) throw new UTFDataFormatError("encountered double byte char that should have been single byte at location "+(i-2)); } else if (c > '\u07FF') throw new UTFDataFormatError("encountered double byte char that should have been triple byte at location "+(i-2)); } } else { byte nnb = utf8[i++]; // in the range '\u0800' to '\uFFFF' char c = 
result[result_index++] = (char)(((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); if (STRICTLY_CHECK_FORMAT) { if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) throw new UTFDataFormatError("invalid marker bits for triple byte char at location "+(i-3)); if (c < '\u0800') throw new UTFDataFormatError("encountered triple byte char that should have been fewer bytes at location "+(i-3)); } } } catch (ArrayIndexOutOfBoundsException e) { throw new UTFDataFormatError("unexpected end at location "+i); } } return new String(result, 0, result_index); } /** * Convert the given String into a sequence of (pseudo-)utf8 * formatted bytes. * * The output format is controlled by the WRITE_PSEUDO_UTF8 flag. * * @param s String to convert * @return array containing sequence of (pseudo-)utf8 formatted bytes */ public static byte[] toUtf8(String s) { byte[] result = new byte[lengthUtf8(s)]; int result_index = 0; for (int i = 0, n = s.length(); i < n; ++i) { char c = (char)s.charAt(i); // in all shifts below, c is an (unsigned) char, // so either >>> or >> is ok if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) result[result_index++] = (byte)c; else if (c > 0x07FF) { result[result_index++] = (byte)(0xe0 | (byte)(c >> 12)); result[result_index++] = (byte)(0x80 | ((c & 0xfc0) >> 6)); result[result_index++] = (byte)(0x80 | (c & 0x3f)); } else { result[result_index++] = (byte)(0xc0 | (byte)(c >> 6)); result[result_index++] = (byte)(0x80 | (c & 0x3f)); } } return result; } /** * Returns the length of a string's utf8 encoded form. */ public static int lengthUtf8(String s) { int utflen = 0; for (int i = 0, n = s.length(); i < n; ++i) { int c = s.charAt(i); if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) ++utflen; else if (c > 0x07FF) utflen += 3; else utflen += 2; } return utflen; } /** * Check whether the given sequence of bytes is valid (pseudo-)utf8. 
*
     * A sequence that ends in the middle of a multi-byte character is
     * always reported as invalid. Zero bytes, marker bits and over-long
     * encodings are inspected only when STRICTLY_CHECK_FORMAT is set.
     *
     * @param bytes byte array to check
     * @return true iff the given sequence is valid (pseudo-)utf8.
     */
    public static boolean checkUtf8(byte[] bytes) {
        final int n = bytes.length;
        for (int i = 0; i < n; ) {
            byte b = bytes[i++];
            if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8 && b == 0)
                return false;
            if (b >= 0) { // < 0x80 unsigned
                // in the range '\001' to '\177'
                continue;
            }

            // A multi-byte character needs at least one continuation byte.
            if (i >= n)
                return false;
            byte nb = bytes[i++];

            if (b < -32) { // < 0xe0 unsigned
                // '\000' or in the range '\200' to '\u07FF'
                if (STRICTLY_CHECK_FORMAT) {
                    char c = (char)(((b & 0x1f) << 6) | (nb & 0x3f));
                    if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80))
                        return false;
                    if (c < '\200') {
                        // Only the two-byte form of '\000' is tolerated, and
                        // only when pseudo-utf8 input is allowed.
                        if (!ALLOW_PSEUDO_UTF8 || (c != '\000'))
                            return false;
                    } else if (c > '\u07FF')
                        return false;
                }
            } else {
                // The three-byte form needs a second continuation byte.
                if (i >= n)
                    return false;
                byte nnb = bytes[i++];
                // in the range '\u0800' to '\uFFFF'
                if (STRICTLY_CHECK_FORMAT) {
                    char c = (char)(((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
                    if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80))
                        return false;
                    if (c < '\u0800')
                        return false;
                }
            }
        }
        return true;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -