📄 utf8utf16.java
字号:
// lower 4 bits become higher 4 bits
result[n] = (byte) ((b0 << 4) & 0xff);
//#ifdef debug
//# if (DEBUG) System.out.println("-- 3b : half result[n] = " + result[n]);
//#endif
// 10xx xxyy : get the xx xx 4 bits, as lower 4
result[n] |= (byte) (((b1 & 0x3c) >> 2) & 0xff); // 3c = 0011 1100
//#ifdef debug
//# if (DEBUG) System.out.println("-- 3b : result[n] = " + result[n]);
//#endif
// b1 lower 2 bits become higher 2 bits
result[n+1] = (byte) (((b1 & 0x3) << 6) & 0xff);
// b2 lower 6 bits, just take it as lower 6 bits
result[n+1] |= (byte) ((b2 & 0x3f) & 0xff); // 0x3f = 0011 1111 (bin)
n+=2; // ucs-2 always +2
i+=2;
continue;
}
// 2 bytes model : 0x80 - 0x7ff (special char)
if ((utf8[i] & mask1) == mask1)
{
b0 = utf8[i];
b1 = utf8[i+1];
// take 3 bits from b0 (110x xxyy , take xxx)
result[n] = (byte) ( (b0>>2) & 0x7 ); // 0x7 = 111
result[n+1] = (byte) (((b0 & 0x3) << 6) & 0xff );
result[n+1] |= (byte) (b1 & 0x3f);
n+=2; // ucs-2 always +2
i+=1;
continue;
}
// sanity check
if ((utf8[i] & mask0) != 0) {
//#ifdef debug
//# System.err.println("error encoding");
//#endif
break; // just skip
}
result[n] = 0; // filled up MSB = 0
result[n+1] = utf8[i]; // as-is
n+=2; // always + 2
}
byte[] ucs2 = new byte[n];
System.arraycopy(result, 0, ucs2, 0, n);
return ucs2;
}
//
// size=16
// 61 2e 20 e6 97 a0 e5 80 be e8 a7 92 e7 9a 84 3b
// Encoding=UTF-8 : a. 无倾角的;
//
// UTF-16 size=16
// 00 61 00 2e 00 20 65 e0 50 3e 89 d2 76 84 00 3b
//
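// 65e0 703e a9d2 7684
// NOTE(review): the line above disagrees with the UTF-16 bytes listed earlier
// (65e0 503e 89d2 7684). Both mismatching units carry an extra 0x20 bit in the
// high byte, which looks like output from a version that did not mask the
// continuation byte before shifting -- confirm, or correct to 503e / 89d2.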
//
//
// \u65e0\u503e\u89d2\u7684
//
// \u65e0\u503e\u89d2\u7684
//#ifdef debug
//# static boolean test_utf8_to_ucs2()
//# {
//# // 61 2e 20 e6 97 a0 e5 80 be e8 a7 92 e7 9a 84 3b
//# byte utf8[] = {
//# (byte)0x61, (byte)0x2e, (byte)0x20, (byte)0xe6, (byte)0x97
//# ,(byte)0xa0, (byte)0xe5, (byte)0x80, (byte)0xbe, (byte)0xe8
//# ,(byte)0xa7, (byte)0x92, (byte)0xe7, (byte)0x9a, (byte)0x84
//# ,(byte)0x3b
//# };
//#
//# byte ucs2[] = new byte[0];
//# String str_utf8, str_ucs2;
//# try {
//# ucs2 = utf8_to_ucs2(utf8);
//# } catch (Exception e) {
//# System.err.println("ex: " + e);
//# }
//#
//#ifdef debug
//# printArray(utf8, 0, utf8.length, "UTF-8");
//#endif
//#
//# // printArray(ucs2, 0, ucs2.length, "UCS-2");
//# // // UCS-2 : java1.4 UnsupportedEncoding
//#
//#ifdef debug
//# printArray(ucs2, 0, ucs2.length, "UTF-16"); // UTF-16 = UCS-2
//#endif
//#
//# try {
//# str_utf8 = new String(utf8, 0, utf8.length, "UTF-8");
//# str_ucs2 = new String(ucs2, 0, ucs2.length, "UTF-16");
//#
//# return str_utf8.equals(str_ucs2);
//# } catch (Exception e) {
//# System.err.println("test_utf8_to_ucs2 ex2: " + e);
//# return false;
//# }
//#
//# // return false;
//# }
//#endif
// size=16 Encoding=UTF-16
// fe ff 4f 60 59 7d 00 61 00 62 00 63 59 1f 4e 86
// 你好abc够了
//
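// 1111 1110 1111 1111 (fe ff : the first two bytes of the 你好 sample above, i.e. the UTF-16 byte-order mark, not the characters themselves)
//
// case c:
// 0x800 to 0xffff : 1110 xxxx 10xx xxxx 10xx xxxx (16 bits)
//
// 1110 xxxx 10xx xxxx 10xx xxxx (16 bits)
// 1111 11 1011 11 1111 (fe ff regrouped as 4 + 6 + 6 bits; 你好 sample, UCS-2 or UTF-16)
// ==
//
// 1110 1111 1011 1011 1011 1111 (fe ff encoded as UTF-8; 你好 sample)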
// E F B B B F
// EF BB BF
//
//#ifdef debug
//# static boolean test_ucs2_to_utf8()
//# {
//# String str = "你好abc够了";
//# String utf8str ;
//# byte ucs2[], utf8[] ;
//#
//# try {
//# ucs2 = str.getBytes("UTF-16");
//# utf8 = ucs2_to_utf8(ucs2);
//#
//#ifdef debug
//# printArray(ucs2, 0, ucs2.length, "UTF-16");
//# printArray(utf8, 0, utf8.length, "UTF-8");
//#endif
//#
//# utf8str = new String(utf8, "UTF-8");
//#ifdef debug
//# if (DEBUG) System.out.println(str + " == " + utf8str);
//#endif
//# return str.equals(utf8str);
//# } catch (Exception e) {
//# System.err.println( "test_ucs2_to_utf8 ex: " + e);
//# return false;
//# }
//# }
//#endif
/**
* do not remove, it's a useful reference.
*
* special note: when converting a byte to an int, we want to
* preserve the "bit pattern" rather than the numeric value.
* e.g. when byte a = 0x80 (-128), its bit pattern is 1000 0000,
* but int -128 has a different bit pattern, so the int that
* carries the same bit pattern has the value +128
* byte(-128) = 1000 0000
* int(128) = 0000 0000 0000 0000 0000 0000 1000 0000
* int(-128) = 1111 1111 1111 1111 1111 1111 1000 0000
*
*/
/*public static boolean testByteToInt()
{
byte a = (byte)0x80;
byte b = (byte)0xff;
int aint = a & 0xff; // using & 0xff to convert to int
int bint = b & 0xff; // important for bit-wise operation
int awrong = a ;
int bwrong = b ;
if (DEBUG) {
System.out.println("a=" + a + " b=" + b);
System.out.println("aint=" + aint + " bint=" + bint);
System.out.println("awrong=" + awrong + " bwrong=" + bwrong);
}
return (a & 0xff) == (aint & 0xff);
}*/
/* static void printArray(byte array[], int offset, int size, String encoding)
{
String str;
int last = offset + size;
int n;
if (last > array.length) last = array.length;
System.out.println("size=" + size + " Encoding=" + encoding );
for(int i=offset; i<last; i++) {
n = array[i] & 0xff; // a simple & can avoid negative issue
// n = (array[i]>=0) ? (array[i]) : (array[i] + 256); // work
// n = array[i]; // this is buggy
str = Integer.toHexString(n);
if (str.length() < 2) {
str = "0" + str;
}
System.out.print(" " + str);
}
try {
System.out.println("\n"
+ (new String(array, offset, size, encoding)));
} catch (Exception e)
{
System.err.println("printArray new String ex: " + e);
}
}
*/
// turn on DEBUG for more debug messages
//#ifdef debug
//# public final static boolean DEBUG = false;
//#endif
/* public static void main(String arg[])
{
System.out.println("testBytetoInt(): " + testByteToInt());
System.out.println("");
System.out.println("test_utf8_to_ucs2(): " + test_utf8_to_ucs2());
System.out.println("");
System.out.println("test_ucs2_to_utf8(): " + test_ucs2_to_utf8());
System.out.println("");
}*/
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -