📄 utf8utf16.java
字号:
package wmlcparser;
/**
* a collection of conversion utility from UTF-8 to UTF-16,
* in byte-by-byte, bit-wise level, not by internal encoder/decoder
* of JDK.
*
* last update: 20070707
*
* tested on j2se 1.4.2_13
*
* @author peter peter _at_ gameislive.com
*/
import java.io.UnsupportedEncodingException;
class utf8utf16
{
/*
* About UTF-8, UCS-2
*
* UCS-2 : Unicode in a fixed 2-byte form, compact for Chinese characters;
* ASCII (1-byte characters, e.g. A-Z) must be prefixed with a 0x00 byte
* (big-endian style: MSB=[0] LSB=[1]).
* UTF-8 : 0x00 to 0x7F is represented directly as a single byte;
* for all other characters the following rule applies (worked example):
*
无 (nothing, none)
UTF-8 representation:
1110 0110 1001 0111 1010 0000
MSB : 1110 prefix mean 3 bytes
2nd bytes 1001 0111 -> remove first 2 bits as prefix (10)
3rd bytes 1010 0000 -> remove first 2 bits as prefix (10)
prefix removal:
1110 0110 1001 0111 1010 0000
0110 01 0111 10 0000
actual encoding:
= 0110 0101 1110 0000 (bin)
= 65 E0 (hex)
or in native2ascii:
\u65e0 (0 = zero)
*/
/**
* Convert a byte sequence from UCS-2 (UTF-16, big-endian) to UTF-8.
*
* 0x00 to 0x7f : 0xxx xxxx
* 0x80 to 0x7ff : 110x xxxx 10xx xxxx (11 bits)
* 0x800 to 0xffff : 1110 xxxx 10xx xxxx 10xx xxxx (16 bits)
*/
static byte [] ucs2_to_utf8(byte [] ucs2)
{
    /*
     * Converts big-endian UCS-2 code units (two bytes each, high byte
     * first) into UTF-8.
     *
     * ucs2    : input bytes; may be null. If the length is odd, the
     *           trailing byte is ignored.
     * returns : a newly allocated UTF-8 byte array trimmed to the exact
     *           output length, or null when the input is null.
     *
     * Every FE FF pair (byte order mark) is dropped, wherever it occurs.
     * Each 16-bit unit is encoded on its own, so surrogate pairs are NOT
     * merged into a single 4-byte UTF-8 sequence.
     */
    if (ucs2 == null) return null;

    final int maskb_high = 0xC0; // 110x xxxx : lead byte of a 2-byte sequence
    final int maska_high = 0xE0; // 1110 xxxx : lead byte of a 3-byte sequence
    final int mask_low = 0x80;   // 10xx xxxx : continuation byte (cases b and c)

    int len = ucs2.length;
    // odd number of length bug!
    if ((len & 1) == 1) {
//#ifdef debug
//# System.err.println("BUG ucs2_to_utf8 input len is odd");
//#endif
        // do not bail out: keep converting but ignore the last odd byte
        len--;
    }

    // Worst case is 3 output bytes for every 2 input bytes.
    byte array[] = new byte[(len >> 1) * 3];
    int n = 0; // number of bytes written to array[]
    int b0, b1;

    for (int i = 0; i < len; i += 2)
    {
        // 3 cases:
        // a: 00 xx where xx <= 0x7f : map to single byte
        // b: 00 80 to 07 ff         : map to double bytes
        // c: 08 00 to ff ff         : map to 3 bytes
        b0 = (ucs2[i] & 0xff);     // high byte of the code unit
        b1 = (ucs2[i + 1] & 0xff); // low byte of the code unit

        // unicode signature (byte order mark): not emitted
        if (b0 == 0xfe && b1 == 0xff) {
            continue;
        }

        // case a: plain ASCII, copied through unchanged
        if (b0 == 0 && b1 <= 0x7f)
        {
//#ifdef debug
//# if (DEBUG) System.out.println(i + ": case A");
//#endif
            array[n++] = (byte) b1;
            continue;
        }

        // implicit here: (b0 != 0) or (b0 == 0 and b1 > 0x7f)
        // case b: 0x80 to 0x7ff : 110x xxxx 10xx xxxx (11 bits)
        // Checking the high byte alone is enough. The bound is inclusive:
        // a high byte of 0x07 (U+0700..U+07FF) still fits in 11 bits, and
        // sending it to case c would produce an overlong 3-byte sequence.
        if (b0 <= 0x7)
        {
//#ifdef debug
//# if (DEBUG) System.out.println(i + ": case B");
//#endif
            // lead byte: 3 bits from b0 shifted up 2, plus the 2 high
            // bits of b1 shifted down 6, under the 110 prefix
            array[n] = (byte) (maskb_high | ((b0 & 0x7) << 2) | ((b1 & 0xC0) >> 6));
            // continuation byte: low 6 bits of b1 under the 10 prefix
            array[n + 1] = (byte) (mask_low | (b1 & 0x3f));
            n += 2;
            continue;
        }

//#ifdef debug
//# if (DEBUG) System.out.println(i + ": case C, b0=" + b0 + " b1=" + b1 );
//#endif
        // case c (implicitly b0 > 0x7):
        // 0x800 to 0xffff : 1110 xxxx 10xx xxxx 10xx xxxx
        // e.g. 65 E0 -> E6 97 A0
        //
        // first byte: high 4 bits of b0 under the 1110 prefix
        array[n] = (byte) (maska_high | ((b0 >> 4) & 0xf));
        // second byte: low 4 bits of b0 shifted up 2, plus the high 2 bits
        // of b1 shifted down 6, under the 10 prefix
        array[n + 1] = (byte) (mask_low | ((b0 & 0xf) << 2) | (b1 >> 6));
        // third byte: low 6 bits of b1, no shift, under the 10 prefix
        array[n + 2] = (byte) (mask_low | (b1 & 0x3f));
        n += 3;
    }

    // Trim the scratch buffer down to the bytes actually produced.
    byte utf8[] = new byte[n];
    System.arraycopy(array, 0, utf8, 0, n);
    return utf8;
}
/**
* convert byte sequence from UTF8 to UCS2 (UTF-16), output is a
* byte "array" that can be used to build string by
* [new String(array, 0, array.length, "UTF-16")], assume input does not
* cover unicode character that requires more than 2 bytes (chinese is ok).
*
* UCS-2 (assume no 3 bytes) UTF-8.
* 0x00 to 0x7f : 0xxx xxxx
* 0x80 to 0x7ff : 110x xxxx 10xx xxxx (11 bits)
* 0x800 to 0xffff : 1110 xxxx 10xx xxxx 10xx xxxx (16 bits)
e.g:
// Encoding=UTF-8 :
// a. 无倾角的;
// 61 2e 20 e6 97 a0 e5 80 be e8 a7 92 e7 9a 84 3b
61 - single byte ( < 7f)
2e - single byte ( < 7f)
20 - single byte ( < 7f)
// correct UCS-2(UTF-16)
// 00 61 00 2e 00 20 65 e0 50 3e 89 d2 76 84 00 3b
// Encoding=UTF-16 : a. 无倾角的;
//
//
// native2ascii :
// \u65e0\u503e\u89d2\u7684
//
// 01010000 0011 1110 (50 3E)
// 01110000 0011 1110
//
e6 97 a0 无 1110 0110 1001 0111 1010 0000 ==ucs-2==> 0110 0101 1110 0000 = 0x65 0xE0
e5 80 be 倾 1110 0101 1000 0000 1011 1110 ==ucs-2==> 0101 0000 0011 1110 = 0x50 0x3E
e8 a7 92 角
e7 9a 84 的
3b - single byte (;)
*/
static byte [] utf8_to_ucs2(byte [] utf8) throws UnsupportedEncodingException
{
final int asciiMask = 0x7f; // 7-bits all 1, 0111 1111
final int mask0 = 0x80; // 10xx xxxx (double check)
final int mask1 = 0xC0; // 110x xxxx (means 0x80 - 0x7ff)
final int mask2 = 0xE0; // 1110 xxxx (prefix for 0x800 - 0xffff)
// sanity
if (utf8 == null) return null;
int len = utf8.length;
byte result[] = new byte[len << 1]; // worst case
int n = 0; // number of bytes in result[]
int b0, b1, b2;
n = 0; // double safety
for (int i=0; i<len; i++)
{
// 3 bytes model : 0x800 - 0xffff (mainly chinese)
if ((utf8[i] & mask2) == mask2)
{
b0 = utf8[i] & 0xff;
b1 = utf8[i+1] & 0xff; // sanity: mask0
b2 = utf8[i+2] & 0xff; // sanity: mask0
// System.out.println("b0, b1, b2 : "
// + Integer.toHexString(b0) + ", "
// + Integer.toHexString(b1) + ", "
// + Integer.toHexString(b2) );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -