⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf8utf16.java

📁 手机Wap浏览器源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package wmlcparser;
/**
 * A collection of conversion utilities between UTF-8 and UCS-2 (UTF-16),
 * implemented byte by byte at the bit level rather than with the JDK's
 * built-in encoders/decoders.
 *
 * last update: 20070707
 *
 * tested on j2se 1.4.2_13
 *
 * @author peter  peter _at_ gameislive.com 
 */

import java.io.UnsupportedEncodingException;

class utf8utf16
{
    /*
     * About UTF-8, UCS-2
     *
     * UCS-2 : Unicode in 2-byte form, well suited to Chinese characters;
     *         ASCII (1-byte characters, e.g. A-Z) needs a 0x00 prefix byte
     *         (big-endian style : MSB=[0]   LSB=[1] )
     * UTF-8 : 0x00 to 0x7F is represented directly as a single byte;
     *         for everything else the following rule applies:
     *
        无 (nothing, none)

        UTF-8 representation: 
        1110 0110   1001 0111   1010 0000

        MSB : the 1110 prefix means a 3-byte sequence
        2nd byte 1001 0111  -> remove the first 2 bits (the 10 prefix)
        3rd byte 1010 0000  -> remove the first 2 bits (the 10 prefix)

        prefix removal:
        1110 0110   1001 0111   1010 0000
             0110     01 0111     10 0000
             
        actual encoding:     
        = 0110 0101   1110 0000  (bin)
        = 65          E0         (hex)
             
        or in native2ascii:
        \u65e0    (0 = zero)
    */


    /**
     * Converts a big-endian UCS-2 (BMP-only UTF-16) byte sequence to UTF-8.
     *
     * Each pair of input bytes (high byte first) forms one 16-bit code unit,
     * which is encoded as follows:
     *
     * 0x0000   to      0x007f  :   0xxx xxxx
     * 0x0080   to      0x07ff  :   110x xxxx  10xx xxxx (11 bits)
     * 0x0800   to      0xffff  :   1110 xxxx  10xx xxxx  10xx xxxx (16 bits)
     *
     * The byte pair FE FF (big-endian byte order mark) is dropped wherever it
     * occurs in the input. If the input has an odd length, the trailing byte
     * is ignored. Surrogate code units are not combined into a single code
     * point; each one is encoded on its own as a 3-byte sequence.
     *
     * @param ucs2 big-endian UCS-2 bytes; may be null
     * @return a newly allocated, exactly sized array of UTF-8 bytes,
     *         or null if ucs2 is null
     */
    static byte [] ucs2_to_utf8(byte [] ucs2)
    {
        if (ucs2==null) return null;

        final int lead2 = 0xC0;  // 110x xxxx : lead byte of a 2-byte sequence
        final int lead3 = 0xE0;  // 1110 xxxx : lead byte of a 3-byte sequence
        final int cont  = 0x80;  // 10xx xxxx : continuation byte

        int len = ucs2.length;

        // An odd length cannot be a whole number of 16-bit units.
        if ((len & 1)==1) {
            //#ifdef debug
            //# System.err.println("BUG ucs2_to_utf8 input len is odd");
            //#endif
            // keep going, but ignore the dangling last byte
            len ^= 1;
        }

        // Worst case: every 2-byte input unit expands to 3 output bytes.
        byte array[] = new byte[(len >> 1) * 3];
        int n = 0;  // number of bytes written to array[]
        int b0, b1;

        for (int i=0; i<len; i+=2)
        {
            b0 = (ucs2[i] & 0xff);    // high byte of the code unit
            b1 = (ucs2[i+1] & 0xff);  // low byte of the code unit

            // big-endian byte order mark: not part of the text, skip it
            if (b0==0xfe && b1==0xff) {
                continue;
            }

            // case a: 0x0000 - 0x007f  ->  single byte, copied as is
            if (b0==0 && b1<=0x7f)
            {
                //#ifdef debug
                //# if (DEBUG) System.out.println(i + ": case A");
                //#endif
                array[n] = (byte) b1;
                n++;
                continue;
            }

            // case b: 0x0080 - 0x07ff  ->  110x xxxx  10xx xxxx
            // The high byte carries at most 3 significant bits, so the range
            // includes b0 == 0x07; testing "b0 < 0x7" would push
            // 0x0700 - 0x07ff into the 3-byte branch and yield an overlong
            // (invalid) UTF-8 sequence.
            if (b0 <= 0x7)
            {
                //#ifdef debug
                //# if (DEBUG) System.out.println(i + ": case B");
                //#endif
                // lead byte: 3 bits of b0 followed by the top 2 bits of b1
                array[n]   = (byte) (lead2 | (b0 << 2) | (b1 >> 6));
                // continuation byte: low 6 bits of b1
                array[n+1] = (byte) (cont | (b1 & 0x3f));
                n += 2;
                continue;
            }

            //#ifdef debug
            //# if (DEBUG) System.out.println(i + ": case C, b0=" + b0 + "  b1=" + b1 );
            //#endif

            // case c: 0x0800 - 0xffff  ->  1110 xxxx  10xx xxxx  10xx xxxx
            // first byte: high 4 bits of b0
            array[n]   = (byte) (lead3 | (b0 >> 4));
            // second byte: low 4 bits of b0 followed by the top 2 bits of b1
            array[n+1] = (byte) (cont | ((b0 & 0xf) << 2) | (b1 >> 6));
            // third byte: low 6 bits of b1
            array[n+2] = (byte) (cont | (b1 & 0x3f));
            n += 3;
        }

        // Trim the scratch buffer to the number of bytes actually produced.
        byte utf8[] = new byte[n];
        System.arraycopy(array, 0, utf8, 0, n);
        return utf8;
    }
    
    /**
     * convert byte sequence from UTF8 to UCS2 (UTF-16), output is a 
     * byte "array" that can be used to build string by 
     * [new String(array, 0, array.length, "UTF-16")], assume input does not
     * cover unicode character that requires more than 2 bytes (chinese is ok).
     *
     * UCS-2 (assume no 3 bytes)     UTF-8.
     * 0x00     to      0x7f    :   0xxx xxxx
     * 0x80     to      0x7ff   :   110x xxxx  10xx xxxx (11 bits)
     * 0x800    to      0xffff  :   1110 xxxx  10xx xxxx  10xx xxxx (16 bits)
     
     e.g:
    // Encoding=UTF-8 : 
    // a. 无倾角的;
    // 61  2e  20  e6  97  a0  e5  80  be  e8  a7  92  e7  9a  84  3b

     61 - single byte ( < 7f)
     2e  - single byte ( < 7f)
     20  - single byte ( < 7f)

    // correct UCS-2(UTF-16)
    // 00  61  00  2e  00  20  65  e0  50  3e  89  d2  76  84  00  3b
    // Encoding=UTF-16 : a. 无倾角的;
    //
    //
    // native2ascii :
    // \u65e0\u503e\u89d2\u7684
    //
    // 01010000 0011 1110 (50 3E)
    // 01110000  0011 1110
    //
     e6 97 a0 无 1110 0110   1001 0111   1010 0000 ==ucs-2==> 0110 0101   1110 0000  = 0x65  0xE0
     e5 80 be 倾 1110 0101   1000 0000   1011 1110 ==ucs-2==> 0101 0000   0011 1110  = 0x50  0x3E
     e8 a7 92 角
     e7 9a 84 的
     3b - single byte (;)
     */
    static byte [] utf8_to_ucs2(byte [] utf8) throws UnsupportedEncodingException
    {
        final int asciiMask = 0x7f;  // 7-bits all 1, 0111 1111
        final int mask0 = 0x80; // 10xx xxxx (double check)
        final int mask1 = 0xC0; // 110x xxxx (means 0x80 - 0x7ff)
        final int mask2 = 0xE0; // 1110 xxxx (prefix for 0x800 - 0xffff)
        // sanity
        if (utf8 == null) return null;
        int len = utf8.length;
        byte result[] = new byte[len << 1]; // worst case
        int n = 0; // number of bytes in result[]
        int b0, b1, b2;


        n = 0;   // double safety
        for (int i=0; i<len; i++) 
        {
            // 3 bytes model : 0x800 - 0xffff (mainly chinese)
            if ((utf8[i] & mask2) == mask2) 
            {
                b0 = utf8[i] & 0xff;
                b1 = utf8[i+1] & 0xff;  // sanity:  mask0
                b2 = utf8[i+2] & 0xff;  // sanity:  mask0
                // System.out.println("b0, b1, b2 : " 
                //        + Integer.toHexString(b0) + ", " 
                //        + Integer.toHexString(b1) + ", " 
                //        + Integer.toHexString(b2)  );

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -