⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf_unicode_convert.c

📁 utf8 convert to unicode and unicode convert to utf8
💻 C
字号:

/*
@file : utf_unicode_convert.c

@brief: Character-encoding conversion helpers: UTF-8 <-> Unicode (UCS-2) and GBK -> Unicode (UCS-2).

Author: ******

Version: V1.0

Copyright XXXXXXXXXXXXXX Inc. All rights reserved.

@date: 2008-11-24

*/





#include <stdio.h>
#include <stdlib.h>
#include "utf8toucs2.h"
#include "gbk_to_unicode_table.h"

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>



/*
 * Decode one three-byte UTF-8 sequence into a single wide character.
 *
 * pOut  - receives the decoded code point (0x0000-0xFFFF).
 * pText - points at the sequence; exactly pText[0], pText[1] and pText[2]
 *         are read.
 *
 * The lead/continuation marker bits are masked off without being validated,
 * so the caller must already know that pText holds a three-byte sequence.
 */
void UTF_8ToUnicode(wchar_t* pOut,char *pText)
{
    /* Work on unsigned bytes so the shifts never see a negative value. */
    const unsigned char *bytes = (const unsigned char *)pText;
    unsigned int code_point;

    /* 4 payload bits from the lead byte, 6 from each continuation byte. */
    code_point = ((unsigned int)(bytes[0] & 0x0F) << 12)
               | ((unsigned int)(bytes[1] & 0x3F) << 6)
               |  (unsigned int)(bytes[2] & 0x3F);

    /*
     * Assign the whole wchar_t instead of poking two of its bytes: this
     * clears the upper bytes when wchar_t is wider than 16 bits and does
     * not depend on the host byte order.
     */
    *pOut = (wchar_t)code_point;
}


/*
 * Encode one wide character as a three-byte UTF-8 sequence.
 *
 * pOut  - receives exactly three bytes (pOut[0..2]); no terminator is
 *         written.
 * pText - points at the character to encode; only its low 16 bits are used.
 *
 * The three-byte form (1110xxxx 10xxxxxx 10xxxxxx) is always produced, even
 * for code points below 0x0800 that have a shorter canonical encoding.
 */
void UnicodeToUTF_8(char* pOut,wchar_t* pText)
{
    /*
     * Read the value itself rather than its individual bytes so the result
     * does not depend on the host byte order.
     */
    unsigned int code_point = (unsigned int)*pText & 0xFFFFu;

    pOut[0] = (char)(0xE0 | (code_point >> 12));          /* top 4 bits    */
    pOut[1] = (char)(0x80 | ((code_point >> 6) & 0x3F));  /* middle 6 bits */
    pOut[2] = (char)(0x80 | (code_point & 0x3F));         /* low 6 bits    */
}


/*
 * Map one double-byte GBK code to its Unicode value.
 *
 * input - GBK code, lead byte in the high 8 bits and trail byte in the low
 *         8 bits.
 *
 * The mapping is read from "./gbk2ucs.dat" in the current working directory,
 * which is opened and closed on every call. The entry for a code is the two
 * bytes at offset (input - 0x8140) * 2, read directly into an unsigned short
 * (so the file is presumably stored in host byte order - confirm against the
 * tool that generates it).
 *
 * Returns the table entry, or 0 when the code is below 0x8140, the file
 * cannot be opened, or the entry cannot be read in full.
 */
unsigned short gbk_to_unicode(unsigned short input)
{
    unsigned short unicode = 0;
    long offset;
    int fd;

    /*
     * The table starts at 0x8140. Anything lower (single-byte range, or
     * 0x8100-0x813F) has no entry; rejecting it here also keeps the offset
     * below from going negative.
     */
    if (input < 0x8140)
        return 0;

#if 0
    return gbkToUnicode[input-0x8140];
#else
    offset = ((long)input - 0x8140) * 2;

    fd = open("./gbk2ucs.dat", O_RDONLY);
    if (fd < 0)
    {
        printf("open gbk2ucs.dat error, %s\n", strerror(errno));

        return 0;
    }

    /* A failed seek or a short read must not leak a half-filled value. */
    if (lseek(fd, offset, SEEK_SET) < 0 ||
        read(fd, &unicode, sizeof unicode) != (ssize_t)sizeof unicode)
    {
        unicode = 0;
    }

    close(fd);

    return unicode;
#endif
}


/*
 * Convert a NUL-terminated GBK string to a NUL-terminated UCS-2 string.
 *
 * src         - GBK input; bytes with the top bit clear are copied as-is,
 *               any other byte is taken as the lead byte of a two-byte code
 *               and looked up with gbk_to_unicode().
 * pUCS2buf    - output buffer.
 * UCS2bufSize - capacity of pUCS2buf in elements, terminator included.
 *
 * At most UCS2bufSize - 1 characters are stored; the output is always
 * terminated when UCS2bufSize > 0. Conversion also stops at a lead byte that
 * is immediately followed by the end of the string.
 *
 * Returns pUCS2buf (untouched when UCS2bufSize <= 0).
 */
ucs2 * gbkto_ucs2(const char* src, ucs2 * pUCS2buf, int UCS2bufSize)
{
    const unsigned char *pGBKStr = (const unsigned char *) src;
    int ucs2Index = 0;

    /* No room even for the terminator: do not write anything. */
    if (UCS2bufSize <= 0)
        return pUCS2buf;

    /* Keep the last slot free for the terminator. */
    while (ucs2Index < UCS2bufSize - 1) {
        if ((pGBKStr[0] & 0x80) == 0x00) {
            pUCS2buf[ucs2Index] = pGBKStr[0];
            if (pGBKStr[0] == '\0')
                return pUCS2buf;
            pGBKStr++;
        }
        else {
            /*
             * A lead byte with no trail byte: stepping two bytes forward
             * would jump over the terminator and read past the string.
             */
            if (pGBKStr[1] == '\0')
                break;

            pUCS2buf[ucs2Index] =
                gbk_to_unicode((unsigned short)((pGBKStr[0] << 8) | pGBKStr[1]));
            pGBKStr += 2;
        }
        ucs2Index++;
    }

    /* Buffer full or input truncated: terminate what has been written. */
    pUCS2buf[ucs2Index] = '\0';
    return pUCS2buf;
}

/*
 * Count the characters in a NUL-terminated GBK string.
 *
 * A byte with the top bit clear counts as one character; any other byte is
 * taken as the lead byte of a two-byte character and counts as one together
 * with the byte that follows it. The terminator is not counted.
 *
 * A lead byte immediately followed by the terminator is not a complete
 * character: counting stops there and it is not included.
 *
 * The scan is capped at 10000 steps, so the result never exceeds 10000.
 */
int gbk_length(const char* src)
{
    enum { GBK_LENGTH_MAX_STEPS = 10000 };

    const unsigned char *cursor = (const unsigned char *) src;
    int chrnum = 0;
    int steps_left = GBK_LENGTH_MAX_STEPS;

    while (steps_left--) {
        if (*cursor == '\0')
            return chrnum;

        if ((*cursor & 0x80) == 0x00) {
            cursor++;
        }
        else {
            /*
             * Do not step over the terminator when the trail byte is
             * missing; that would keep scanning memory past the string.
             */
            if (cursor[1] == '\0')
                return chrnum;
            cursor += 2;
        }
        chrnum++;
    }
    return chrnum;
}

/*
 * Decode a NUL-terminated UTF-8 string into UCS-2.
 *
 * pUTF8str    - UTF-8 input; one-, two- and three-byte sequences are
 *               understood.
 * pUCS2buf    - output buffer.
 * UCS2bufSize - capacity of pUCS2buf in elements.
 *
 * Decoding stops at:
 *   - the input terminator, which is copied to the output;
 *   - the first byte that does not start a well-formed one-, two- or
 *     three-byte sequence; the output is terminated at that point;
 *   - a full output buffer. In this case all UCS2bufSize elements hold
 *     decoded characters and NO terminator is written, so callers must size
 *     the buffer for the terminator themselves.
 *
 * Returns pUCS2buf (untouched when UCS2bufSize <= 0).
 */
ucs2 * utf8_decode(const utf8 * pUTF8str, ucs2 * pUCS2buf, int UCS2bufSize)
{
    const utf8 *putf8 = pUTF8str;
    int ucs2Index = 0;

    /* Nothing can be stored; avoid writing outside the buffer. */
    if (UCS2bufSize <= 0)
        return pUCS2buf;

    while (ucs2Index < UCS2bufSize) {
        if ((putf8[0] & 0x80) == 0x00) {
            /* 0xxxxxxx: single byte, including the terminator itself. */
            pUCS2buf[ucs2Index++] = putf8[0];
            if (putf8[0] == '\0')
                return pUCS2buf;
            putf8++;
        }
        else if ((putf8[0] & 0xe0) == 0xc0 &&
                 (putf8[1] & 0xc0) == 0x80) {
            /* 110xxxxx 10xxxxxx */
            pUCS2buf[ucs2Index++] = ((putf8[0] & 0x1fL) << 6) |
                                    ((putf8[1] & 0x3fL) << 0);
            putf8 += 2;
        }
        else if ((putf8[0] & 0xf0) == 0xe0 &&
                 (putf8[1] & 0xc0) == 0x80 &&
                 (putf8[2] & 0xc0) == 0x80) {
            /* 1110xxxx 10xxxxxx 10xxxxxx */
            pUCS2buf[ucs2Index++] = ((putf8[0] & 0x0fL) << 12) |
                                    ((putf8[1] & 0x3fL) <<  6) |
                                    ((putf8[2] & 0x3fL) <<  0);
            putf8 += 3;
        }
        else {
            /*
             * Malformed or unsupported sequence: terminate what has been
             * decoded so far. ucs2Index < UCS2bufSize holds here, so the
             * write is in bounds.
             */
            pUCS2buf[ucs2Index] = '\0';
            return pUCS2buf;
        }
    }

    return pUCS2buf;
}

/*
 * Measure a UTF-8 string in bytes.
 *
 * Walks the string one sequence at a time (one-, two- or three-byte forms).
 * On reaching the terminator the result includes the terminator byte. On
 * meeting a byte that does not start a well-formed sequence of those forms,
 * the result is the number of bytes before that byte.
 */
int utf8_size(const utf8 * pUTF8str)
{
    int offset = 0;

    for (;;) {
        const utf8 *cur = pUTF8str + offset;

        /* 0xxxxxxx */
        if ((cur[0] & 0x80) == 0x00) {
            offset += 1;
            if (cur[0] == '\0')
                return offset;
            continue;
        }

        /* 110xxxxx 10xxxxxx */
        if ((cur[0] & 0xe0) == 0xc0 &&
            (cur[1] & 0xc0) == 0x80) {
            offset += 2;
            continue;
        }

        /* 1110xxxx 10xxxxxx 10xxxxxx */
        if ((cur[0] & 0xf0) == 0xe0 &&
            (cur[1] & 0xc0) == 0x80 &&
            (cur[2] & 0xc0) == 0x80) {
            offset += 3;
            continue;
        }

        /* Anything else ends the scan. */
        return offset;
    }
}

/*
 * Count the characters in a UTF-8 string.
 *
 * Each well-formed one-, two- or three-byte sequence counts as one
 * character; the terminator is not counted. Counting stops early at the
 * first byte that does not start such a sequence, returning the number of
 * characters seen up to that point.
 */
int utf8_length(const utf8 * pUTF8str)
{
    int offset = 0;
    int count = 0;

    for (;;) {
        const utf8 *cur = pUTF8str + offset;

        if ((cur[0] & 0x80) == 0x00) {
            /* 0xxxxxxx; a zero byte is the end of the string. */
            if (cur[0] == '\0')
                return count;
            offset += 1;
        }
        else if ((cur[0] & 0xe0) == 0xc0 &&
                 (cur[1] & 0xc0) == 0x80) {
            /* 110xxxxx 10xxxxxx */
            offset += 2;
        }
        else if ((cur[0] & 0xf0) == 0xe0 &&
                 (cur[1] & 0xc0) == 0x80 &&
                 (cur[2] & 0xc0) == 0x80) {
            /* 1110xxxx 10xxxxxx 10xxxxxx */
            offset += 3;
        }
        else {
            return count;
        }

        count++;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -