unicode.c

来自「一个关于韩语unicode编码的C语言库文件」· C语言代码 · 共 667 行
667 行
/*
 * Unicode Library version 0.1
 * Nov 23, 1999 yoshidam
 *
 * Ruby extension providing Unicode decomposition, composition,
 * normalization-style helpers and simple case mapping for UTF-8 strings.
 * Character properties come from the unidata[] table in "unidata.map".
 */

#include "ruby.h"
#include "rubyio.h"
#include <stdio.h>
#include <string.h>
#include "wstring.h"
#include "unidata.map"

static VALUE mUnicode;
/* Hash: code point (Fixnum) => index into unidata[] (Fixnum). */
static VALUE unicode_data;
/* Hash: canonical decomposition (UTF-8 String) => composed code point. */
static VALUE composition_table;

/* Hangul */
/*
 * Constants for arithmetic Hangul syllable (de)composition.
 * SBASE is the first precomposed syllable; LBASE/VBASE are the first
 * leading consonant and vowel jamo.  TBASE is one below the first
 * trailing consonant: an offset of 0 from TBASE means "no trailing
 * consonant" (see decompose_hangul).
 */
#define SBASE   (0xac00)
#define LBASE   (0x1100)
#define LCOUNT  (19)
#define VBASE   (0x1161)
#define VCOUNT  (21)
#define TBASE   (0x11a7)
#define TCOUNT  (28)
#define NCOUNT  (VCOUNT * TCOUNT) /* 588 */
#define SCOUNT  (LCOUNT * NCOUNT) /* 11172 */

/*
 * Return the index of ucs in unidata[], or -1 when the code point has
 * no entry in the unicode_data hash.
 */
static int
unidata_index(int ucs)
{
  VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));

  if (NIL_P(ch)) return -1;
  return FIX2INT(ch);
}

/* Combining class of ucs; 0 when the code point is not in the table. */
static int
get_cc(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    return unidata[idx].combining_class;
  }
  return 0;
}

/*
 * Canonical decomposition of ucs as stored in unidata[] (a UTF-8 string,
 * given how callers feed it to WStr_allocWithUTF8), or NULL when the
 * code point is unknown or the table holds no canonical decomposition.
 */
static const char*
get_canon(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    return unidata[idx].canon;
  }
  return NULL;
}

/*
 * Compatibility decomposition of ucs as stored in unidata[], or NULL
 * when the code point is unknown or the table holds none.
 */
static const char*
get_compat(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    return unidata[idx].compat;
  }
  return NULL;
}

/* Uppercase mapping of ucs, or ucs itself when no positive mapping exists. */
static int
get_uppercase(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    int uc = unidata[idx].uppercase;
    if (uc > 0) return uc;
  }
  return ucs;
}

/* Lowercase mapping of ucs, or ucs itself when no positive mapping exists. */
static int
get_lowercase(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    int lc = unidata[idx].lowercase;
    if (lc > 0) return lc;
  }
  return ucs;
}

/* Titlecase mapping of ucs, or ucs itself when no positive mapping exists. */
static int
get_titlecase(int ucs)
{
  int idx = unidata_index(ucs);

  if (idx >= 0) {
    int tc = unidata[idx].titlecase;
    if (tc > 0) return tc;
  }
  return ucs;
}

/*
 * Look up the code point whose canonical decomposition is exactly the
 * UTF-8 string str.  Returns -1 when composition_table has no such key.
 */
static int
get_composition(const char* str)
{
  VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));

  if (!NIL_P(ch)) {
    return FIX2INT(ch);
  }
  return -1;
}

/*
 * Reorder ustr in place so that adjacent characters with non-zero
 * combining classes appear in ascending class order.  Characters with
 * combining class 0 are never swapped.  Returns ustr.
 */
static WString*
sort_canonical(WString* ustr)
{
  int i = 1;
  int len = ustr->len;

  if (len < 2) return ustr;

  while (i < len) {
    int last = ustr->str[i - 1];
    int ch = ustr->str[i];
    int last_cc = get_cc(last);
    int cc = get_cc(ch);

    if (cc != 0 && last_cc != 0 && last_cc > cc) {
      ustr->str[i] = last;
      ustr->str[i - 1] = ch;
      /* step back so the swapped character is compared with its new
         predecessor */
      if (i > 1) i--;
    }
    else {
      i++;
    }
  }
  return ustr;
}

/*
 * Split a precomposed Hangul syllable into leading consonant (*l),
 * vowel (*v) and trailing consonant (*t).  *t is 0 when the syllable
 * has no trailing consonant.  When ucs is outside the syllable range,
 * *l receives ucs unchanged and *v and *t are 0.
 */
static void
decompose_hangul(int ucs, int* l, int* v, int* t)
{
  int sindex = ucs - SBASE;

  if (sindex < 0 || sindex >= SCOUNT) {
    *l = ucs;
    *v = *t = 0;
    return;
  }
  *l = LBASE + sindex / NCOUNT;
  *v = VBASE + (sindex % NCOUNT) / TCOUNT;
  *t = TBASE + sindex % TCOUNT;
  if (*t == TBASE) *t = 0;
}

/*
 * push decomposed str into result
 *
 * Hangul syllables are decomposed arithmetically; every other character
 * is replaced, recursively, by its canonical decomposition from
 * unidata[] or appended unchanged when it has none.  Returns result.
 */
static WString*
decompose_internal(WString* ustr, WString* result)
{
  int i;
  int len = ustr->len;

  for (i = 0; i < len; i++) {
    int ucs = ustr->str[i];

    if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
      int l, v, t;

      decompose_hangul(ucs, &l, &v, &t);
      WStr_addWChar(result, l);
      if (v) WStr_addWChar(result, v);
      if (t) WStr_addWChar(result, t);
    }
    else {
      const char* dc = get_canon(ucs);

      if (!dc) {
        WStr_addWChar(result, ucs);
      }
      else {
        /* the decomposition may itself contain decomposable characters */
        WString wdc;

        WStr_allocWithUTF8(&wdc, dc);
        decompose_internal(&wdc, result);
        WStr_free(&wdc);
      }
    }
  }
  return result;
}

/*
 * push compatibility decomposed str into result
 *
 * Same as decompose_internal, but uses the compatibility decomposition
 * column of unidata[].  Returns result.
 */
static WString*
decompose_compat_internal(WString* ustr, WString* result)
{
  int i;
  int len = ustr->len;

  for (i = 0; i < len; i++) {
    int ucs = ustr->str[i];

    if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
      int l, v, t;

      decompose_hangul(ucs, &l, &v, &t);
      WStr_addWChar(result, l);
      if (v) WStr_addWChar(result, v);
      if (t) WStr_addWChar(result, t);
    }
    else {
      const char* dc = get_compat(ucs);

      if (!dc) {
        WStr_addWChar(result, ucs);
      }
      else {
        WString wdc;

        WStr_allocWithUTF8(&wdc, dc);
        decompose_compat_internal(&wdc, result);
        WStr_free(&wdc);
      }
    }
  }
  return result;
}

/*
 * Append the UTF-8 encoding (1 to 6 bytes) of code point c at p and
 * advance p past the bytes written.  p must be a modifiable char
 * pointer; both arguments are evaluated more than once, so pass plain
 * variables only.
 */
#define UCS4toUTF8(p, c) \
  do { \
    if ((c) < 128) { \
      *(p)++ = (c); \
    } \
    else if ((c) < 2048) { \
      *(p)++ = ((c) >> 6) | 192; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x10000) { \
      *(p)++ = ((c) >> 12) | 224; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x200000) { \
      *(p)++ = ((c) >> 18) | 240; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x4000000) { \
      *(p)++ = ((c) >> 24) | 248; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x80000000) { \
      *(p)++ = ((c) >> 30) | 252; \
      *(p)++ = (((c) >> 24) & 63) | 128; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
  } while (0)

/*
 * Try to compose the pair (c1, c2) into a single code point.
 * Hangul L+V and LV+T pairs are composed arithmetically; any other pair
 * is encoded as UTF-8 and looked up in composition_table.
 * Returns the composed code point, or -1 when the pair does not compose.
 */
static int
compose_pair(int c1, int c2)
{
  int ret;
  char ustr[13]; /* stored two UTF-8 chars */
  char *p = ustr;

  /* Hangul L + V */
  if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
      c2 >= VBASE && c2 < VBASE + VCOUNT) {
    return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
  }
  /* Hangul LV + T */
  /* TBASE itself is not a trailing consonant (offset 0 means "none"),
     so it must not be absorbed into the syllable. */
  else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
           (c1 - SBASE) % TCOUNT == 0 &&
           c2 > TBASE && c2 < TBASE + TCOUNT) {
    return c1 + (c2 - TBASE);
  }
  UCS4toUTF8(p, c1);
  UCS4toUTF8(p, c2);
  *p = '\0';
  ret = get_composition(ustr);

  return ret;
}

/*
 * push canonical composed str into result
 *
 * Walks ustr keeping a pending "starter"; while the starter has
 * combining class 0 it is merged with the next character whenever
 * compose_pair() succeeds, otherwise it is flushed to result and the
 * next character becomes the pending one.  Returns result.
 *
 * NOTE(review): only adjacent pairs are tried, so a composable mark
 * separated from its base by a non-composing mark is left uncomposed --
 * confirm whether this simplification is intended.
 */
static WString*
compose_internal(WString* ustr, WString* result)
{
  int len = ustr->len;
  int starter;
  int startercc;
  int i;

  if (len == 0) return result;

  starter = ustr->str[0];
  startercc = get_cc(starter);
  /* a leading combining character can never act as a composition base */
  if (startercc != 0) startercc = 256;
  for (i = 1; i < len; i++) {
    int ch = ustr->str[i];
    int cc = get_cc(ch);
    int composite;

    if (startercc == 0 &&
        (composite = compose_pair(starter, ch)) >= 0) {
      starter = composite;
      startercc = get_cc(composite);
    }
    else {
      WStr_addWChar(result, starter);
      starter = ch;
      startercc = cc;
    }
  }
  WStr_addWChar(result, starter);

  return result;
}

/* Replace every character of str, in place, by its uppercase mapping. */
static WString*
upcase_internal(WString* str)
{
  int i;

  for (i = 0; i < str->len; i++) {
    int uc = get_uppercase(str->str[i]);
    if (uc > 0) str->str[i] = uc;
  }
  return str;
}

/* Replace every character of str, in place, by its lowercase mapping. */
static WString*
downcase_internal(WString* str)
{
  int i;

  for (i = 0; i < str->len; i++) {
    int lc = get_lowercase(str->str[i]);
    if (lc > 0) str->str[i] = lc;
  }
  return str;
}

/*
 * Titlecase the first character of str and lowercase the rest, in place.
 * A one-character string is titlecased too.
 */
static WString*
capitalize_internal(WString* str)
{
  int i;

  if (str->len > 0) {
    int tc = get_titlecase(str->str[0]);
    if (tc > 0) str->str[0] = tc;
  }
  for (i = 1; i < str->len; i++) {
    int lc = get_lowercase(str->str[i]);
    if (lc > 0) str->str[i] = lc;
  }
  return str;
}

/* Signature shared by decompose_internal and decompose_compat_internal. */
typedef WString* (*decompose_func)(WString* ustr, WString* result);

/*
 * Allocate result and fill it with the canonically ordered decomposition
 * of the Ruby String str, using the given decomposition function.
 * The caller owns result and must release it with WStr_free().
 */
static void
decompose_and_sort(VALUE str, decompose_func decompose, WString* result)
{
  WString wstr;

  WStr_allocWithUTF8(&wstr, RSTRING(str)->ptr);
  WStr_alloc(result);
  decompose(&wstr, result);
  WStr_free(&wstr);
  sort_canonical(result);
}

/* Build a new Ruby String from wstr (converted to UString) and free wstr. */
static VALUE
wstr_to_rstring(WString* wstr)
{
  UString utf8;
  VALUE vret;

  UStr_alloc(&utf8);
  WStr_convertIntoUString(wstr, &utf8);
  WStr_free(wstr);
  vret = rb_str_new(utf8.str, utf8.len);
  UStr_free(&utf8);

  return vret;
}

/*
 * Compare two Ruby Strings bytewise after decomposing both with the
 * given function and canonically ordering them.  Returns the strcmp()
 * result as a Fixnum.  Raises TypeError (via Check_Type) when either
 * argument is not a String.
 */
static VALUE
compare_decomposed(VALUE str1, VALUE str2, decompose_func decompose)
{
  WString result1;
  WString result2;
  UString ustr1;
  UString ustr2;
  int ret;

  Check_Type(str1, T_STRING);
  Check_Type(str2, T_STRING);
  decompose_and_sort(str1, decompose, &result1);
  decompose_and_sort(str2, decompose, &result2);
  UStr_alloc(&ustr1);
  UStr_alloc(&ustr2);
  WStr_convertIntoUString(&result1, &ustr1);
  WStr_convertIntoUString(&result2, &ustr2);
  WStr_free(&result1);
  WStr_free(&result2);
  /* terminate explicitly so the buffers can be handed to strcmp() */
  UStr_addChar(&ustr1, '\0');
  UStr_addChar(&ustr2, '\0');
  ret = strcmp(ustr1.str, ustr2.str);
  UStr_free(&ustr1);
  UStr_free(&ustr2);

  return INT2FIX(ret);
}

/* Unicode.strcmp(str1, str2): compare canonical decompositions. */
static VALUE
unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
{
  return compare_decomposed(str1, str2, decompose_internal);
}

/* Unicode.strcmp_compat(str1, str2): compare compatibility decompositions. */
static VALUE
unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
{
  return compare_decomposed(str1, str2, decompose_compat_internal);
}

/*
 * Unicode.decompose(str) / Unicode.normalize_D(str):
 * canonical decomposition followed by canonical ordering.
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString result;

  Check_Type(str, T_STRING);
  decompose_and_sort(str, decompose_internal, &result);

  return wstr_to_rstring(&result);
}

/*
 * Unicode.decompose_compat(str) / Unicode.normalize_KD(str):
 * compatibility decomposition followed by canonical ordering.
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString result;

  Check_Type(str, T_STRING);
  decompose_and_sort(str, decompose_compat_internal, &result);

  return wstr_to_rstring(&result);
}

/*
 * Unicode.compose(str): canonically order str, then compose it
 * (no decomposition step is performed first).
 */
static VALUE
unicode_compose(VALUE obj, VALUE str)
{
  WString ustr;
  WString result;

  Check_Type(str, T_STRING);
  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
  sort_canonical(&ustr);
  WStr_alloc(&result);
  compose_internal(&ustr, &result);
  WStr_free(&ustr);

  return wstr_to_rstring(&result);
}

/*
 * Unicode.normalize_C(str): canonical decomposition, canonical
 * ordering, then composition.
 */
static VALUE
unicode_normalize_C(VALUE obj, VALUE str)
{
  WString decomposed;
  WString result;

  Check_Type(str, T_STRING);
  decompose_and_sort(str, decompose_internal, &decomposed);
  WStr_alloc(&result);
  compose_internal(&decomposed, &result);
  WStr_free(&decomposed);

  return wstr_to_rstring(&result);
}

/*
 * Unicode.normalize_KC(str): compatibility decomposition, canonical
 * ordering, then composition.
 */
static VALUE
unicode_normalize_KC(VALUE obj, VALUE str)
{
  WString decomposed;
  WString result;

  Check_Type(str, T_STRING);
  decompose_and_sort(str, decompose_compat_internal, &decomposed);
  WStr_alloc(&result);
  compose_internal(&decomposed, &result);
  WStr_free(&decomposed);

  return wstr_to_rstring(&result);
}

/* Unicode.upcase(str): new String with every character uppercased. */
static VALUE
unicode_upcase(VALUE obj, VALUE str)
{
  WString ustr;

  Check_Type(str, T_STRING);
  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
  upcase_internal(&ustr);

  return wstr_to_rstring(&ustr);
}

/* Unicode.downcase(str): new String with every character lowercased. */
static VALUE
unicode_downcase(VALUE obj, VALUE str)
{
  WString ustr;

  Check_Type(str, T_STRING);
  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
  downcase_internal(&ustr);

  return wstr_to_rstring(&ustr);
}

/*
 * Unicode.capitalize(str): new String with the first character
 * titlecased and the remaining characters lowercased.
 */
static VALUE
unicode_capitalize(VALUE obj, VALUE str)
{
  WString ustr;

  Check_Type(str, T_STRING);
  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
  capitalize_internal(&ustr);

  return wstr_to_rstring(&ustr);
}

/*
 * Extension entry point: defines module Unicode, builds the lookup
 * hashes from unidata[] and registers the module functions.
 */
void
Init_unicode(void)
{
  int i;

  mUnicode = rb_define_module("Unicode");

  /* Register the globals before allocating, so each hash is already
     reachable by the GC when later allocations run. */
  rb_global_variable(&unicode_data);
  rb_global_variable(&composition_table);
  unicode_data = rb_hash_new();
  composition_table = rb_hash_new();

  /* unidata[] is terminated by an entry whose code is -1 */
  for (i = 0; unidata[i].code != -1; i++) {
    int code = unidata[i].code;
    const char* canon = unidata[i].canon;
    int exclusion = unidata[i].exclusion;

    rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
    /* only non-excluded canonical decompositions may be recomposed */
    if (canon && exclusion == 0) {
      rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
    }
  }

  rb_define_module_function(mUnicode, "strcmp",
                            unicode_strcmp, 2);
  rb_define_module_function(mUnicode, "strcmp_compat",
                            unicode_strcmp_compat, 2);

  rb_define_module_function(mUnicode, "decompose",
                            unicode_decompose, 1);
  rb_define_module_function(mUnicode, "decompose_compat",
                            unicode_decompose_compat, 1);
  rb_define_module_function(mUnicode, "compose",
                            unicode_compose, 1);

  rb_define_module_function(mUnicode, "normalize_D",
                            unicode_decompose, 1);
  rb_define_module_function(mUnicode, "normalize_KD",
                            unicode_decompose_compat, 1);
  rb_define_module_function(mUnicode, "normalize_C",
                            unicode_normalize_C, 1);
  rb_define_module_function(mUnicode, "normalize_KC",
                            unicode_normalize_KC, 1);

  rb_define_module_function(mUnicode, "upcase",
                            unicode_upcase, 1);
  rb_define_module_function(mUnicode, "downcase",
                            unicode_downcase, 1);
  rb_define_module_function(mUnicode, "capitalize",
                            unicode_capitalize, 1);
}
unicode.c - 源码说明

本页面展示了「一个关于韩语unicode编码的C语言库文件」中的 unicode.c 源码文件，采用 C语言编程语言编写，共 667 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与unicode相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?