📄 cjk_tab_to_h.c

📁 libiconv是一个很不错的字符集转换库。程序接口也很简单
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* Copyright (C) 1999-2002 Free Software Foundation, Inc.   This file is part of the GNU LIBICONV Tools.   This program is free software; you can redistribute it and/or modify   it under the terms of the GNU General Public License as published by   the Free Software Foundation; either version 2, or (at your option)   any later version.   This program is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   GNU General Public License for more details.   You should have received a copy of the GNU General Public License   along with this program; if not, write to the Free Software Foundation,   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  *//* * Generates a CJK character set table from a .TXT table as found on * ftp.unicode.org or in the X nls directory. * Examples: * *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601 * *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT * *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT * *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT * *   ./cjk_tab_to_h JISX0213:2000 jisx0213 > jisx0213.h < JISX0213.TXT */#include <stdio.h>#include <stdlib.h>#include <stdbool.h>#include <string.h>#include <ctype.h>#include <assert.h>typedef struct {  int start;  int end;} Block;typedef struct {  int rows;    /* number of possible values for the 1st byte */  int cols;    /* number of possible values for the 2nd byte */  int (*row_byte) (int row); /* returns the 1st byte value for a given row */  int (*col_byte) (int col); /* returns the 2nd byte value for a given col */  int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */  int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */  const char* check_row_expr; /* format string for 1st byte value checking */  const char* check_col_expr; /* format string for 2nd byte value checking */  const char* byte_row_expr; /* format string for 1st byte value to row */  const char* byte_col_expr; /* format string for 2nd byte value to col */  int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */  int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */  int ncharsetblocks;  Block* charsetblocks; /* blocks[0..nblocks-1] */  int* uni2charset; /* uni2charset[0x0000..0xffff] */  int fffd;    /* uni representation of the invalid character */} Encoding;/* * Outputs the file title. */static void output_title (const char *charsetname){  printf("/*\n");  printf(" * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n");  printf(" * This file is part of the GNU LIBICONV Library.\n");  printf(" *\n");  printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");  printf(" * and/or modify it under the terms of the GNU Library General Public\n");  printf(" * License as published by the Free Software Foundation; either version 2\n");  printf(" * of the License, or (at your option) any later version.\n");  printf(" *\n");  printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");  printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");  printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n");  printf(" * Library General Public License for more details.\n");  printf(" *\n");  printf(" * You should have received a copy of the GNU Library General Public\n");  printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");  printf(" * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n");  printf(" * Suite 330, Boston, MA 02111-1307, USA.\n");  printf(" */\n");  printf("\n");  printf("/*\n");  printf(" * %s\n", charsetname);  printf(" */\n");  printf("\n");}/* * Reads the charset2uni table from standard input. */static void read_table (Encoding* enc){  int row, col, i, i1, i2, c, j;  enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));  for (row = 0; row < enc->rows; row++)    enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));  for (row = 0; row < enc->rows; row++)    for (col = 0; col < enc->cols; col++)      enc->charset2uni[row][col] = 0xfffd;  c = getc(stdin);  ungetc(c,stdin);  if (c == '#') {    /* Read a unicode.org style .TXT file. */    for (;;) {      c = getc(stdin);      if (c == EOF)        break;      if (c == '\n' || c == ' ' || c == '\t')        continue;      if (c == '#') {        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));        continue;      }      ungetc(c,stdin);      if (scanf("0x%x", &j) != 1)        exit(1);      i1 = j >> 8;      i2 = j & 0xff;      row = enc->byte_row(i1);      col = enc->byte_col(i2);      if (row < 0 || col < 0) {        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);        exit(1);      }      if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)        exit(1);    }  } else {    /* Read a table of hexadecimal Unicode values. */    for (i1 = 32; i1 < 132; i1++)      for (i2 = 32; i2 < 132; i2++) {        i = scanf("%x", &j);        if (i == EOF)          goto read_done;        if (i != 1)          exit(1);        if (j < 0 || j == 0xffff)          j = 0xfffd;        if (j != 0xfffd) {          if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {            fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);            exit (1);          }          enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;        }      }   read_done: ;  }}/* * Determine whether the Unicode range goes outside the BMP. */static bool is_charset2uni_large (Encoding* enc){  int row, col;  for (row = 0; row < enc->rows; row++)    for (col = 0; col < enc->cols; col++)      if (enc->charset2uni[row][col] >= 0x10000)        return true;  return false;}/* * Compactify the Unicode range by use of an auxiliary table, * so 16 bits suffice to store each value. */static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift){  unsigned int shift;  for (shift = 8; ; shift--) {    int *upages = (int *) malloc((0x110000>>shift) * sizeof(int));    int i, row, col, nurows;    for (i = 0; i < 0x110000>>shift; i++)      upages[i] = -1;    for (row = 0; row < enc->rows; row++)      for (col = 0; col < enc->cols; col++)        upages[enc->charset2uni[row][col] >> shift] = 0;    nurows = 0;    for (i = 0; i < 0x110000>>shift; i++)      if (upages[i] == 0)        nurows++;    /* We want all table entries to fit in an 'unsigned short'. */    if (nurows <= 1<<(16-shift)) {      int** old_charset2uni;      *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));      *urowshift = shift;      nurows = 0;      for (i = 0; i < 0x110000>>shift; i++)        if (upages[i] == 0) {          upages[i] = nurows;          (*urows)[nurows] = i;          nurows++;        }      old_charset2uni = enc->charset2uni;      enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));      for (row = 0; row < enc->rows; row++)        enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));      for (row = 0; row < enc->rows; row++)        for (col = 0; col < enc->cols; col++) {          int u = old_charset2uni[row][col];          enc->charset2uni[row][col] =            (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));        }      enc->fffd =        (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));      return nurows;    }  }  abort();}/* * Computes the charsetpage[0..rows] array. */static void find_charset2uni_pages (Encoding* enc){  int row, col;  enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));  for (row = 0; row <= enc->rows; row++)    enc->charsetpage[row] = 0;  for (row = 0; row < enc->rows; row++) {    int used = 0;    for (col = 0; col < enc->cols; col++)      if (enc->charset2uni[row][col] != enc->fffd)        used = col+1;    enc->charsetpage[row] = used;  }}/* * Fills in nblocks and blocks. */static void find_charset2uni_blocks (Encoding* enc){  int n, row, lastrow;  enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));  n = 0;  for (row = 0; row < enc->rows; row++)    if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {      for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);      enc->charsetblocks[n].start = row * enc->cols;      enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];      n++;    }  enc->ncharsetblocks = n;}/* * Outputs the charset to unicode table and function. */static void output_charset2uni (const char* name, Encoding* enc){  int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;  bool is_large;  unsigned int* urows;  unsigned int urowshift;  Encoding tmpenc;  is_large = is_charset2uni_large(enc);  if (is_large) {    /* Use a temporary copy of enc. */    tmpenc = *enc;    enc = &tmpenc;    nurows = compact_large_charset2uni(enc,&urows,&urowshift);  } else {    nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;  }  find_charset2uni_pages(enc);  find_charset2uni_blocks(enc);  for (row = 0; row < enc->rows; row++)    if (enc->charsetpage[row] > 0) {      if (row == 0 || enc->charsetpage[row-1] == 0) {        /* Start a new block. */        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",               name, enc->row_byte(row),               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);      }      printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);      for (col = 0; col < col_max; col++) {        printf(" 0x%04x,", enc->charset2uni[row][col]);        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");      }      printf("\n");      if (enc->charsetpage[row+1] == 0) {        /* End a block. */        printf("};\n");      }    }  printf("\n");  if (is_large) {    printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);    for (i = 0; i < nurows; i++) {      printf(" 0x%05x,", urows[i] << urowshift);      if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");    }    printf("\n");    printf("};\n");    printf("\n");  }  printf("static int\n");  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);  printf("{\n");  printf("  unsigned char c1 = s[0];\n");  printf("  if (");  for (i = 0; i < enc->ncharsetblocks; i++) {    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);    if (i > 0)      printf(" || ");    if (i1_min == i1_max)      printf("(c1 == 0x%02x)", i1_min);    else      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);  }  printf(") {\n");  printf("    if (n >= 2) {\n");  printf("      unsigned char c2 = s[1];\n");  printf("      if (");  printf(enc->check_col_expr, "c2");  printf(") {\n");  printf("        unsigned int i = %d * (", enc->cols);  printf(enc->byte_row_expr, "c1");  printf(") + (");  printf(enc->byte_col_expr, "c2");  printf(");\n");  printf("        %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");  if (is_large) printf("        unsigned short swc;\n");  for (i = 0; i < enc->ncharsetblocks; i++) {    printf("        ");    if (i > 0)      printf("} else ");    if (i < enc->ncharsetblocks-1)      printf("if (i < %d) ", enc->charsetblocks[i+1].start);    printf("{\n");    printf("          if (i < %d)\n", enc->charsetblocks[i].end);    printf("            %s = ", is_large ? "swc" : "wc");    printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));    if (enc->charsetblocks[i].start > 0)      printf("-%d", enc->charsetblocks[i].start);    printf("]");    if (is_large) printf(",\n            wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);    printf(";\n");  }  printf("        }\n");  printf("        if (wc != 0xfffd) {\n");  printf("          *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");  printf("          return 2;\n");  printf("        }\n");  printf("      }\n");  printf("      return RET_ILSEQ;\n");  printf("    }\n");  printf("    return RET_TOOFEW(0);\n");  printf("  }\n");  printf("  return RET_ILSEQ;\n");  printf("}\n");  printf("\n");}/* * Outputs the charset to unicode table and function. * (Suitable if the mapping function is well defined, i.e. has no holes, and * is monotonically increasing with small gaps only.) */static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc){  int row, col, lastrow, r, col_max, i, i1_min, i1_max;  /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and     enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]     is always < 0x100. */
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -