📄 cjk_tab_to_h.c
字号:
/* Copyright (C) 1999-2002 Free Software Foundation, Inc. This file is part of the GNU LIBICONV Tools. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *//* * Generates a CJK character set table from a .TXT table as found on * ftp.unicode.org or in the X nls directory. * Examples: * * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601 * * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT * * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT * * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT * * ./cjk_tab_to_h JISX0213:2000 jisx0213 > jisx0213.h < JISX0213.TXT */#include <stdio.h>#include <stdlib.h>#include <stdbool.h>#include <string.h>#include <ctype.h>#include <assert.h>typedef struct { int start; int end;} Block;typedef struct { int rows; /* number of possible values for the 1st byte */ int cols; /* number of possible values for the 2nd byte */ int (*row_byte) (int row); /* returns the 1st byte value for a given row */ int (*col_byte) (int col); /* returns the 
2nd byte value for a given col */ int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */ int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */ const char* check_row_expr; /* format string for 1st byte value checking */ const char* check_col_expr; /* format string for 2nd byte value checking */ const char* byte_row_expr; /* format string for 1st byte value to row */ const char* byte_col_expr; /* format string for 2nd byte value to col */ int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */ /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book. Once a row is fixed, choosing a "col" is the same as choosing a "cell". */ int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */ int ncharsetblocks; Block* charsetblocks; /* blocks[0..nblocks-1] */ int* uni2charset; /* uni2charset[0x0000..0xffff] */ int fffd; /* uni representation of the invalid character */} Encoding;/* * Outputs the file title. */static void output_title (const char *charsetname){ printf("/*\n"); printf(" * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n"); printf(" * This file is part of the GNU LIBICONV Library.\n"); printf(" *\n"); printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n"); printf(" * and/or modify it under the terms of the GNU Library General Public\n"); printf(" * License as published by the Free Software Foundation; either version 2\n"); printf(" * of the License, or (at your option) any later version.\n"); printf(" *\n"); printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n"); printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU\n"); printf(" * Library General Public License for more details.\n"); printf(" *\n"); printf(" * You should have received a copy of the GNU Library General Public\n"); printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n"); printf(" * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n"); printf(" * Suite 330, Boston, MA 02111-1307, USA.\n"); printf(" */\n"); printf("\n"); printf("/*\n"); printf(" * %s\n", charsetname); printf(" */\n"); printf("\n");}/* * Reads the charset2uni table from standard input. */static void read_table (Encoding* enc){ int row, col, i, i1, i2, c, j; enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*)); for (row = 0; row < enc->rows; row++) enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int)); for (row = 0; row < enc->rows; row++) for (col = 0; col < enc->cols; col++) enc->charset2uni[row][col] = 0xfffd; c = getc(stdin); ungetc(c,stdin); if (c == '#') { /* Read a unicode.org style .TXT file. */ for (;;) { c = getc(stdin); if (c == EOF) break; if (c == '\n' || c == ' ' || c == '\t') continue; if (c == '#') { do { c = getc(stdin); } while (!(c == EOF || c == '\n')); continue; } ungetc(c,stdin); if (scanf("0x%x", &j) != 1) exit(1); i1 = j >> 8; i2 = j & 0xff; row = enc->byte_row(i1); col = enc->byte_col(i2); if (row < 0 || col < 0) { fprintf(stderr, "lost entry for %02x %02x\n", i1, i2); exit(1); } if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1) exit(1); } } else { /* Read a table of hexadecimal Unicode values. */ for (i1 = 32; i1 < 132; i1++) for (i2 = 32; i2 < 132; i2++) { i = scanf("%x", &j); if (i == EOF) goto read_done; if (i != 1) exit(1); if (j < 0 || j == 0xffff) j = 0xfffd; if (j != 0xfffd) { if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) { fprintf(stderr, "lost entry at %02x %02x\n", i1, i2); exit (1); } enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j; } } read_done: ; }}/* * Determine whether the Unicode range goes outside the BMP. 
*/static bool is_charset2uni_large (Encoding* enc){ int row, col; for (row = 0; row < enc->rows; row++) for (col = 0; col < enc->cols; col++) if (enc->charset2uni[row][col] >= 0x10000) return true; return false;}/* * Compactify the Unicode range by use of an auxiliary table, * so 16 bits suffice to store each value. */static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift){ unsigned int shift; for (shift = 8; ; shift--) { int *upages = (int *) malloc((0x110000>>shift) * sizeof(int)); int i, row, col, nurows; for (i = 0; i < 0x110000>>shift; i++) upages[i] = -1; for (row = 0; row < enc->rows; row++) for (col = 0; col < enc->cols; col++) upages[enc->charset2uni[row][col] >> shift] = 0; nurows = 0; for (i = 0; i < 0x110000>>shift; i++) if (upages[i] == 0) nurows++; /* We want all table entries to fit in an 'unsigned short'. */ if (nurows <= 1<<(16-shift)) { int** old_charset2uni; *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int)); *urowshift = shift; nurows = 0; for (i = 0; i < 0x110000>>shift; i++) if (upages[i] == 0) { upages[i] = nurows; (*urows)[nurows] = i; nurows++; } old_charset2uni = enc->charset2uni; enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*)); for (row = 0; row < enc->rows; row++) enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int)); for (row = 0; row < enc->rows; row++) for (col = 0; col < enc->cols; col++) { int u = old_charset2uni[row][col]; enc->charset2uni[row][col] = (upages[u >> shift] << shift) | (u & ((1 << shift) - 1)); } enc->fffd = (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1)); return nurows; } } abort();}/* * Computes the charsetpage[0..rows] array. 
 * charsetpage[row] is set to one more than the last column of that row
 * whose charset2uni value differs from enc->fffd, or 0 if the row has no
 * such column.  The extra element charsetpage[rows] stays 0.
 * NOTE(review): the malloc result is used without a NULL check.
 */
static void find_charset2uni_pages (Encoding* enc)
{
  int row, col;

  enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));

  for (row = 0; row <= enc->rows; row++)
    enc->charsetpage[row] = 0;

  for (row = 0; row < enc->rows; row++) {
    int used = 0;
    for (col = 0; col < enc->cols; col++)
      if (enc->charset2uni[row][col] != enc->fffd)
        used = col+1;
    enc->charsetpage[row] = used;
  }
}

/*
 * Fills in nblocks and blocks.
 * A block is a maximal run of consecutive rows with charsetpage[row] > 0.
 * Its start is the linear index (row * cols) of the first row; its end is
 * the linear index just past the used part of the last row.
 * Requires enc->charsetpage to be filled in already.
 * NOTE(review): the malloc result is used without a NULL check.
 */
static void find_charset2uni_blocks (Encoding* enc)
{
  int n, row, lastrow;

  enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));

  n = 0;
  for (row = 0; row < enc->rows; row++)
    /* A block starts at a used row that has no used row right before it. */
    if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
      /* Find the last row of the run; charsetpage[rows] == 0 stops the scan. */
      for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
      enc->charsetblocks[n].start = row * enc->cols;
      enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
      n++;
    }
  enc->ncharsetblocks = n;
}

/*
 * Outputs the charset to unicode table and function.
 *
 * Written to standard output, in this order:
 *  - one "static const unsigned short <name>_2uni_page<XX>[]" array per run
 *    of consecutive used rows, XX being the first byte of the run's first
 *    row;
 *  - if some value is >= 0x10000, a "static const ucs4_t
 *    <name>_2uni_upages[]" array holding the auxiliary table produced by
 *    compact_large_charset2uni;
 *  - the text of a "<name>_mbtowc" function that looks two bytes up in these
 *    arrays.
 *
 * In the large case the work is done on a local copy of *enc, so the
 * caller's Encoding is left untouched; otherwise enc->fffd, enc->charsetpage,
 * enc->charsetblocks and enc->ncharsetblocks of the caller's Encoding are
 * overwritten.
 * NOTE(review): nothing allocated for the local copy (compacted tables,
 * urows, charsetpage, charsetblocks) is freed before returning.
 */
static void output_charset2uni (const char* name, Encoding* enc)
{
  int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
  bool is_large;
  unsigned int* urows;
  unsigned int urowshift;
  Encoding tmpenc;

  is_large = is_charset2uni_large(enc);
  if (is_large) {
    /* Use a temporary copy of enc. */
    tmpenc = *enc;
    enc = &tmpenc;
    nurows = compact_large_charset2uni(enc,&urows,&urowshift);
  } else {
    nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;
  }

  find_charset2uni_pages(enc);

  find_charset2uni_blocks(enc);

  /* Emit the page arrays, one array per run of consecutive used rows. */
  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0) {
      if (row == 0 || enc->charsetpage[row-1] == 0) {
        /* Start a new block. */
        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
               name, enc->row_byte(row),
               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
      }
      /* The comment delimiters are split into adjacent string literals so
         that this source line does not itself contain them. */
      printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
      /* Every row but the last one of a block is written at full width;
         the last one is cut after its last used column. */
      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
      for (col = 0; col < col_max; col++) {
        printf(" 0x%04x,", enc->charset2uni[row][col]);
        /* Eight values per output line. */
        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
      }
      printf("\n");
      if (enc->charsetpage[row+1] == 0) {
        /* End a block. */
        printf("};\n");
      }
    }
  printf("\n");

  /* Emit the auxiliary table: the first code point of each compacted page. */
  if (is_large) {
    printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
    for (i = 0; i < nurows; i++) {
      printf(" 0x%05x,", urows[i] << urowshift);
      if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
    }
    printf("\n");
    printf("};\n");
    printf("\n");
  }

  /* Emit the conversion function. */
  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  /* First byte test: one range (or single value) per block. */
  printf(" if (");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
    if (i > 0)
      printf(" || ");
    if (i1_min == i1_max)
      printf("(c1 == 0x%02x)", i1_min);
    else
      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
  }
  printf(") {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  /* The *_expr members of Encoding are printf formats that take the name of
     the generated byte variable as their argument. */
  printf(" if (");
  printf(enc->check_col_expr, "c2");
  printf(") {\n");
  printf(" unsigned int i = %d * (", enc->cols);
  printf(enc->byte_row_expr, "c1");
  printf(") + (");
  printf(enc->byte_col_expr, "c2");
  printf(");\n");
  printf(" %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
  if (is_large)
    printf(" unsigned short swc;\n");
  /* Emit an if/else chain over the blocks; each branch indexes the block's
     page array with i minus the block's start. */
  for (i = 0; i < enc->ncharsetblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("} else ");
    if (i < enc->ncharsetblocks-1)
      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
    printf("{\n");
    printf(" if (i < %d)\n", enc->charsetblocks[i].end);
    printf(" %s = ", is_large ? "swc" : "wc");
    printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
    if (enc->charsetblocks[i].start > 0)
      printf("-%d", enc->charsetblocks[i].start);
    printf("]");
    /* Large case: expand the 16-bit value through the upages table. */
    if (is_large)
      printf(",\n wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);
    printf(";\n");
  }
  printf(" }\n");
  printf(" if (wc != 0xfffd) {\n");
  printf(" *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");
  printf("\n");
}

/*
 * Outputs the charset to unicode table and function.
 * (Suitable if the mapping function is well defined, i.e. has no holes, and
 * is monotonically increasing with small gaps only.)
 */
static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
{
  int row, col, lastrow, r, col_max, i, i1_min, i1_max;

  /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
     enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
     is always < 0x100. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -