📄 gen_collate.c
字号:
/* TODO: * * add UNDEFINED at end if not specified * convert POSITION -> FORWARD,POSITION * * * deal with lowercase in <Uhhhh> * * what about reorders that keep the same rule? * * remove "unused" collation elements? (probably doesn't save much) * * add_rule function ... returns index into rule table after possibly adding custom-indexed rule * but don't forget about multichar weights... replace with strings of indexes * */#ifndef _GNU_SOURCE#define _GNU_SOURCE#endif#include <stddef.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <stdint.h>#include <stdarg.h>#include <limits.h>#include <ctype.h>#include <assert.h>#include <search.h>typedef struct { char *name; /* */ int num_weights; /* */ int ii_shift; /* */ int ti_shift; /* */ int ii_len; /* */ int ti_len; /* */ int max_weight; /* */ int num_col_base; /* */ int max_col_index; /* */ int undefined_idx; /* */ int range_low; /* */ int range_count; /* high - low */ int range_base_weight; /* */ int num_starters; /* */ int range_rule_offset; /* */ int wcs2colidt_offset; /* */ int index2weight_offset; /* */ int index2ruleidx_offset; /* */ int multistart_offset; /* */} base_locale_t;#define BASE_LOCALE_LEN 20static base_locale_t base_locale_array[BASE_LOCALE_LEN];static size_t base_locale_len;typedef struct { char *name; /* */ int base_idx; /* */ int undefined_idx; /* */ int overrides_offset; /* */ int multistart_offset; /* */} der_locale_t;#define DER_LOCALE_LEN 300static der_locale_t der_locale_array[DER_LOCALE_LEN];static size_t der_locale_len;#define OVERRIDE_LEN 50000static uint16_t override_buffer[OVERRIDE_LEN];static size_t override_len;#define MULTISTART_LEN 10000static uint16_t multistart_buffer[MULTISTART_LEN];static size_t multistart_len;#define WCS2COLIDT_LEN 200000static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN];static size_t wcs2colidt_len;#define INDEX2WEIGHT_LEN 200000static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN];static size_t index2weight_len;static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN];static size_t index2ruleidx_len;#define WEIGHTSTR_LEN 10000static uint16_t weightstr_buffer[WEIGHTSTR_LEN];static size_t weightstr_len;#define RULETABLE_LEN (1L<<16)static uint16_t ruletable_buffer[RULETABLE_LEN];static size_t ruletable_len;#define RANGE (0x10000UL)typedef uint16_t tbl_item;static uint16_t u16_buf[10000];static int u16_buf_len;static int u16_starter;typedef struct { uint16_t ii_len; uint16_t ti_len; uint16_t ut_len; unsigned char ii_shift; unsigned char ti_shift; tbl_item *ii; tbl_item *ti; tbl_item *ut;} table_data;static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl);#define MAX_COLLATION_WEIGHTS 4#define MAX_FNO 1#define MAX_FILES (MAX_FNO + 1)static FILE *fstack[MAX_FILES];static char *fname[MAX_FILES];static int lineno[MAX_FILES];static int fno = -1;static tbl_item wcs2index[RANGE];static char linebuf[1024];static char *pos;static char *pos_e = NULL;static char end_of_token = 0; /* slot to save */#define IN_ORDER 0x01#define IN_REORDER 0x02#define IN_REORDER_SECTIONS 0x04static int order_state;static int cur_num_weights; /* number of weights in current use */static char cur_rule[MAX_COLLATION_WEIGHTS];static int anonsection = 0;typedef struct ll_item_struct ll_item_t;struct ll_item_struct { ll_item_t *next; ll_item_t *prev; void *data; int data_type; int idx;};static ll_item_t *reorder_section_ptr = NULL;static int superset;static int superset_order_start_cnt; /* only support one order for now */static int superset_in_sync;static ll_item_t *comm_cur_ptr;static ll_item_t *comm_prev_ptr;enum { R_FORWARD = 0x01, R_POSITION = 0x02, R_BACKWARD = 0x04 /* must be largest in value */};typedef struct { size_t num_weights; char rule[MAX_COLLATION_WEIGHTS]; const char *colitem[MAX_COLLATION_WEIGHTS];} weight_t;static void *root_weight = NULL;size_t unique_weights = 0;typedef struct { const char *symbol; weight_t *weight;} weighted_item_t;typedef struct { const char *symbol1; const char *symbol2; int length; weight_t *weight;} range_item_t;typedef struct { const char *name; ll_item_t *itm_list; /* weighted_item_t list .. circular!!! */ size_t num_items; size_t num_rules; char rules[MAX_COLLATION_WEIGHTS];} section_t;static section_t *cur_section = NULL;typedef struct { const char *symbol; ll_item_t *node;} wi_index_t;typedef struct col_locale_struct col_locale_t;struct col_locale_struct { char *name; void *root_colitem; /* all base and derived, or just derived */ void *root_element; void *root_scripts; void *root_wi_index; void *root_wi_index_reordered; ll_item_t *section_list; col_locale_t *base_locale; /* null if this is a base */ void *root_derived_wi; ll_item_t *derived_list; void *root_starter_char; void *root_starter_all; ll_item_t *undefined_idx;};typedef struct { const char *symbol; int idx;} col_index_t;static void *root_col_locale = NULL;typedef struct { const char *keyword; void (*handler)(void);} keyword_table_t;typedef struct { const char *string; const char *element; /* NULL if collating symbol */} colitem_t;static col_locale_t *cur_base = NULL;static col_locale_t *cur_derived = NULL;static col_locale_t *cur_col = NULL;static void *root_sym = NULL;static size_t num_sym = 0;static size_t mem_sym = 0;static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2)));static void *xmalloc(size_t n);static char *xsymdup(const char *s); /* only allocate once... store in a tree */static void pushfile(char *filename);static void popfile(void);static void processfile(void);static int iscommentchar(int);static void eatwhitespace(void);static int next_line(void);static char *next_token(void);static void do_unrecognized(void);static col_locale_t *new_col_locale(char *name);static ll_item_t *new_ll_item(int data_type, void *data);static weight_t *register_weight(weight_t *w);static size_t ll_len(ll_item_t *l);static size_t ll_count(ll_item_t *l, int mask);static void add_wi_index(ll_item_t *l);static size_t tnumnodes(const void *root);static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl);static void mark_reordered(const char *sym);static ll_item_t *find_wi_index_reordered(const char *sym);static ll_item_t *next_comm_ptr(void);static ll_item_t *init_comm_ptr(void);static ll_item_t *find_ll_last(ll_item_t *p);static void dump_weights(const char *name);static void finalize_base(void);static int is_ucode(const char *s);static int sym_cmp(const void *n1, const void *n2);static void do_starter_lists(col_locale_t *cl);static void dump_base_locale(int n);static void dump_der_locale(int n);static void dump_collate(FILE *fp);enum { DT_SECTION = 0x01, DT_WEIGHTED = 0x02, DT_REORDER = 0x04, /* a section to support reorder_after */ DT_COL_LOCALE = 0x08, DT_RANGE = 0x10,};static section_t *new_section(const char *name){ section_t *p; char buf[128]; p = xmalloc(sizeof(section_t)); if (!name) { /* anonymous section */ name = buf; snprintf(buf, sizeof(buf), "anon%05d", anonsection); ++anonsection; } else if (*name != '<') { /* reorder */ name = buf; snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection); ++anonsection; }#warning devel code/* fprintf(stderr, "section %s\n", name); */ p->name = xsymdup(name); p->itm_list = NULL; p->num_items = 0; p->num_rules = 0; memset(p->rules, 0, MAX_COLLATION_WEIGHTS);/* cur_num_weights = p->num_rules = 0; *//* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); *//* memset(cur_rule, R_FORWARD, 4); */#warning devel code if (*p->name == 'a') { cur_num_weights = p->num_rules = 4; memset(p->rules, R_FORWARD, 4); memset(cur_rule, R_FORWARD, 4); p->rules[3] |= R_POSITION; cur_rule[3] |= R_POSITION; }/* fprintf(stderr, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */ return p;}static void do_order_start(void);static void do_order_end(void);static void do_reorder_after(void);static void do_reorder_end(void);static void do_reorder_sections_after(void);static void do_reorder_sections_end(void);static void do_copy(void);static void do_colsym(void);static void do_colele(void);static void do_script(void);static void do_range(void);static col_locale_t *new_col_locale(char *name);static int colitem_cmp(const void *n1, const void *n2);static int colelement_cmp(const void *n1, const void *n2);static void del_colitem(colitem_t *p);static colitem_t *new_colitem(char *item, char *def);static void add_colitem(char *item, char *def);static void add_script(const char *s);static unsigned int add_rule(weighted_item_t *wi);static unsigned int add_range_rule(range_item_t *ri);static const keyword_table_t keyword_table[] = { { "collating-symbol", do_colsym }, { "collating-element", do_colele }, { "script", do_script }, { "copy", do_copy }, { "order_start", do_order_start }, { "order_end", do_order_end }, { "order-end", do_order_end }, { "reorder-after", do_reorder_after }, { "reorder-end", do_reorder_end }, { "reorder-sections-after", do_reorder_sections_after }, { "reorder-sections-end", do_reorder_sections_end }, { "UCLIBC_RANGE", do_range }, { NULL, do_unrecognized }};static void do_unrecognized(void){#if 1 error_msg("warning: unrecognized: %s", pos);#else/* fprintf(stderr, "warning: unrecognized initial keyword \"%s\"\n", pos); */ fprintf(stderr, "warning: unrecognized: %s", pos); if (end_of_token) { fprintf(stderr, "%c%s", end_of_token, pos_e+1); } fprintf(stderr, "\n");#endif}/* typedef struct { *//* const char *symbol1; *//* const char *symbol2; *//* int length; *//* weight_t *weight; *//* } range_item_t; */static void do_range(void){ range_item_t *ri; weight_t w; int i; char *s; char *s1; char *s2; const char **ci; ll_item_t *lli; assert(!superset); assert(order_state == IN_ORDER); s1 = next_token(); if (!s1) { error_msg("missing start of range"); } if (!is_ucode(s1)) { error_msg("start of range is not a ucode: %s", s1); } s1 = xsymdup(s1); s2 = next_token(); if (!s2) { error_msg("missing end of range"); } if (!is_ucode(s2)) { error_msg("end of range is not a ucode: %s", s2); } s2 = xsymdup(s2); ri = (range_item_t *) xmalloc(sizeof(range_item_t)); ri->symbol1 = s1; ri->symbol2 = s2; ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16); if (ri->length <= 0) { error_msg("illegal range length %d", ri->length); } s = next_token(); w.num_weights = cur_num_weights; for (i=0 ; i < cur_num_weights ; i++) { w.rule[i] = cur_rule[i]; } ci = w.colitem + (i-1); /* now i == cur_num_weights */#define STR_DITTO "." while (s && *s && i) { --i; if (*s == ';') { ci[-i] = xsymdup(STR_DITTO); if (*++s) { continue; } } if (*s) { ci[-i] = xsymdup(s); } s = next_token(); if (s) { if (*s == ';') { ++s; } else if (i) { error_msg("missing seperator"); } } } if (s) { error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); } while (i) { /* missing weights are not an error */ --i; ci[-i] = xsymdup(STR_DITTO); } ri->weight = register_weight(&w);/* if ((i = is_ucode(t)) != 0) { *//* assert(!t[i]); *//* add_colitem(t, NULL); *//* } */ lli = new_ll_item(DT_RANGE, ri); if (!cur_section->itm_list) {/* printf("creating new item list: %s\n", wi->symbol); */ cur_section->itm_list = lli; lli->prev = lli->next = lli; ++cur_section->num_items; } else { insque(lli, cur_section->itm_list->prev);/* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */ ++cur_section->num_items; }/* add_wi_index(lli); */}static weighted_item_t *add_weight(char *t){ weighted_item_t *wi; weight_t w; int i; char *s; const char **ci; t = xsymdup(t); s = next_token(); w.num_weights = cur_num_weights; for (i=0 ; i < cur_num_weights ; i++) { w.rule[i] = cur_rule[i]; } ci = w.colitem + (i-1); /* now i == cur_num_weights */ while (s && *s && i) { --i; if (*s == ';') { ci[-i] = xsymdup(STR_DITTO); if (*++s) { continue; } } if (*s) { if (!strcmp(s,t)) { s = STR_DITTO; } ci[-i] = xsymdup(s); } s = next_token(); if (s) { if (*s == ';') { ++s; } else if (i) { error_msg("missing seperator"); } } } if (s) { error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); } while (i) { /* missing weights are not an error */ --i; ci[-i] = xsymdup(STR_DITTO); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -