📄 xmltok.c
字号:
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd See the file COPYING for copying permission.*/#ifdef COMPILED_FROM_DSP#include "winconfig.h"#elif defined(OS2_32)#include "os2config.h"#elif defined(__MSDOS__)#include "dosconfig.h"#elif defined(MACOS_CLASSIC)#include "macconfig.h"#else#include "expat_config.h"#endif /* ndef COMPILED_FROM_DSP */#include "internal.h"#include "xmltok.h"#include "nametab.h"#ifdef XML_DTD#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)#else#define IGNORE_SECTION_TOK_VTABLE /* as nothing */#endif#define VTABLE1 \ { PREFIX(prologTok), PREFIX(contentTok), \ PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ PREFIX(sameName), \ PREFIX(nameMatchesAscii), \ PREFIX(nameLength), \ PREFIX(skipS), \ PREFIX(getAtts), \ PREFIX(charRefNumber), \ PREFIX(predefinedEntityName), \ PREFIX(updatePosition), \ PREFIX(isPublicId)#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)#define UCS2_GET_NAMING(pages, hi, lo) \ (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))/* A 2 byte UTF-8 representation splits the characters 11 bits between the bottom 5 and 6 bits of the bytes. We need 8 bits to index into pages, 3 bits to add to that index and 5 bits to generate the mask.*/#define UTF8_GET_NAMING2(pages, byte) \ (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ + ((((byte)[0]) & 3) << 1) \ + ((((byte)[1]) >> 5) & 1)] \ & (1 << (((byte)[1]) & 0x1F)))/* A 3 byte UTF-8 representation splits the characters 16 bits between the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index into pages, 3 bits to add to that index and 5 bits to generate the mask.*/#define UTF8_GET_NAMING3(pages, byte) \ (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ + ((((byte)[1]) >> 2) & 0xF)] \ << 3) \ + ((((byte)[1]) & 3) << 1) \ + ((((byte)[2]) >> 5) & 1)] \ & (1 << (((byte)[2]) & 0x1F)))#define UTF8_GET_NAMING(pages, p, n) \ ((n) == 2 \ ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ : ((n) == 3 \ ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ : 0))/* Detection of invalid UTF-8 sequences is based on Table 3.1B of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ with the additional restriction of not allowing the Unicode code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). Implementation details: (A & 0x80) == 0 means A < 0x80 and (A & 0xC0) == 0xC0 means A > 0xBF*/#define UTF8_INVALID2(p) \ ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)#define UTF8_INVALID3(p) \ (((p)[2] & 0x80) == 0 \ || \ ((*p) == 0xEF && (p)[1] == 0xBF \ ? \ (p)[2] > 0xBD \ : \ ((p)[2] & 0xC0) == 0xC0) \ || \ ((*p) == 0xE0 \ ? \ (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))#define UTF8_INVALID4(p) \ (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ || \ ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ || \ ((*p) == 0xF0 \ ? \ (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))static int PTRFASTCALLisNever(const ENCODING *enc, const char *p){ return 0;}static int PTRFASTCALLutf8_isName2(const ENCODING *enc, const char *p){ return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);}static int PTRFASTCALLutf8_isName3(const ENCODING *enc, const char *p){ return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);}#define utf8_isName4 isNeverstatic int PTRFASTCALLutf8_isNmstrt2(const ENCODING *enc, const char *p){ return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);}static int PTRFASTCALLutf8_isNmstrt3(const ENCODING *enc, const char *p){ return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);}#define utf8_isNmstrt4 isNeverstatic int PTRFASTCALLutf8_isInvalid2(const ENCODING *enc, const char *p){ return UTF8_INVALID2((const unsigned char *)p);}static int PTRFASTCALLutf8_isInvalid3(const ENCODING *enc, const char *p){ return UTF8_INVALID3((const unsigned char *)p);}static int PTRFASTCALLutf8_isInvalid4(const ENCODING *enc, const char *p){ return UTF8_INVALID4((const unsigned char *)p);}struct normal_encoding { ENCODING enc; unsigned char type[256];#ifdef XML_MIN_SIZE int (PTRFASTCALL *byteType)(const ENCODING *, const char *); int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); int (PTRCALL *charMatches)(const ENCODING *, const char *, int);#endif /* XML_MIN_SIZE */ int (PTRFASTCALL *isName2)(const ENCODING *, const char *); int (PTRFASTCALL *isName3)(const ENCODING *, const char *); int (PTRFASTCALL *isName4)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);};#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))#ifdef XML_MIN_SIZE#define STANDARD_VTABLE(E) \ E ## byteType, \ E ## isNameMin, \ E ## isNmstrtMin, \ E ## byteToAscii, \ E ## charMatches,#else#define STANDARD_VTABLE(E) /* as nothing */#endif#define NORMAL_VTABLE(E) \ E ## isName2, \ E ## isName3, \ E ## isName4, \ E ## isNmstrt2, \ E ## isNmstrt3, \ E ## isNmstrt4, \ E ## isInvalid2, \ E ## isInvalid3, \ E ## isInvalid4static int FASTCALL checkCharRefNumber(int);#include "xmltok_impl.h"#include "ascii.h"#ifdef XML_MIN_SIZE#define sb_isNameMin isNever#define sb_isNmstrtMin isNever#endif#ifdef XML_MIN_SIZE#define MINBPC(enc) ((enc)->minBytesPerChar)#else/* minimum bytes per character */#define MINBPC(enc) 1#endif#define SB_BYTE_TYPE(enc, p) \ (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])#ifdef XML_MIN_SIZEstatic int PTRFASTCALLsb_byteType(const ENCODING *enc, const char *p){ return SB_BYTE_TYPE(enc, p);}#define BYTE_TYPE(enc, p) \ (AS_NORMAL_ENCODING(enc)->byteType(enc, p))#else#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)#endif#ifdef XML_MIN_SIZE#define BYTE_TO_ASCII(enc, p) \ (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))static int PTRFASTCALLsb_byteToAscii(const ENCODING *enc, const char *p){ return *p;}#else#define BYTE_TO_ASCII(enc, p) (*(p))#endif#define IS_NAME_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))#define IS_NMSTRT_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))#define IS_INVALID_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))#ifdef XML_MIN_SIZE#define IS_NAME_CHAR_MINBPC(enc, p) \ (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))#else#define IS_NAME_CHAR_MINBPC(enc, p) (0)#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)#endif#ifdef XML_MIN_SIZE#define CHAR_MATCHES(enc, p, c) \ (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))static int PTRCALLsb_charMatches(const ENCODING *enc, const char *p, int c){ return *p == c;}#else/* c is an ASCII character */#define CHAR_MATCHES(enc, p, c) (*(p) == c)#endif#define PREFIX(ident) normal_ ## ident#include "xmltok_impl.c"#undef MINBPC#undef BYTE_TYPE#undef BYTE_TO_ASCII#undef CHAR_MATCHES#undef IS_NAME_CHAR#undef IS_NAME_CHAR_MINBPC#undef IS_NMSTRT_CHAR#undef IS_NMSTRT_CHAR_MINBPC#undef IS_INVALID_CHARenum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ UTF8_cval1 = 0x00, UTF8_cval2 = 0xc0, UTF8_cval3 = 0xe0, UTF8_cval4 = 0xf0};static void PTRCALLutf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim){ char *to; const char *from; if (fromLim - *fromP > toLim - *toP) { /* Avoid copying partial characters. */ for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) break; } for (to = *toP, from = *fromP; from != fromLim; from++, to++) *to = *from; *fromP = from; *toP = to;}static void PTRCALLutf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim){ unsigned short *to = *toP; const char *from = *fromP; while (from != fromLim && to != toLim) { switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { case BT_LEAD2: *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); from += 2; break; case BT_LEAD3: *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); from += 3; break; case BT_LEAD4: { unsigned long n; if (to + 1 == toLim) goto after; n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); n -= 0x10000; to[0] = (unsigned short)((n >> 10) | 0xD800); to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); to += 2; from += 4; } break; default: *to++ = *from++; break; } }after: *fromP = from; *toP = to;}#ifdef XML_NSstatic const struct normal_encoding utf8_encoding_ns = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, {#include "asciitab.h"#include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};#endifstatic const struct normal_encoding utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, {#define BT_COLON BT_NMSTRT#include "asciitab.h"#undef BT_COLON#include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};#ifdef XML_NSstatic const struct normal_encoding internal_utf8_encoding_ns = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, {#include "iasciitab.h"#include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};#endifstatic const struct normal_encoding internal_utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, {#define BT_COLON BT_NMSTRT#include "iasciitab.h"#undef BT_COLON#include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};static void PTRCALLlatin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim){ for (;;) { unsigned char c; if (*fromP == fromLim) break; c = (unsigned char)**fromP; if (c & 0x80) { if (toLim - *toP < 2) break; *(*toP)++ = (char)((c >> 6) | UTF8_cval2); *(*toP)++ = (char)((c & 0x3f) | 0x80); (*fromP)++; } else { if (*toP == toLim) break; *(*toP)++ = *(*fromP)++; } }}static void PTRCALLlatin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim){ while (*fromP != fromLim && *toP != toLim) *(*toP)++ = (unsigned char)*(*fromP)++;}#ifdef XML_NSstatic const struct normal_encoding latin1_encoding_ns = { { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, {#include "asciitab.h"#include "latin1tab.h" }, STANDARD_VTABLE(sb_)};#endifstatic const struct normal_encoding latin1_encoding = { { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, {#define BT_COLON BT_NMSTRT#include "asciitab.h"#undef BT_COLON#include "latin1tab.h" }, STANDARD_VTABLE(sb_)};static void PTRCALLascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim){ while (*fromP != fromLim && *toP != toLim) *(*toP)++ = *(*fromP)++;}#ifdef XML_NSstatic const struct normal_encoding ascii_encoding_ns = { { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, {#include "asciitab.h"/* BT_NONXML == 0 */ }, STANDARD_VTABLE(sb_)};#endifstatic const struct normal_encoding ascii_encoding = { { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, {#define BT_COLON BT_NMSTRT#include "asciitab.h"#undef BT_COLON/* BT_NONXML == 0 */ }, STANDARD_VTABLE(sb_)};static int PTRFASTCALLunicode_byte_type(char hi, char lo){ switch ((unsigned char)hi) { case 0xD8: case 0xD9: case 0xDA: case 0xDB: return BT_LEAD4; case 0xDC: case 0xDD: case 0xDE: case 0xDF: return BT_TRAIL; case 0xFF: switch ((unsigned char)lo) { case 0xFF: case 0xFE: return BT_NONXML; } break; } return BT_NONASCII;}#define DEFINE_UTF16_TO_UTF8(E) \static void PTRCALL \E ## toUtf8(const ENCODING *enc, \ const char **fromP, const char *fromLim, \ char **toP, const char *toLim) \{ \ const char *from; \ for (from = *fromP; from != fromLim; from += 2) { \ int plane; \ unsigned char lo2; \ unsigned char lo = GET_LO(from); \ unsigned char hi = GET_HI(from); \ switch (hi) { \ case 0: \ if (lo < 0x80) { \ if (*toP == toLim) { \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -