📄 xmltok.c
字号:
staticint doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, const char *), int isGeneralTextEntity, const ENCODING *enc, const char *ptr, const char *end, const char **badPtr, const char **versionPtr, const char **encodingName, const ENCODING **encoding, int *standalone){ const char *val = 0; const char *name = 0; ptr += 5 * enc->minBytesPerChar; end -= 2 * enc->minBytesPerChar; if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) { *badPtr = ptr; return 0; } if (!XmlNameMatchesAscii(enc, name, "version")) { if (!isGeneralTextEntity) { *badPtr = name; return 0; } } else { if (versionPtr) *versionPtr = val; if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { *badPtr = ptr; return 0; } if (!name) { if (isGeneralTextEntity) { /* a TextDecl must have an EncodingDecl */ *badPtr = ptr; return 0; } return 1; } } if (XmlNameMatchesAscii(enc, name, "encoding")) { int c = toAscii(enc, val, end); if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) { *badPtr = val; return 0; } if (encodingName) *encodingName = val; if (encoding) *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { *badPtr = ptr; return 0; } if (!name) return 1; } if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) { *badPtr = name; return 0; } if (XmlNameMatchesAscii(enc, val, "yes")) { if (standalone) *standalone = 1; } else if (XmlNameMatchesAscii(enc, val, "no")) { if (standalone) *standalone = 0; } else { *badPtr = val; return 0; } while (isSpace(toAscii(enc, ptr, end))) ptr += enc->minBytesPerChar; if (ptr != end) { *badPtr = ptr; return 0; } return 1;}staticint checkCharRefNumber(int result){ switch (result >> 8) { case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: case 0xDF: return -1; case 0: if (latin1_encoding.type[result] == BT_NONXML) return -1; break; case 0xFF: if (result == 0xFFFE || result == 0xFFFF) return -1; break; } return result;}int XmlUtf8Encode(int c, char *buf){ enum { /* minN is minimum legal resulting value for N byte sequence */ min2 = 0x80, min3 = 0x800, min4 = 0x10000 }; if (c < 0) return 0; if (c < min2) { buf[0] = (c | UTF8_cval1); return 1; } if (c < min3) { buf[0] = ((c >> 6) | UTF8_cval2); buf[1] = ((c & 0x3f) | 0x80); return 2; } if (c < min4) { buf[0] = ((c >> 12) | UTF8_cval3); buf[1] = (((c >> 6) & 0x3f) | 0x80); buf[2] = ((c & 0x3f) | 0x80); return 3; } if (c < 0x110000) { buf[0] = ((c >> 18) | UTF8_cval4); buf[1] = (((c >> 12) & 0x3f) | 0x80); buf[2] = (((c >> 6) & 0x3f) | 0x80); buf[3] = ((c & 0x3f) | 0x80); return 4; } return 0;}int XmlUtf16Encode(int charNum, unsigned short *buf){ if (charNum < 0) return 0; if (charNum < 0x10000) { buf[0] = charNum; return 1; } if (charNum < 0x110000) { charNum -= 0x10000; buf[0] = (charNum >> 10) + 0xD800; buf[1] = (charNum & 0x3FF) + 0xDC00; return 2; } return 0;}struct unknown_encoding { struct normal_encoding normal; int (*convert)(void *userData, const char *p); void *userData; unsigned short utf16[256]; char utf8[256][4];};int XmlSizeOfUnknownEncoding(){ return sizeof(struct unknown_encoding);}staticint unknown_isName(const ENCODING *enc, const char *p){ int c = ((const struct unknown_encoding *)enc) ->convert(((const struct unknown_encoding *)enc)->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);}staticint unknown_isNmstrt(const ENCODING *enc, const char *p){ int c = ((const struct unknown_encoding *)enc) ->convert(((const struct unknown_encoding *)enc)->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);}staticint unknown_isInvalid(const ENCODING *enc, const char *p){ int c = ((const struct unknown_encoding *)enc) ->convert(((const struct unknown_encoding *)enc)->userData, p); return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;}staticvoid unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim){ char buf[XML_UTF8_ENCODE_MAX]; for (;;) { const char *utf8; int n; if (*fromP == fromLim) break; utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; n = *utf8++; if (n == 0) { int c = ((const struct unknown_encoding *)enc) ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); n = XmlUtf8Encode(c, buf); if (n > toLim - *toP) break; utf8 = buf; *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2); } else { if (n > toLim - *toP) break; (*fromP)++; } do { *(*toP)++ = *utf8++; } while (--n != 0); }}staticvoid unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim){ while (*fromP != fromLim && *toP != toLim) { unsigned short c = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; if (c == 0) { c = (unsigned short)((const struct unknown_encoding *)enc) ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2); } else (*fromP)++; *(*toP)++ = c; }}ENCODING *XmlInitUnknownEncoding(void *mem, int *table, int (*convert)(void *userData, const char *p), void *userData){ int i; struct unknown_encoding *e = mem; for (i = 0; i < (int)sizeof(struct normal_encoding); i++) ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; for (i = 0; i < 128; i++) if (latin1_encoding.type[i] != BT_OTHER && latin1_encoding.type[i] != BT_NONXML && table[i] != i) return 0; for (i = 0; i < 256; i++) { int c = table[i]; if (c == -1) { e->normal.type[i] = BT_MALFORM; /* This shouldn't really get used. */ e->utf16[i] = 0xFFFF; e->utf8[i][0] = 1; e->utf8[i][1] = 0; } else if (c < 0) { if (c < -4) return 0; e->normal.type[i] = BT_LEAD2 - (c + 2); e->utf8[i][0] = 0; e->utf16[i] = 0; } else if (c < 0x80) { if (latin1_encoding.type[c] != BT_OTHER && latin1_encoding.type[c] != BT_NONXML && c != i) return 0; e->normal.type[i] = latin1_encoding.type[c]; e->utf8[i][0] = 1; e->utf8[i][1] = (char)c; e->utf16[i] = c == 0 ? 0xFFFF : c; } else if (checkCharRefNumber(c) < 0) { e->normal.type[i] = BT_NONXML; /* This shouldn't really get used. */ e->utf16[i] = 0xFFFF; e->utf8[i][0] = 1; e->utf8[i][1] = 0; } else { if (c > 0xFFFF) return 0; if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) e->normal.type[i] = BT_NMSTRT; else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) e->normal.type[i] = BT_NAME; else e->normal.type[i] = BT_OTHER; e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); e->utf16[i] = c; } } e->userData = userData; e->convert = convert; if (convert) { e->normal.isName2 = unknown_isName; e->normal.isName3 = unknown_isName; e->normal.isName4 = unknown_isName; e->normal.isNmstrt2 = unknown_isNmstrt; e->normal.isNmstrt3 = unknown_isNmstrt; e->normal.isNmstrt4 = unknown_isNmstrt; e->normal.isInvalid2 = unknown_isInvalid; e->normal.isInvalid3 = unknown_isInvalid; e->normal.isInvalid4 = unknown_isInvalid; } e->normal.enc.utf8Convert = unknown_toUtf8; e->normal.enc.utf16Convert = unknown_toUtf16; return &(e->normal.enc);}/* If this enumeration is changed, getEncodingIndex and encodingsmust also be changed. */enum { UNKNOWN_ENC = -1, ISO_8859_1_ENC = 0, US_ASCII_ENC, UTF_8_ENC, UTF_16_ENC, UTF_16BE_ENC, UTF_16LE_ENC, /* must match encodingNames up to here */ NO_ENC};staticint getEncodingIndex(const char *name){ static const char *encodingNames[] = { "ISO-8859-1", "US-ASCII", "UTF-8", "UTF-16", "UTF-16BE" "UTF-16LE", }; unsigned int i; if (name == 0) return NO_ENC; for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++) if (streqci(name, encodingNames[i])) return i; return UNKNOWN_ENC;}/* For binary compatibility, we store the index of the encoding specifiedat initialization in the isUtf16 member. */#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)/* This is what detects the encoding.encodingTable maps from encoding indices to encodings;INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;state is XML_CONTENT_STATE if we're parsing an external text entity,and XML_PROLOG_STATE otherwise.*/staticint initScan(const ENCODING **encodingTable, const INIT_ENCODING *enc, int state, const char *ptr, const char *end, const char **nextTokPtr){ const ENCODING **encPtr; if (ptr == end) return XML_TOK_NONE; encPtr = enc->encPtr; if (ptr + 1 == end) { /* only a single byte available for auto-detection */ /* a well-formed document entity must have more than one byte */ if (state != XML_CONTENT_STATE) return XML_TOK_PARTIAL; /* so we're parsing an external text entity... */ /* if UTF-16 was externally specified, then we need at least 2 bytes */ switch (INIT_ENC_INDEX(enc)) { case UTF_16_ENC: case UTF_16LE_ENC: case UTF_16BE_ENC: return XML_TOK_PARTIAL; } switch ((unsigned char)*ptr) { case 0xFE: case 0xFF: case 0xEF: /* possibly first byte of UTF-8 BOM */ if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; /* fall through */ case 0x00: case 0x3C: return XML_TOK_PARTIAL; } } else { switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { case 0xFEFF: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; *nextTokPtr = ptr + 2; *encPtr = encodingTable[UTF_16BE_ENC]; return XML_TOK_BOM; /* 00 3C is handled in the default case */ case 0x3C00: if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC || INIT_ENC_INDEX(enc) == UTF_16_ENC) && state == XML_CONTENT_STATE) break; *encPtr = encodingTable[UTF_16LE_ENC]; return XmlTok(*encPtr, state, ptr, end, nextTokPtr); case 0xFFFE: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; *nextTokPtr = ptr + 2; *encPtr = encodingTable[UTF_16LE_ENC]; return XML_TOK_BOM; case 0xEFBB: /* Maybe a UTF-8 BOM (EF BB BF) */ /* If there's an explicitly specified (external) encoding of ISO-8859-1 or some flavour of UTF-16 and this is an external text entity, don't look for the BOM, because it might be a legal data. */ if (state == XML_CONTENT_STATE) { int e = INIT_ENC_INDEX(enc); if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) break; } if (ptr + 2 == end) return XML_TOK_PARTIAL; if ((unsigned char)ptr[2] == 0xBF) { *encPtr = encodingTable[UTF_8_ENC]; return XML_TOK_BOM; } break; default: if (ptr[0] == '\0') { /* 0 isn't a legal data character. Furthermore a document entity can only start with ASCII characters. So the only way this can fail to be big-endian UTF-16 if it it's an external parsed general entity that's labelled as UTF-16LE. */ if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) break; *encPtr = encodingTable[UTF_16BE_ENC]; return XmlTok(*encPtr, state, ptr, end, nextTokPtr); } else if (ptr[1] == '\0') { /* We could recover here in the case: - parsing an external entity - second byte is 0 - no externally specified encoding - no encoding declaration by assuming UTF-16LE. But we don't, because this would mean when presented just with a single byte, we couldn't reliably determine whether we needed further bytes. */ if (state == XML_CONTENT_STATE) break; *encPtr = encodingTable[UTF_16LE_ENC]; return XmlTok(*encPtr, state, ptr, end, nextTokPtr); } break; } } *encPtr = encodingTable[(unsigned int)INIT_ENC_INDEX(enc)]; return XmlTok(*encPtr, state, ptr, end, nextTokPtr);}#define NS(x) x#define ns(x) x#include "xmltok_ns.c"#undef NS#undef ns#ifdef XML_NS#define NS(x) x ## NS#define ns(x) x ## _ns#include "xmltok_ns.c"#undef NS#undef nsENCODING *XmlInitUnknownEncodingNS(void *mem, int *table, int (*convert)(void *userData, const char *p), void *userData){ ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); if (enc) ((struct normal_encoding *)enc)->type[':'] = BT_COLON; return enc;}#endif /* XML_NS */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -