⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xmltok.c

📁 Simple Jabber Client for Symbian Platform
💻 C
📖 第 1 页 / 共 3 页
字号:
    ptr += 5 * enc->minBytesPerChar;
    end -= 2 * enc->minBytesPerChar;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
        *badPtr = ptr;
        return 0;
    }
    if (!XmlNameMatchesAscii(enc, name, "version")) {
        if (!isGeneralTextEntity) {
            *badPtr = name;
            return 0;
        }
    }
    else {
        if (versionPtr)
            *versionPtr = val;
        if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
            *badPtr = ptr;
            return 0;
        }
        if (!name) {
            if (isGeneralTextEntity) {
                /* a TextDecl must have an EncodingDecl */
                *badPtr = ptr;
                return 0;
            }
            return 1;
        }
    }
    if (XmlNameMatchesAscii(enc, name, "encoding")) {
        int c = toAscii(enc, val, end);
        if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
            *badPtr = val;
            return 0;
        }
        if (encodingName)
            *encodingName = val;
        if (encoding)
            *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
        if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
            *badPtr = ptr;
            return 0;
        }
        if (!name)
            return 1;
    }
    if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
        *badPtr = name;
        return 0;
    }
    if (XmlNameMatchesAscii(enc, val, "yes")) {
        if (standalone)
            *standalone = 1;
    }
    else if (XmlNameMatchesAscii(enc, val, "no")) {
        if (standalone)
            *standalone = 0;
    }
    else {
        *badPtr = val;
        return 0;
    }
    while (isSpace(toAscii(enc, ptr, end)))
        ptr += enc->minBytesPerChar;
    if (ptr != end) {
        *badPtr = ptr;
        return 0;
    }
    return 1;
}

static
int checkCharRefNumber(int result)
{
    switch (result >> 8) {
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
        return -1;
    case 0:
        if (latin1_encoding.type[result] == BT_NONXML)
            return -1;
        break;
    case 0xFF:
        if (result == 0xFFFE || result == 0xFFFF)
            return -1;
        break;
    }
    return result;
}

int XmlUtf8Encode(int c, char *buf)
{
    enum {
        /* minN is minimum legal resulting value for N byte sequence */
        min2 = 0x80,
        min3 = 0x800,
        min4 = 0x10000
    };

    if (c < 0)
        return 0;
    if (c < min2) {
        buf[0] = (c | UTF8_cval1);
        return 1;
    }
    if (c < min3) {
        buf[0] = ((c >> 6) | UTF8_cval2);
        buf[1] = ((c & 0x3f) | 0x80);
        return 2;
    }
    if (c < min4) {
        buf[0] = ((c >> 12) | UTF8_cval3);
        buf[1] = (((c >> 6) & 0x3f) | 0x80);
        buf[2] = ((c & 0x3f) | 0x80);
        return 3;
    }
    if (c < 0x110000) {
        buf[0] = ((c >> 18) | UTF8_cval4);
        buf[1] = (((c >> 12) & 0x3f) | 0x80);
        buf[2] = (((c >> 6) & 0x3f) | 0x80);
        buf[3] = ((c & 0x3f) | 0x80);
        return 4;
    }
    return 0;
}

int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0)
        return 0;
    if (charNum < 0x10000) {
        buf[0] = charNum;
        return 1;
    }
    if (charNum < 0x110000) {
        charNum -= 0x10000;
        buf[0] = (charNum >> 10) + 0xD800;
        buf[1] = (charNum & 0x3FF) + 0xDC00;
        return 2;
    }
    return 0;
}

struct unknown_encoding {
    struct normal_encoding normal;
    int (*convert)(void *userData, const char *p);
    void *userData;
    unsigned short utf16[256];
    char utf8[256][4];
};

int XmlSizeOfUnknownEncoding()
{
    return sizeof(struct unknown_encoding);
}

static
int unknown_isName(const ENCODING *enc, const char *p)
{
    int c = ((const struct unknown_encoding *)enc)
            ->convert(((const struct unknown_encoding *)enc)->userData, p);
    if (c & ~0xFFFF)
        return 0;
    return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
}

static
int unknown_isNmstrt(const ENCODING *enc, const char *p)
{
    int c = ((const struct unknown_encoding *)enc)
            ->convert(((const struct unknown_encoding *)enc)->userData, p);
    if (c & ~0xFFFF)
        return 0;
    return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
}

static
int unknown_isInvalid(const ENCODING *enc, const char *p)
{
    int c = ((const struct unknown_encoding *)enc)
            ->convert(((const struct unknown_encoding *)enc)->userData, p);
    return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
}

static
void unknown_toUtf8(const ENCODING *enc,
                    const char **fromP, const char *fromLim,
                    char **toP, const char *toLim)
{
    char buf[XML_UTF8_ENCODE_MAX];
    for (;;) {
        const char *utf8;
        int n;
        if (*fromP == fromLim)
            break;
        utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
        n = *utf8++;
        if (n == 0) {
            int c = ((const struct unknown_encoding *)enc)
                    ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
            n = XmlUtf8Encode(c, buf);
            if (n > toLim - *toP)
                break;
            utf8 = buf;
            *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
                      - (BT_LEAD2 - 2);
        }
        else {
            if (n > toLim - *toP)
                break;
            (*fromP)++;
        }
        do {
            *(*toP)++ = *utf8++;
        } while (--n != 0);
    }
}

static
void unknown_toUtf16(const ENCODING *enc,
                     const char **fromP, const char *fromLim,
                     unsigned short **toP, const unsigned short *toLim)
{
    while (*fromP != fromLim && *toP != toLim) {
        unsigned short c
        = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
        if (c == 0) {
            c = (unsigned short)((const struct unknown_encoding *)enc)
                ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
            *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
                      - (BT_LEAD2 - 2);
        }
        else
            (*fromP)++;
        *(*toP)++ = c;
    }
}

ENCODING *
XmlInitUnknownEncoding(void *mem,
                       int *table,
                       int (*convert)(void *userData, const char *p),
                       void *userData)
{
    int i;
    struct unknown_encoding *e = mem;
    for (i = 0; i < sizeof(struct normal_encoding); i++)
        ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
    for (i = 0; i < 128; i++)
        if (latin1_encoding.type[i] != BT_OTHER
                && latin1_encoding.type[i] != BT_NONXML
                && table[i] != i)
            return 0;
    for (i = 0; i < 256; i++) {
        int c = table[i];
        if (c == -1) {
            e->normal.type[i] = BT_MALFORM;
            /* This shouldn't really get used. */
            e->utf16[i] = 0xFFFF;
            e->utf8[i][0] = 1;
            e->utf8[i][1] = 0;
        }
        else if (c < 0) {
            if (c < -4)
                return 0;
            e->normal.type[i] = BT_LEAD2 - (c + 2);
            e->utf8[i][0] = 0;
            e->utf16[i] = 0;
        }
        else if (c < 0x80) {
            if (latin1_encoding.type[c] != BT_OTHER
                    && latin1_encoding.type[c] != BT_NONXML
                    && c != i)
                return 0;
            e->normal.type[i] = latin1_encoding.type[c];
            e->utf8[i][0] = 1;
            e->utf8[i][1] = (char)c;
            e->utf16[i] = c == 0 ? 0xFFFF : c;
        }
        else if (checkCharRefNumber(c) < 0) {
            e->normal.type[i] = BT_NONXML;
            /* This shouldn't really get used. */
            e->utf16[i] = 0xFFFF;
            e->utf8[i][0] = 1;
            e->utf8[i][1] = 0;
        }
        else {
            if (c > 0xFFFF)
                return 0;
            if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
                e->normal.type[i] = BT_NMSTRT;
            else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
                e->normal.type[i] = BT_NAME;
            else
                e->normal.type[i] = BT_OTHER;
            e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
            e->utf16[i] = c;
        }
    }
    e->userData = userData;
    e->convert = convert;
    if (convert) {
        e->normal.isName2 = unknown_isName;
        e->normal.isName3 = unknown_isName;
        e->normal.isName4 = unknown_isName;
        e->normal.isNmstrt2 = unknown_isNmstrt;
        e->normal.isNmstrt3 = unknown_isNmstrt;
        e->normal.isNmstrt4 = unknown_isNmstrt;
        e->normal.isInvalid2 = unknown_isInvalid;
        e->normal.isInvalid3 = unknown_isInvalid;
        e->normal.isInvalid4 = unknown_isInvalid;
    }
    e->normal.enc.utf8Convert = unknown_toUtf8;
    e->normal.enc.utf16Convert = unknown_toUtf16;
    return &(e->normal.enc);
}

/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
    UNKNOWN_ENC = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    NO_ENC
};

static
int getEncodingIndex(const char *name)
{
    static const char* const encodingNames[] = {
        "ISO-8859-1",
        "US-ASCII",
        "UTF-8",
        "UTF-16",
        "UTF-16BE"
        "UTF-16LE",
    };
    int i;
    if (name == 0)
        return NO_ENC;
    for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
        if (streqci(name, encodingNames[i]))
            return i;
    return UNKNOWN_ENC;
}

/* For binary compatibility, we store the index of the encoding specified
at initialization in the isUtf16 member. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)

/* This is what detects the encoding.
encodingTable maps from encoding indices to encodings;
INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
state is XML_CONTENT_STATE if we're parsing an external text entity,
and XML_PROLOG_STATE otherwise.
*/


static
int initScan(const ENCODING **encodingTable,
             const INIT_ENCODING *enc,
             int state,
             const char *ptr,
             const char *end,
             const char **nextTokPtr)
{
    const ENCODING **encPtr;

    if (ptr == end)
        return XML_TOK_NONE;
    encPtr = enc->encPtr;
    if (ptr + 1 == end) {
        /* only a single byte available for auto-detection */
        /* a well-formed document entity must have more than one byte */
        if (state != XML_CONTENT_STATE)
            return XML_TOK_PARTIAL;
        /* so we're parsing an external text entity... */
        /* if UTF-16 was externally specified, then we need at least 2 bytes */
        switch (INIT_ENC_INDEX(enc)) {
        case UTF_16_ENC:
        case UTF_16LE_ENC:
        case UTF_16BE_ENC:
            return XML_TOK_PARTIAL;
        }
        switch ((unsigned char)*ptr) {
        case 0xFE:
        case 0xFF:
        case 0xEF: /* possibly first byte of UTF-8 BOM */
            if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                    && state == XML_CONTENT_STATE)
                break;
            /* fall through */
        case 0x00:
        case 0x3C:
            return XML_TOK_PARTIAL;
        }
    }
    else {
        switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
        case 0xFEFF:
            if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                    && state == XML_CONTENT_STATE)
                break;
            *nextTokPtr = ptr + 2;
            *encPtr = encodingTable[UTF_16BE_ENC];
            return XML_TOK_BOM;
            /* 00 3C is handled in the default case */
        case 0x3C00:
            if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
                    || INIT_ENC_INDEX(enc) == UTF_16_ENC)
                    && state == XML_CONTENT_STATE)
                break;
            *encPtr = encodingTable[UTF_16LE_ENC];
            return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
        case 0xFFFE:
            if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
                    && state == XML_CONTENT_STATE)
                break;
            *nextTokPtr = ptr + 2;
            *encPtr = encodingTable[UTF_16LE_ENC];
            return XML_TOK_BOM;
        case 0xEFBB:
            /* Maybe a UTF-8 BOM (EF BB BF) */
            /* If there's an explicitly specified (external) encoding
               of ISO-8859-1 or some flavour of UTF-16
               and this is an external text entity,
            don't look for the BOM,
               because it might be a legal data. */
            if (state == XML_CONTENT_STATE) {
                int e = INIT_ENC_INDEX(enc);
                if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
                    break;
            }
            if (ptr + 2 == end)
                return XML_TOK_PARTIAL;
            if ((unsigned char)ptr[2] == 0xBF) {
                *encPtr = encodingTable[UTF_8_ENC];
                return XML_TOK_BOM;
            }
            break;
        default:
            if (ptr[0] == '\0') {
                /* 0 isn't a legal data character. Furthermore a document entity can only
                   start with ASCII characters.  So the only way this can fail to be big-endian
                   UTF-16 if it it's an external parsed general entity that's labelled as
                   UTF-16LE. */
                if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
                    break;
                *encPtr = encodingTable[UTF_16BE_ENC];
                return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
            }
            else if (ptr[1] == '\0') {
                /* We could recover here in the case:
                    - parsing an external entity
                    - second byte is 0
                    - no externally specified encoding
                    - no encoding declaration
                   by assuming UTF-16LE.  But we don't, because this would mean when
                   presented just with a single byte, we couldn't reliably determine
                   whether we needed further bytes. */
                if (state == XML_CONTENT_STATE)
                    break;
                *encPtr = encodingTable[UTF_16LE_ENC];
                return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
            }
            break;
        }
    }
    *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
    return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}


#define NS(x) x
#define ns(x) x
#include "xmltok_ns_c.h"
#undef NS
#undef ns

#ifdef XML_NS

#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns_c.h"

#undef NS
#undef ns

ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         int (*convert)(void *userData, const char *p),
                         void *userData)
{
    ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
    if (enc)
        ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
    return enc;
}

#endif /* XML_NS */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -