📄 htmlparser.c
字号:
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },{ 8704, "forall","for all, U+2200 ISOtech" },{ 8706, "part", "partial differential, U+2202 ISOtech" },{ 8707, "exist","there exists, U+2203 ISOtech" },{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },{ 8712, "isin", "element of, U+2208 ISOtech" },{ 8713, "notin","not an element of, U+2209 ISOtech" },{ 8715, "ni", "contains as member, U+220B ISOtech" },{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },{ 8722, "minus","minus sign, U+2212 ISOtech" },{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },{ 8730, "radic","square root = radical sign, U+221A ISOtech" },{ 8733, "prop", "proportional to, U+221D ISOtech" },{ 8734, "infin","infinity, U+221E ISOtech" },{ 8736, "ang", "angle, U+2220 ISOamso" },{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },{ 8744, "or", "logical or = vee, U+2228 ISOtech" },{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },{ 8746, "cup", "union = cup, U+222A ISOtech" },{ 8747, "int", "integral, U+222B ISOtech" },{ 8756, "there4","therefore, U+2234 ISOtech" },{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },{ 8800, "ne", "not equal to, U+2260 ISOtech" },{ 8801, "equiv","identical to, U+2261 ISOtech" },{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },{ 8834, "sub", "subset of, U+2282 ISOtech" },{ 8835, "sup", "superset of, U+2283 ISOtech" },{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },{ 8971, "rfloor","right floor, U+230B ISOamsc" },{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },{ 9674, "loz", "lozenge, U+25CA ISOpub" },{ 9824, "spades","black spade suit, U+2660 ISOpub" },{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },{ 9830, "diams","black diamond suit, U+2666 ISOpub" },{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },{ 732, "tilde","small tilde, U+02DC ISOdia" },{ 8194, "ensp", "en space, U+2002 ISOpub" },{ 8195, "emsp", "em space, U+2003 ISOpub" },{ 8201, "thinsp","thin space, U+2009 ISOpub" },{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },{ 8211, "ndash","en dash, U+2013 ISOpub" },{ 8212, "mdash","em dash, U+2014 ISOpub" },{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },{ 8224, "dagger","dagger, U+2020 ISOpub" },{ 8225, "Dagger","double dagger, U+2021 ISOpub" },{ 8240, "permil","per mille sign, U+2030 ISOtech" },{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },{ 8364, "euro", "euro sign, U+20AC NEW" }};/************************************************************************ * * * Commodity functions to handle entities * * * ************************************************************************//* * Macro used to grow the current buffer. */#define growBuffer(buffer) { \ buffer##_size *= 2; \ buffer = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ if (buffer == NULL) { \ perror("realloc failed"); \ return(NULL); \ } \}/** * htmlEntityLookup: * @name: the entity name * * Lookup the given entity in EntitiesTable * * TODO: the linear scan is really ugly, an hash table is really needed. * * Returns the associated htmlEntityDescPtr if found, NULL otherwise. */htmlEntityDescPtrhtmlEntityLookup(const xmlChar *name) { int i; for (i = 0;i < (sizeof(html40EntitiesTable)/ sizeof(html40EntitiesTable[0]));i++) { if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {#ifdef DEBUG fprintf(stderr,"Found entity %s\n", name);#endif return(&html40EntitiesTable[i]); } } return(NULL);}/** * htmlDecodeEntities: * @ctxt: the parser context * @len: the len to decode (in bytes !), -1 for no size limit * @end: an end marker xmlChar, 0 if none * @end2: an end marker xmlChar, 0 if none * @end3: an end marker xmlChar, 0 if none * * Subtitute the HTML entities by their value * * DEPRECATED !!!! * * Returns A newly allocated string with the substitution done. The caller * must deallocate it ! */xmlChar *htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, xmlChar end, xmlChar end2, xmlChar end3) { xmlChar *buffer = NULL; int buffer_size = 0; xmlChar *out = NULL; xmlChar *name = NULL; xmlChar *cur = NULL; htmlEntityDescPtr ent; int nbchars = 0; unsigned int max = (unsigned int) len; /* * allocate a translation buffer. */ buffer_size = HTML_PARSER_BIG_BUFFER_SIZE; buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); if (buffer == NULL) { perror("htmlDecodeEntities: malloc failed"); return(NULL); } out = buffer; /* * Ok loop until we reach one of the ending char or a size limit. */ while ((nbchars < max) && (CUR != end) && (CUR != end2) && (CUR != end3)) { if (CUR == '&') { if (NXT(1) == '#') { int val = htmlParseCharRef(ctxt); /* invalid for UTF-8 variable encoding !!!!! */ *out++ = val; nbchars += 3; /* !!!! */ } else { ent = htmlParseEntityRef(ctxt, &name); if (name != NULL) { if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) { *out++ = '&'; cur = name; while (*cur != 0) { if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } *out++ = *cur++; } *out++ = ';'; } else { /* invalid for UTF-8 variable encoding !!!!! */ *out++ = (xmlChar)ent->value; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } } nbchars += 2 + xmlStrlen(name); xmlFree(name); } } } else { /* invalid for UTF-8 , use COPY(out); !!!!! */ *out++ = CUR; nbchars++; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } NEXT; } } *out++ = 0; return(buffer);}/************************************************************************ * * * Commodity functions to handle encodings * * * ************************************************************************//** * htmlSwitchEncoding: * @ctxt: the parser context * @len: the len of @cur * * change the input functions when discovering the character encoding * of a given entity. * */voidhtmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc){ switch (enc) { case XML_CHAR_ENCODING_ERROR: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "encoding unknown\n"); ctxt->wellFormed = 0; break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ return; case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ return; case XML_CHAR_ENCODING_UTF16LE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UTF16 little endian not supported\n"); break; case XML_CHAR_ENCODING_UTF16BE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UTF16 big endian not supported\n"); break; case XML_CHAR_ENCODING_UCS4LE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding USC4 little endian not supported\n"); break; case XML_CHAR_ENCODING_UCS4BE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding USC4 big endian not supported\n"); break; case XML_CHAR_ENCODING_EBCDIC: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding EBCDIC not supported\n"); break; case XML_CHAR_ENCODING_UCS4_2143: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS4 2143 not supported\n"); break; case XML_CHAR_ENCODING_UCS4_3412: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS4 3412 not supported\n"); break; case XML_CHAR_ENCODING_UCS2: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS2 not supported\n"); break; case XML_CHAR_ENCODING_8859_1: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_1 ISO Latin 1 not supported\n"); break; case XML_CHAR_ENCODING_8859_2: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_2 ISO Latin 2 not supported\n"); break; case XML_CHAR_ENCODING_8859_3: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_3 not supported\n"); break; case XML_CHAR_ENCODING_8859_4: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_4 not supported\n"); break; case XML_CHAR_ENCODING_8859_5: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_5 not supported\n"); break; case XML_CHAR_ENCODING_8859_6: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_6 not supported\n"); break; case XML_CHAR_ENCODING_8859_7: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_7 not supported\n"); break; case XML_CHAR_ENCODING_8859_8: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_8 not supported\n"); break; case XML_CHAR_ENCODING_8859_9: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_9 not supported\n"); break; case XML_CHAR_ENCODING_2022_JP: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO-2022-JPnot supported\n"); break; case XML_CHAR_ENCODING_SHIFT_JIS: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding Shift_JISnot supported\n"); break; case XML_CHAR_ENCODING_EUC_JP: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding EUC-JPnot supported\n"); break; }}/************************************************************************ * * * Commodity functions to handle streams * * * ************************************************************************//** * htmlFreeInputStream: * @input: an htmlParserInputPtr * * Free up an input stream. */voidhtmlFreeInputStream(htmlParserInputPtr input) { if (input == NULL) return; if (input->filename != NULL) xmlFree((char *) input->filename); if (input->directory != NULL) xmlFree((char *) input->directory); if ((input->free != NULL) && (input->base != NULL)) input->free((xmlChar *) input->base); if (input->buf != NULL) xmlFreeParserInputBuffer(input->buf); memset(input, -1, sizeof(htmlParserInput)); xmlFree(input);}/** * htmlNewInputStream: * @ctxt: an HTML parser context * * Create a new input stream structure
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -