⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.c

📁 SIP 1.5.0源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },{ 8704,	"forall","for all, U+2200 ISOtech" },{ 8706,	"part",	"partial differential, U+2202 ISOtech" },{ 8707,	"exist","there exists, U+2203 ISOtech" },{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },{ 8712,	"isin",	"element of, U+2208 ISOtech" },{ 8713,	"notin","not an element of, U+2209 ISOtech" },{ 8715,	"ni",	"contains as member, U+220B ISOtech" },{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },{ 8721,	"sum",	"n-ary sumation, U+2211 ISOamsb" },{ 8722,	"minus","minus sign, U+2212 ISOtech" },{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },{ 8733,	"prop",	"proportional to, U+221D ISOtech" },{ 8734,	"infin","infinity, U+221E ISOtech" },{ 8736,	"ang",	"angle, U+2220 ISOamso" },{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },{ 8746,	"cup",	"union = cup, U+222A ISOtech" },{ 8747,	"int",	"integral, U+222B ISOtech" },{ 8756,	"there4","therefore, U+2234 ISOtech" },{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },{ 8801,	"equiv","identical to, U+2261 ISOtech" },{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },{ 8834,	"sub",	"subset of, U+2282 ISOtech" },{ 8835,	"sup",	"superset of, U+2283 ISOtech" },{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },{ 8971,	"rfloor","right floor, U+230B ISOamsc" },{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },{ 9824,	"spades","black spade suit, U+2660 ISOpub" },{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },{ 732,	"tilde","small tilde, U+02DC ISOdia" },{ 8194,	"ensp",	"en space, U+2002 ISOpub" },{ 8195,	"emsp",	"em space, U+2003 ISOpub" },{ 8201,	"thinsp","thin space, U+2009 ISOpub" },{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },{ 8211,	"ndash","en dash, U+2013 ISOpub" },{ 8212,	"mdash","em dash, U+2014 ISOpub" },{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },{ 8224,	"dagger","dagger, U+2020 ISOpub" },{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },{ 8240,	"permil","per mille sign, U+2030 ISOtech" },{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },{ 8364,	"euro",	"euro sign, U+20AC NEW" }};/************************************************************************ *									* *		Commodity functions to handle entities			* *									* ************************************************************************//* * Macro used to grow the current buffer. */#define growBuffer(buffer) {						\    buffer##_size *= 2;							\    buffer = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));	\    if (buffer == NULL) {						\	perror("realloc failed");					\	return(NULL);							\    }									\}/** * htmlEntityLookup: * @name: the entity name * * Lookup the given entity in EntitiesTable * * TODO: the linear scan is really ugly, an hash table is really needed. * * Returns the associated htmlEntityDescPtr if found, NULL otherwise. */htmlEntityDescPtrhtmlEntityLookup(const xmlChar *name) {    int i;    for (i = 0;i < (sizeof(html40EntitiesTable)/                    sizeof(html40EntitiesTable[0]));i++) {        if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {#ifdef DEBUG            fprintf(stderr,"Found entity %s\n", name);#endif            return(&html40EntitiesTable[i]);	}    }    return(NULL);}/** * htmlDecodeEntities: * @ctxt:  the parser context * @len:  the len to decode (in bytes !), -1 for no size limit * @end:  an end marker xmlChar, 0 if none * @end2:  an end marker xmlChar, 0 if none * @end3:  an end marker xmlChar, 0 if none * * Subtitute the HTML entities by their value * * DEPRECATED !!!! * * Returns A newly allocated string with the substitution done. The caller *      must deallocate it ! */xmlChar *htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,                  xmlChar end, xmlChar  end2, xmlChar end3) {    xmlChar *buffer = NULL;    int buffer_size = 0;    xmlChar *out = NULL;    xmlChar *name = NULL;    xmlChar *cur = NULL;    htmlEntityDescPtr ent;    int nbchars = 0;    unsigned int max = (unsigned int) len;    /*     * allocate a translation buffer.     */    buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;    buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));    if (buffer == NULL) {	perror("htmlDecodeEntities: malloc failed");	return(NULL);    }    out = buffer;    /*     * Ok loop until we reach one of the ending char or a size limit.     */    while ((nbchars < max) && (CUR != end) &&           (CUR != end2) && (CUR != end3)) {        if (CUR == '&') {	    if (NXT(1) == '#') {		int val = htmlParseCharRef(ctxt);		/* invalid for UTF-8 variable encoding !!!!! */		*out++ = val;		nbchars += 3; /* !!!! */	    } else {		ent = htmlParseEntityRef(ctxt, &name);		if (name != NULL) {		    if ((ent == NULL) || (ent->value <= 0) ||		        (ent->value >= 255)) {		        *out++ = '&';		        cur = name;			while (*cur != 0) {			    if (out - buffer > buffer_size - 100) {				int index = out - buffer;				growBuffer(buffer);				out = &buffer[index];			    }			    *out++ = *cur++;			}		        *out++ = ';';		    } else {			/* invalid for UTF-8 variable encoding !!!!! */			*out++ = (xmlChar)ent->value;			if (out - buffer > buffer_size - 100) {			    int index = out - buffer;			    growBuffer(buffer);			    out = &buffer[index];			}		    }		    nbchars += 2 + xmlStrlen(name);		    xmlFree(name);		}	    }	} else {	    /*  invalid for UTF-8 , use COPY(out); !!!!! */	    *out++ = CUR;	    nbchars++;	    if (out - buffer > buffer_size - 100) {	      int index = out - buffer;	      	      growBuffer(buffer);	      out = &buffer[index];	    }	    NEXT;	}    }    *out++ = 0;    return(buffer);}/************************************************************************ *									* *		Commodity functions to handle encodings			* *									* ************************************************************************//** * htmlSwitchEncoding: * @ctxt:  the parser context * @len:  the len of @cur * * change the input functions when discovering the character encoding * of a given entity. * */voidhtmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc){    switch (enc) {        case XML_CHAR_ENCODING_ERROR:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))	        ctxt->sax->error(ctxt->userData, "encoding unknown\n");	    ctxt->wellFormed = 0;            break;        case XML_CHAR_ENCODING_NONE:	    /* let's assume it's UTF-8 without the XML decl */            return;        case XML_CHAR_ENCODING_UTF8:	    /* default encoding, no conversion should be needed */            return;        case XML_CHAR_ENCODING_UTF16LE:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding UTF16 little endian not supported\n");            break;        case XML_CHAR_ENCODING_UTF16BE:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding UTF16 big endian not supported\n");            break;        case XML_CHAR_ENCODING_UCS4LE:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding USC4 little endian not supported\n");            break;        case XML_CHAR_ENCODING_UCS4BE:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding USC4 big endian not supported\n");            break;        case XML_CHAR_ENCODING_EBCDIC:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding EBCDIC not supported\n");            break;        case XML_CHAR_ENCODING_UCS4_2143:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding UCS4 2143 not supported\n");            break;        case XML_CHAR_ENCODING_UCS4_3412:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding UCS4 3412 not supported\n");            break;        case XML_CHAR_ENCODING_UCS2:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding UCS2 not supported\n");            break;        case XML_CHAR_ENCODING_8859_1:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_1 ISO Latin 1 not supported\n");            break;        case XML_CHAR_ENCODING_8859_2:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_2 ISO Latin 2 not supported\n");            break;        case XML_CHAR_ENCODING_8859_3:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_3 not supported\n");            break;        case XML_CHAR_ENCODING_8859_4:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_4 not supported\n");            break;        case XML_CHAR_ENCODING_8859_5:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_5 not supported\n");            break;        case XML_CHAR_ENCODING_8859_6:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_6 not supported\n");            break;        case XML_CHAR_ENCODING_8859_7:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_7 not supported\n");            break;        case XML_CHAR_ENCODING_8859_8:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_8 not supported\n");            break;        case XML_CHAR_ENCODING_8859_9:	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,		  "char encoding ISO_8859_9 not supported\n");            break;        case XML_CHAR_ENCODING_2022_JP:            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,                  "char encoding ISO-2022-JPnot supported\n");            break;        case XML_CHAR_ENCODING_SHIFT_JIS:            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,                  "char encoding Shift_JISnot supported\n");            break;        case XML_CHAR_ENCODING_EUC_JP:            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))                ctxt->sax->error(ctxt->userData,                  "char encoding EUC-JPnot supported\n");            break;    }}/************************************************************************ *									* *		Commodity functions to handle streams			* *									* ************************************************************************//** * htmlFreeInputStream: * @input:  an htmlParserInputPtr * * Free up an input stream. */voidhtmlFreeInputStream(htmlParserInputPtr input) {    if (input == NULL) return;    if (input->filename != NULL) xmlFree((char *) input->filename);    if (input->directory != NULL) xmlFree((char *) input->directory);    if ((input->free != NULL) && (input->base != NULL))        input->free((xmlChar *) input->base);    if (input->buf != NULL)         xmlFreeParserInputBuffer(input->buf);    memset(input, -1, sizeof(htmlParserInput));    xmlFree(input);}/** * htmlNewInputStream: * @ctxt:  an HTML parser context * * Create a new input stream structure

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -