📄 encoding.c
字号:
xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);#ifdef LIBXML_OUTPUT_ENABLED xmlUTF16LEHandler = xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE); xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);#ifdef LIBXML_HTML_ENABLED xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);#endif#else xmlUTF16LEHandler = xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL); xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL); xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL); xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);#endif /* LIBXML_OUTPUT_ENABLED */#ifndef LIBXML_ICONV_ENABLED#ifdef LIBXML_ISO8859X_ENABLED xmlRegisterCharEncodingHandlersISO8859x ();#endif#endif}/** * xmlCleanupCharEncodingHandlers: * * Cleanup the memory allocated for the char encoding support, it * unregisters all the encoding handlers and the aliases. */voidxmlCleanupCharEncodingHandlers(void) { xmlCleanupEncodingAliases(); if (handlers == NULL) return; for (;nbCharEncodingHandler > 0;) { nbCharEncodingHandler--; if (handlers[nbCharEncodingHandler] != NULL) { if (handlers[nbCharEncodingHandler]->name != NULL) xmlFree(handlers[nbCharEncodingHandler]->name); xmlFree(handlers[nbCharEncodingHandler]); } } xmlFree(handlers); handlers = NULL; nbCharEncodingHandler = 0; xmlDefaultCharEncodingHandler = NULL;}/** * xmlRegisterCharEncodingHandler: * @handler: the xmlCharEncodingHandlerPtr handler block * * Register the char encoding handler, surprising, isn't it ? */voidxmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { if (handlers == NULL) xmlInitCharEncodingHandlers(); if (handler == NULL) { xmlEncodingErr(XML_I18N_NO_HANDLER, "xmlRegisterCharEncodingHandler: NULL handler !\n", NULL); return; } if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { xmlEncodingErr(XML_I18N_EXCESS_HANDLER, "xmlRegisterCharEncodingHandler: Too many handler registered, see %s\n", "MAX_ENCODING_HANDLERS"); return; } handlers[nbCharEncodingHandler++] = handler;}/** * xmlGetCharEncodingHandler: * @enc: an xmlCharEncoding value. * * Search in the registered set the handler able to read/write that encoding. * * Returns the handler or NULL if not found */xmlCharEncodingHandlerPtrxmlGetCharEncodingHandler(xmlCharEncoding enc) { xmlCharEncodingHandlerPtr handler; if (handlers == NULL) xmlInitCharEncodingHandlers(); switch (enc) { case XML_CHAR_ENCODING_ERROR: return(NULL); case XML_CHAR_ENCODING_NONE: return(NULL); case XML_CHAR_ENCODING_UTF8: return(NULL); case XML_CHAR_ENCODING_UTF16LE: return(xmlUTF16LEHandler); case XML_CHAR_ENCODING_UTF16BE: return(xmlUTF16BEHandler); case XML_CHAR_ENCODING_EBCDIC: handler = xmlFindCharEncodingHandler("EBCDIC"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("ebcdic"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4BE: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS-4"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS4"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4LE: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS-4"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS4"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4_2143: break; case XML_CHAR_ENCODING_UCS4_3412: break; case XML_CHAR_ENCODING_UCS2: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS-2"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS2"); if (handler != NULL) return(handler); break; /* * We used to keep ISO Latin encodings native in the * generated data. This led to so many problems that * this has been removed. One can still change this * back by registering no-ops encoders for those */ case XML_CHAR_ENCODING_8859_1: handler = xmlFindCharEncodingHandler("ISO-8859-1"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_2: handler = xmlFindCharEncodingHandler("ISO-8859-2"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_3: handler = xmlFindCharEncodingHandler("ISO-8859-3"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_4: handler = xmlFindCharEncodingHandler("ISO-8859-4"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_5: handler = xmlFindCharEncodingHandler("ISO-8859-5"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_6: handler = xmlFindCharEncodingHandler("ISO-8859-6"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_7: handler = xmlFindCharEncodingHandler("ISO-8859-7"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_8: handler = xmlFindCharEncodingHandler("ISO-8859-8"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_8859_9: handler = xmlFindCharEncodingHandler("ISO-8859-9"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_2022_JP: handler = xmlFindCharEncodingHandler("ISO-2022-JP"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_SHIFT_JIS: handler = xmlFindCharEncodingHandler("SHIFT-JIS"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("SHIFT_JIS"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("Shift_JIS"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_EUC_JP: handler = xmlFindCharEncodingHandler("EUC-JP"); if (handler != NULL) return(handler); break; default: break; } #ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "No handler found for encoding %d\n", enc);#endif return(NULL);}/** * xmlFindCharEncodingHandler: * @name: a string describing the char encoding. * * Search in the registered set the handler able to read/write that encoding. * * Returns the handler or NULL if not found */xmlCharEncodingHandlerPtrxmlFindCharEncodingHandler(const char *name) { const char *nalias; const char *norig; xmlCharEncoding alias;#ifdef LIBXML_ICONV_ENABLED xmlCharEncodingHandlerPtr enc; iconv_t icv_in, icv_out;#endif /* LIBXML_ICONV_ENABLED */ char upper[100]; int i; if (handlers == NULL) xmlInitCharEncodingHandlers(); if (name == NULL) return(xmlDefaultCharEncodingHandler); if (name[0] == 0) return(xmlDefaultCharEncodingHandler); /* * Do the alias resolution */ norig = name; nalias = xmlGetEncodingAlias(name); if (nalias != NULL) name = nalias; /* * Check first for directly registered encoding names */ for (i = 0;i < 99;i++) { upper[i] = toupper(name[i]); if (upper[i] == 0) break; } upper[i] = 0; for (i = 0;i < nbCharEncodingHandler; i++) if (!strcmp(upper, handlers[i]->name)) {#ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "Found registered handler for encoding %s\n", name);#endif return(handlers[i]); }#ifdef LIBXML_ICONV_ENABLED /* check whether iconv can handle this */ icv_in = iconv_open("UTF-8", name); icv_out = iconv_open(name, "UTF-8"); if (icv_in == (iconv_t) -1) { icv_in = iconv_open("UTF-8", upper); } if (icv_out == (iconv_t) -1) { icv_out = iconv_open(upper, "UTF-8"); } if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { enc = (xmlCharEncodingHandlerPtr) xmlMalloc(sizeof(xmlCharEncodingHandler)); if (enc == NULL) { iconv_close(icv_in); iconv_close(icv_out); return(NULL); } enc->name = xmlMemStrdup(name); enc->input = NULL; enc->output = NULL; enc->iconv_in = icv_in; enc->iconv_out = icv_out;#ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "Found iconv handler for encoding %s\n", name);#endif return enc; } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) { xmlEncodingErr(XML_ERR_INTERNAL_ERROR, "iconv : problems with filters for '%s'\n", name); }#endif /* LIBXML_ICONV_ENABLED */#ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "No handler found for encoding %s\n", name);#endif /* * Fallback using the canonical names */ alias = xmlParseCharEncoding(norig); if (alias != XML_CHAR_ENCODING_ERROR) { const char* canon; canon = xmlGetCharEncodingName(alias); if ((canon != NULL) && (strcmp(name, canon))) { return(xmlFindCharEncodingHandler(canon)); } } /* If "none of the above", give up */ return(NULL);}/************************************************************************ * * * ICONV based generic conversion functions * * * ************************************************************************/#ifdef LIBXML_ICONV_ENABLED/** * xmlIconvWrapper: * @cd: iconv converter data structure * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of ISO Latin 1 chars * @inlen: the length of @in * * Returns 0 if success, or * -1 by lack of space, or * -2 if the transcoding fails (for *in is not valid utf8 string or * the result of transformation can't fit into the encoding we want), or * -3 if there the last byte can't form a single output char. * * The value of @inlen after return is the number of octets consumed * as the return value is positive, else unpredictable. * The value of @outlen after return is the number of ocetes consumed. */static intxmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, const unsigned char *in, int *inlen) { size_t icv_inlen, icv_outlen; const char *icv_in = (const char *) in; char *icv_out = (char *) out; int ret; if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { if (outlen != NULL) *outlen = 0; return(-1); } icv_inlen = *inlen; icv_outlen = *outlen; ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen); *inlen -= icv_inlen; *outlen -= icv_outlen; if ((icv_inlen != 0) || (ret == -1)) {#ifdef EILSEQ if (errno == EILSEQ) { return -2; } else#endif#ifdef E2BIG if (errno == E2BIG) { return -1; } else#endif#ifdef EINVAL if (errno == EINVAL) { return -3; } else#endif { return -3; } } return 0;}#endif /* LIBXML_ICONV_ENABLED *//************************************************************************ * * * The real API used by libxml for on-the-fly conversion * * * ************************************************************************//** * xmlCharEncFirstLine: * @handler: char enconding transformation data structure * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * * Front-end for the encoding handler input function, but handle only * the very first line, i.e. limit itself to 45 chars. * * Returns the number of byte written if success, or * -1 general error * -2 if the transcoding fails (for *in is not valid utf8 string or * the result of transformation can't fit into the encoding we want), or */intxmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in) { int ret = -2; int written; int toconv; if (handler == NULL) return(-1); if (out == NULL) return(-1); if (in == NULL) return(-1); /* calculate space available */ written = out->size - out->use; toconv = in->use; /* * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 * 45 chars should be sufficient to reach the end of the encoding * declaration without going too far inside the document content. */ if (toconv > 45) toconv = 45; if (toconv * 2 >= written) { xmlBufferGrow(out, toconv); written = out->size - out->use - 1; } if (handler->input != NULL) { ret = handler->input(&out->content[out->use], &written, in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; }#ifdef LIBXML_ICONV_ENABLED else if (handler->iconv_in != NULL) { ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], &written, in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; if (ret == -1) ret = -3; }#endif /* LIBXML_ICONV_ENABLED */#ifdef DEBUG_ENCODING switch (ret) { case 0: xmlGenericError(xmlGenericErrorContext, "converted %d bytes to %d bytes of input\n",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -