📄 encoder.php
字号:
} } else { // ((0xC0 & (*in) != 0x80) && (mState != 0)) // Incomplete multi-octet sequence. // used to result in complete fail, but we'll reset $mState = 0; $mUcs4 = 0; $mBytes = 1; $char =''; } } } return $out; } /** * Translates a Unicode codepoint into its corresponding UTF-8 character. * @static * @note Based on Feyd's function at * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>, * which is in public domain. * @note While we're going to do code point parsing anyway, a good * optimization would be to refuse to translate code points that * are non-SGML characters. However, this could lead to duplication. * @note This is very similar to the unichr function in * maintenance/generate-entity-file.php (although this is superior, * due to its sanity checks). */ // +----------+----------+----------+----------+ // | 33222222 | 22221111 | 111111 | | // | 10987654 | 32109876 | 54321098 | 76543210 | bit // +----------+----------+----------+----------+ // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF // +----------+----------+----------+----------+ // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF) // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes // +----------+----------+----------+----------+ function unichr($code) { if($code > 1114111 or $code < 0 or ($code >= 55296 and $code <= 57343) ) { // bits are set outside the "valid" range as defined // by UNICODE 4.1.0 return ''; } $x = $y = $z = $w = 0; if ($code < 128) { // regular ASCII character $x = $code; } else { // set up bits for UTF-8 $x = ($code & 63) | 128; if ($code < 2048) { $y = (($code & 2047) >> 6) | 192; } else { $y = (($code & 4032) >> 6) | 128; if($code < 65536) { $z = (($code >> 12) & 15) | 224; } else { $z = (($code >> 12) & 63) | 128; $w = (($code >> 18) & 7) | 240; } } } // set up the actual character $ret = ''; if($w) $ret .= chr($w); if($z) $ret .= chr($z); if($y) $ret .= chr($y); $ret .= chr($x); return $ret; } /** * Converts a string to UTF-8 based on configuration. * @static */ function convertToUTF8($str, $config, &$context) { $encoding = $config->get('Core', 'Encoding'); if ($encoding === 'utf-8') return $str; static $iconv = null; if ($iconv === null) $iconv = function_exists('iconv'); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); if ($iconv && !$config->get('Test', 'ForceNoIconv')) { $str = iconv($encoding, 'utf-8//IGNORE', $str); // If the string is bjorked by Shift_JIS or a similar encoding // that doesn't support all of ASCII, convert the naughty // characters to their true byte-wise ASCII/UTF-8 equivalents. $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding)); restore_error_handler(); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_encode($str); restore_error_handler(); return $str; } trigger_error('Encoding not supported', E_USER_ERROR); } /** * Converts a string from UTF-8 based on configuration. * @static * @note Currently, this is a lossy conversion, with unexpressable * characters being omitted. */ function convertFromUTF8($str, $config, &$context) { $encoding = $config->get('Core', 'Encoding'); if ($encoding === 'utf-8') return $str; static $iconv = null; if ($iconv === null) $iconv = function_exists('iconv'); if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) { $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); } set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); if ($iconv && !$config->get('Test', 'ForceNoIconv')) { // Undo our previous fix in convertToUTF8, otherwise iconv will barf $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding); if (!$escape && !empty($ascii_fix)) { $clear_fix = array(); foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = ''; $str = strtr($str, $clear_fix); } $str = strtr($str, array_flip($ascii_fix)); // Normal stuff $str = iconv('utf-8', $encoding . '//IGNORE', $str); restore_error_handler(); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_decode($str); restore_error_handler(); return $str; } trigger_error('Encoding not supported', E_USER_ERROR); } /** * Lossless (character-wise) conversion of HTML to ASCII * @static * @param $str UTF-8 string to be converted to ASCII * @returns ASCII encoded string with non-ASCII character entity-ized * @warning Adapted from MediaWiki, claiming fair use: this is a common * algorithm. If you disagree with this license fudgery, * implement it yourself. * @note Uses decimal numeric entities since they are best supported. * @note This is a DUMB function: it has no concept of keeping * character entities that the projected character encoding * can allow. We could possibly implement a smart version * but that would require it to also know which Unicode * codepoints the charset supported (not an easy task). * @note Sort of with cleanUTF8() but it assumes that $str is * well-formed UTF-8 */ function convertToASCIIDumbLossless($str) { $bytesleft = 0; $result = ''; $working = 0; $len = strlen($str); for( $i = 0; $i < $len; $i++ ) { $bytevalue = ord( $str[$i] ); if( $bytevalue <= 0x7F ) { //0xxx xxxx $result .= chr( $bytevalue ); $bytesleft = 0; } elseif( $bytevalue <= 0xBF ) { //10xx xxxx $working = $working << 6; $working += ($bytevalue & 0x3F); $bytesleft--; if( $bytesleft <= 0 ) { $result .= "&#" . $working . ";"; } } elseif( $bytevalue <= 0xDF ) { //110x xxxx $working = $bytevalue & 0x1F; $bytesleft = 1; } elseif( $bytevalue <= 0xEF ) { //1110 xxxx $working = $bytevalue & 0x0F; $bytesleft = 2; } else { //1111 0xxx $working = $bytevalue & 0x07; $bytesleft = 3; } } return $result; } /** * This expensive function tests whether or not a given character * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will * fail this test, and require special processing. Variable width * encodings shouldn't ever fail. * * @param string $encoding Encoding name to test, as per iconv format * @param bool $bypass Whether or not to bypass the precompiled arrays. * @return Array of UTF-8 characters to their corresponding ASCII, * which can be used to "undo" any overzealous iconv action. */ function testEncodingSupportsASCII($encoding, $bypass = false) { static $encodings = array(); if (!$bypass) { if (isset($encodings[$encoding])) return $encodings[$encoding]; $lenc = strtolower($encoding); switch ($lenc) { case 'shift_jis': return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'); case 'johab': return array("\xE2\x82\xA9" => '\\'); } if (strpos($lenc, 'iso-8859-') === 0) return array(); } $ret = array(); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); if (iconv('UTF-8', $encoding, 'a') === false) return false; for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars $c = chr($i); if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') { // Reverse engineer: what's the UTF-8 equiv of this byte // sequence? This assumes that there's no variable width // encoding that doesn't support ASCII. $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c; } } restore_error_handler(); $encodings[$encoding] = $ret; return $ret; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -