📄 encoder.php

📁 很棒的在线教学系统
💻 PHP
📖 第 1 页 / 共 2 页
字号:
上一页 12
                    }                } else {                    // ((0xC0 & (*in) != 0x80) && (mState != 0))                    // Incomplete multi-octet sequence.                    // used to result in complete fail, but we'll reset                    $mState = 0;                    $mUcs4  = 0;                    $mBytes = 1;                    $char ='';                }            }        }        return $out;    }        /**     * Translates a Unicode codepoint into its corresponding UTF-8 character.     * @static     * @note Based on Feyd's function at     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,     *       which is in public domain.     * @note While we're going to do code point parsing anyway, a good     *       optimization would be to refuse to translate code points that     *       are non-SGML characters.  However, this could lead to duplication.     * @note This is very similar to the unichr function in     *       maintenance/generate-entity-file.php (although this is superior,     *       due to its sanity checks).     
*/
    
    // +----------+----------+----------+----------+
    // | 33222222 | 22221111 | 111111   |          |
    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
    // +----------+----------+----------+----------+
    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
    // +----------+----------+----------+----------+
    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
    // +----------+----------+----------+----------+
    
    function unichr($code) {
        // Only Unicode scalar values are encodable: nothing negative,
        // nothing above U+10FFFF, and nothing in the surrogate block
        // U+D800..U+DFFF. Everything else yields an empty string.
        $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
        if ($code < 0 || $code > 0x10FFFF || $is_surrogate) {
            return '';
        }
        
        // 1 byte: regular ASCII character, emitted as-is
        if ($code < 0x80) {
            return chr($code);
        }
        
        // Every multi-byte form ends with a continuation byte (10xxxxxx)
        // carrying the low six bits of the code point.
        $last = chr(($code & 0x3F) | 0x80);
        
        // 2 bytes: 110yyyyy 10xxxxxx
        if ($code < 0x800) {
            $lead = (($code & 0x7FF) >> 6) | 0xC0;
            return chr($lead) . $last;
        }
        
        // 3- and 4-byte forms share the 10yyyyyy continuation byte
        $middle = chr((($code & 0xFC0) >> 6) | 0x80);
        
        // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
        if ($code < 0x10000) {
            $lead = (($code >> 12) & 0x0F) | 0xE0;
            return chr($lead) . $middle . $last;
        }
        
        // 4 bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
        $second = (($code >> 12) & 0x3F) | 0x80;
        $lead   = (($code >> 18) & 0x07) | 0xF0;
        return chr($lead) . chr($second) . $middle . $last;
    }
    
    /**
     * Converts a string to UTF-8 based on 
configuration.
     * @static
     * @param $str String in the configured encoding
     * @param $config Configuration object; this method reads
     *        get('Core', 'Encoding') and get('Test', 'ForceNoIconv')
     * @param $context Context reference (not read by this method)
     * @returns The string converted to UTF-8; $str itself when the
     *          configured encoding is already 'utf-8'. Nothing is returned
     *          when the encoding is unsupported (an E_USER_ERROR is raised).
     */
    function convertToUTF8($str, $config, &$context) {
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        // Probe for iconv only once per request
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            $str = iconv($encoding, 'utf-8//IGNORE', $str);
            // If the string is bjorked by Shift_JIS or a similar encoding
            // that doesn't support all of ASCII, convert the naughty
            // characters to their true byte-wise ASCII/UTF-8 equivalents.
            $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
            // testEncodingSupportsASCII() yields false instead of an array
            // when iconv cannot handle the encoding; strtr() must not be
            // handed that value.
            if (is_array($ascii_fix)) $str = strtr($str, $ascii_fix);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_encode($str);
            restore_error_handler();
            return $str;
        }
        // Pop the handler installed above BEFORE raising the error:
        // otherwise it stays on the handler stack after this call and is
        // the one that receives (presumably mutes) the error below.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }
    
    /**
     * Converts a string from UTF-8 based on configuration.
     * @static
     * @note Currently, this is a lossy conversion, with unexpressable
     *       characters being omitted.
     
*/
    function convertFromUTF8($str, $config, &$context) {
        // $config supplies Core.Encoding (target encoding),
        // Core.EscapeNonASCIICharacters and Test.ForceNoIconv;
        // $context is accepted but not read here.
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        // Probe for iconv only once per request
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
            // Turn every non-ASCII character into a numeric entity first
            $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
        }
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            // Undo our previous fix in convertToUTF8, otherwise iconv will barf
            $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
            // The helper yields false (not an array) when iconv cannot
            // handle the encoding, and an empty array when nothing needs
            // fixing; in both cases there is nothing to translate and
            // array_flip()/strtr() must not be fed a non-array.
            if (is_array($ascii_fix) && !empty($ascii_fix)) {
                if (!$escape) {
                    // Drop the UTF-8 look-alikes so they cannot collide
                    // with the ASCII characters re-mapped just below
                    $clear_fix = array();
                    foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                    $str = strtr($str, $clear_fix);
                }
                $str = strtr($str, array_flip($ascii_fix));
            }
            // Normal stuff
            $str = iconv('utf-8', $encoding . '//IGNORE', $str);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_decode($str);
            restore_error_handler();
            return $str;
        }
        // Pop the handler installed above BEFORE raising the error:
        // otherwise it stays on the handler stack after this call and is
        // the one that receives (presumably mutes) the error below.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }
    
    /**
     * Lossless (character-wise) conversion of HTML to ASCII
     * @static
     * @param $str UTF-8 string to be converted to ASCII
     * @returns ASCII encoded string with non-ASCII character entity-ized
     * @warning Adapted from MediaWiki, claiming fair use: this is a common
     *       algorithm. If you disagree with this license fudgery,
     *       implement it yourself.
     * @note Uses decimal numeric entities since they are best supported.
     * @note This is a DUMB function: it has no concept of keeping
     *       character entities that the projected character encoding
     *       can allow. 
We could possibly implement a smart version
     *       but that would require it to also know which Unicode
     *       codepoints the charset supported (not an easy task).
     * @note Similar to cleanUTF8(), but it assumes that $str is
     *       well-formed UTF-8
     */
    function convertToASCIIDumbLossless($str) {
        $out       = '';
        $codepoint = 0; // code point being assembled from a multi-byte sequence
        $pending   = 0; // continuation bytes still expected for that sequence
        $total     = strlen($str);
        for ($idx = 0; $idx < $total; ++$idx) {
            $byte = ord($str[$idx]);
            
            // 0xxx xxxx: ASCII passes through untouched
            if ($byte < 0x80) {
                $out .= chr($byte);
                $pending = 0;
                continue;
            }
            
            // 11xx xxxx: lead byte; keep its payload bits and note how
            // many continuation bytes follow
            if ($byte >= 0xC0) {
                if ($byte < 0xE0) {        // 110x xxxx
                    $codepoint = $byte & 0x1F;
                    $pending = 1;
                } elseif ($byte < 0xF0) {  // 1110 xxxx
                    $codepoint = $byte & 0x0F;
                    $pending = 2;
                } else {                   // 1111 0xxx
                    $codepoint = $byte & 0x07;
                    $pending = 3;
                }
                continue;
            }
            
            // 10xx xxxx: continuation byte; append its six payload bits
            // and emit a decimal entity once the sequence is complete
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
            $pending--;
            if ($pending <= 0) {
                $out .= '&#' . $codepoint . ';';
            }
        }
        return $out;
    }
    
    /**
     * This expensive function tests whether or not a given character
     * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
     * fail this test, and require special processing. Variable width
     * encodings shouldn't ever fail.
     * 
     * @param string $encoding Encoding name to test, as per iconv format
     * @param bool $bypass Whether or not to bypass the precompiled arrays.
     * @return Array of UTF-8 characters to their corresponding ASCII,
     *      which can be used to "undo" any overzealous iconv action;
     *      false if iconv cannot convert to the encoding at all.
     
*/
    function testEncodingSupportsASCII($encoding, $bypass = false) {
        // Per-request cache of computed results, keyed by the encoding
        // name exactly as passed in
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) return $encodings[$encoding];
            // Precompiled answers for known encodings, so the expensive
            // iconv probing below can be skipped
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) return array();
        }
        $ret = array();
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if (iconv('UTF-8', $encoding, 'a') === false) {
            // iconv cannot convert to this encoding at all. Pop the
            // handler installed above before bailing out, otherwise it
            // stays on the handler stack after this call returns.
            restore_error_handler();
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
            $c = chr($i);
            if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
                // Reverse engineer: what's the UTF-8 equiv of this byte
                // sequence? This assumes that there's no variable width
                // encoding that doesn't support ASCII.
                $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
            }
        }
        restore_error_handler();
        $encodings[$encoding] = $ret;
        return $ret;
    }
    
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -