idna_convert.class.php

来自「Joomla15 - 最新开源CMS」· PHP 代码 · 共 991 行 · 第 1/3 页
PHP
991 行
    /**
    * Composes Hangul syllables
    * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
    * @param    array    Decomposed UCS4 sequence
    * @return   array    UCS4 sequence with syllables composed
    * @access   private
    */
    function _hangul_compose($input)
    {
        // Algorithmic Hangul composition. Walks the sequence once and merges
        //   - a leading consonant (L) followed by a vowel (V) into an LV syllable,
        //   - an LV syllable followed by a trailing consonant (T) into an LVT syllable.
        // Merging always overwrites the last element already written to $result,
        // so the returned array is never longer than $input. An empty input
        // yields an empty array.
        $inp_len = count($input);
        if (!$inp_len) return array();
        $result = array();
        $last = $input[0];
        $result[] = $last; // copy first char from input to output

        for ($i = 1; $i < $inp_len; ++$i) {
            $char = $input[$i];

            // Find out whether the two current characters are L and V
            $lindex = $last - $this->_lbase;
            if (0 <= $lindex && $lindex < $this->_lcount) {
                $vindex = $char - $this->_vbase;
                if (0 <= $vindex && $vindex < $this->_vcount) {
                    // create syllable of form LV
                    $last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount);
                    $result[count($result) - 1] = $last; // replace the L already emitted
                    continue; // discard char
                }
            }

            // Find out whether the two current characters are LV and T.
            // ($sindex % _tcount) == 0 means $last is a syllable without a trailing consonant.
            $sindex = $last - $this->_sbase;
            if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0) {
                $tindex = $char - $this->_tbase;
                // Both bounds are strict:
                //  - $tindex == _tcount would add a full block and produce the
                //    *next* LV syllable instead of an LVT form of this one;
                //  - $tindex == 0 would add nothing and silently drop $char.
                // The lower bound assumes _tbase follows the Unicode definition of
                // TBase (one below the first trailing consonant) - confirm against
                // the property declarations of this class.
                if (0 < $tindex && $tindex < $this->_tcount) {
                    // create syllable of form LVT
                    $last += $tindex;
                    $result[count($result) - 1] = $last; // replace the LV already emitted
                    continue; // discard char
                }
            }
            // if neither case was true, just add the character
            $last = $char;
            $result[] = $char;
        }
        return $result;
    }

    /**
    * Returns the combining class of a certain wide char
    * @param    integer    Wide char to check (32bit integer)
    * @return   integer    Combining class if found, else 0
    * @access   private
    */
    function _get_combining_class($char)
    {
        // Characters without an entry in the 'norm_combcls' table have class 0
        if (!isset($this->_np_['norm_combcls'][$char])) {
            return 0;
        }
        return $this->_np_['norm_combcls'][$char];
    }

    /**
    * Applies the canonical ordering to a decomposed UCS4 sequence
    * @param    array      Decomposed UCS4 sequence
    * @return   array      Ordered UCS4 sequence
    * @access   private
    */
    function _apply_cannonical_ordering($input)
    {
        // Reorders the sequence by combining class: an item with a non-zero
        // class that follows an item with a higher class is moved leftward
        // until the item before it has a class that is lower or equal.
        // Passes over the sequence are repeated until one pass makes no swap.
        // Returns the reordered copy of $input.
        $size = count($input);
        // Fewer than two items cannot be out of order; returning here also
        // avoids reading $input[0] from an empty array.
        if ($size < 2) return $input;

        $swap = true;
        while ($swap) {
            $swap = false;
            $last = $this->_get_combining_class($input[0]);
            for ($i = 0; $i < $size - 1; ++$i) {
                $next = $this->_get_combining_class($input[$i+1]);
                if ($next != 0 && $last > $next) {
                    // Move item leftward until it fits
                    for ($j = $i + 1; $j > 0; --$j) {
                        if ($this->_get_combining_class($input[$j - 1]) <= $next) break;
                        $t = $input[$j];
                        $input[$j] = $input[$j - 1];
                        $input[$j - 1] = $t;
                        $swap = true;
                    }
                    // The item formerly at $i now sits at $i + 1, so its class
                    // ($last) is carried over unchanged into the next iteration
                    $next = $last;
                }
                $last = $next;
            }
        }
        return $input;
    }

    /**
    * Do composition of a sequence of starter and non-starter
    * @param    array      UCS4 Decomposed sequence
    * @return   mixed      Composed Hangul sequence (array), the key of the matching 'replacemaps' entry, or false if nothing matches
    * @access   private
    */
    function _combine($input)
    {
        // Tries to compose the given sequence into a single unit.
        // Returns one of:
        //   - the array produced by _hangul_compose() when it shortened the input,
        //   - the key of the 'replacemaps' entry whose value equals the input,
        //   - false when neither applies (including for an empty input).
        $inp_len = count($input);
        // Nothing can match an empty sequence; returning here also avoids
        // reading $input[0] from an empty array below.
        if (!$inp_len) return false;

        // Is it a Hangul syllable?
        if (1 != $inp_len) {
            $hangul = $this->_hangul_compose($input);
            // NOTE: this returns the whole composed array, whereas the table
            // lookup below returns a single key. The original author marked
            // this place as "probably wrong"; kept as is for callers.
            if (count($hangul) != $inp_len) return $hangul;
        }
        foreach ($this->_np_['replacemaps'] as $np_src => $np_target) {
            // Cheap rejections first: different first element or different length
            if ($np_target[0] != $input[0]) continue;
            if (count($np_target) != $inp_len) continue;
            // Element-wise comparison; $input is non-empty here
            $hit = true;
            foreach ($input as $k2 => $v2) {
                if ($v2 != $np_target[$k2]) {
                    $hit = false;
                    break;
                }
            }
            if ($hit) return $np_src;
        }
        return false;
    }

    /**
    * This converts an UTF-8 encoded string to its UCS-4 representation
    * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
    * each of the "chars". This is due to PHP not being able to handle strings with
    * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
    * The following UTF-8 encodings are supported:
    * bytes bits  representation
    * 1        7  0xxxxxxx
    * 2       11  110xxxxx 10xxxxxx
    * 3       16  1110xxxx 10xxxxxx 10xxxxxx
    * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    * Each x represents a bit that can be used to store character data.
    * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
    * @access   private
    */
    function _utf8_to_ucs4($input)
    {
        // Decodes a UTF-8 byte string into an array of integer code points.
        // Returns that array, or false (after reporting through $this->_error())
        // when the input is malformed.
        // $mode is 'next' while a start byte or an ASCII byte is expected and
        // 'add' while continuation bytes of the current character are outstanding.
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';
        $test = 'none';
        for ($k = 0; $k < $inp_len; ++$k) {
            // Square-bracket offset: the curly-brace form is deprecated as of
            // PHP 7.4 and no longer parses in PHP 8
            $v = ord($input[$k]);

            if ($v < 128) { // We found an ASCII char - put into string as is
                if ('add' == $mode) {
                    // ASCII byte where a continuation byte was required
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                $output[$out_len] = $v;
                ++$out_len;
                continue;
            }
            if ('next' == $mode) { // Start byte expected; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                // $next_byte tells how many times the bit mask of the next
                // continuation byte must be shifted 6 bits to the left
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
                    $next_byte = 0;
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 3;
                    $v = ($v - 248) << 24;
                } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 4;
                    $v = ($v - 252) << 30;
                } else {
                    $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                    return false;
                }
                // Store the payload bits of the start byte; continuation bytes are added onto it
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }

            // From here on $mode is 'add': $v must be a continuation byte
            if (!$this->_allow_overlong && $test == 'range') {
                // Range test applies to the first continuation byte only
                $test = 'none';
                // NOTE(review): only the start bytes 0xE0, 0xF0 and 0xF4 are
                // range-checked here; overlong two-byte forms (start bytes
                // 0xC0/0xC1) are not rejected by this condition.
                if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                    $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    return false;
                }
            }
            if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                $v = ($v - 128) << ($next_byte * 6);
                $output[($out_len - 1)] += $v;
                --$next_byte;
            } else {
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            if ($next_byte < 0) {
                // Character complete; expect a new start byte or ASCII next
                $mode = 'next';
            }
        } // for

        if ('add' == $mode) {
            // Input ended in the middle of a multi-byte character; without this
            // check the partially assembled code point would be returned as valid
            $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$inp_len);
            return false;
        }
        return $output;
    }

    /**
    * Convert UCS-4 string into UTF-8 string
    * See _utf8_to_ucs4() for details
    * @access   private
    */
    function _ucs4_to_utf8($input)
    {
        // Encodes an array of integer code points as a UTF-8 byte string,
        // using 1 to 6 bytes per value depending on its magnitude.
        // Returns the string, or false (after reporting through
        // $this->_error()) when a value does not fit into 31 bits.
        $output = '';
        // The key is needed for the error message below; it was previously
        // referenced there without ever being defined
        foreach ($input as $k => $v) {
            if ($v < 128) { // 7bit are transferred literally
                $output .= chr($v);
            } elseif ($v < (1 << 11)) { // 2 bytes
                $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 16)) { // 3 bytes
                $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 21)) { // 4 bytes
                $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 26)) { // 5 bytes
                $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
                         . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
                         . chr(128 + ($v & 63));
            } elseif ($v < (1 << 31)) { // 6 bytes
                $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
                         . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } else {
                // $k is the index of the offending element in $input
                $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                return false;
            }
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UCS-4 string
     *
     * @access   private
     */
    function _ucs4_to_ucs4_string($input)
    {
        // Each array value becomes four bytes, most significant byte first
        $bytes = '';
        foreach ($input as $code_point) {
            for ($shift = 24; $shift >= 0; $shift -= 8) {
                // 0xFF keeps only the lowest eight bits after shifting
                $bytes .= chr(($code_point >> $shift) & 0xFF);
            }
        }
        return $bytes;
    }

    /**
     * Convert UCS-4 string into UCS-4 array
     *
     * @access   private
     */
    function _ucs4_string_to_ucs4($input)
    {
        // Splits a byte string into groups of four bytes and turns each group
        // into one integer, most significant byte first.
        // Returns the array of integers (empty for empty input), or false
        // (after reporting through $this->_error()) when the length is not a
        // multiple of 4.
        $output = array();

        $inp_len = strlen($input);
        // Input length must be divisible by 4
        if ($inp_len % 4) {
            $this->_error('Input UCS4 string is broken');
            return false;
        }

        // Square-bracket offsets: the curly-brace form is deprecated as of
        // PHP 7.4 and no longer parses in PHP 8
        for ($i = 0; $i < $inp_len; $i += 4) {
            $output[] = (ord($input[$i]) << 24)
                      | (ord($input[$i + 1]) << 16)
                      | (ord($input[$i + 2]) << 8)
                      | ord($input[$i + 3]);
        }
        return $output;
    }
}

/**
* Adapter class for aligning the API of idna_convert with that of
* Net_IDNA
* @author  Matthias Sommerfeld <mso@phlylabs.de>
*/
class Net_IDNA_php4 extends idna_convert
{
    /**
    * Sets a new option value by forwarding to set_parameters() on $this->IC.
    * Available options and values:
    * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
    *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
    * [overlong - Unicode does not allow unnecessarily long encodings of chars,
    *             to allow this, set this parameter to true, else to false;
    *             default is false.]
    * [strict - true: strict mode, good for registration purposes - causes errors
    *           on failures; false: loose mode, ideal for "wildlife" applications
    *           by silently ignoring errors and returning the original input instead]
    *
    * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
    * @param    string    Value to use (if parameter 1 is a string)
    * @return   boolean   true on success, false otherwise
    * @access   public
    */
    function setParams($option, $param = false)
    {
        // NOTE(review): this class extends idna_convert, yet the call goes
        // through an $IC property that is not declared in this class. Confirm
        // that the parent sets $this->IC and that set_parameters() exists on it.
        return $this->IC->set_parameters($option, $param);
    }
}

?>
idna_convert.class.php - 源码说明

本页面展示了「Joomla15 - 最新开源CMS」中的 idna_convert.class.php 源码文件，采用 PHP 编程语言编写，共 991 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Joomla相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?