📄 idna_convert.class.php
字号:
/**
* Composes Hangul syllables from a decomposed sequence
* (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
* @param array Decomposed UCS4 sequence
* @return array UCS4 sequence with syllables composed
* @access private
*/
function _hangul_compose($input)
{
    // Walks the sequence once, merging an L+V pair into an LV syllable and an
    // LV+T pair into an LVT syllable. The base/count values are class properties
    // defined outside this method (presumably the Unicode Hangul constants - confirm).
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = $input[0];
    $result[] = $last; // copy first char from input to output
    for ($i = 1; $i < $inp_len; ++$i) {
        $char = $input[$i];
        // Find out whether the two current characters are L and V
        $lindex = $last - $this->_lbase;
        if (0 <= $lindex && $lindex < $this->_lcount) {
            $vindex = $char - $this->_vbase;
            if (0 <= $vindex && $vindex < $this->_vcount) {
                // create syllable of form LV
                $last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount);
                $result[count($result) - 1] = $last; // overwrite the L already emitted
                continue; // discard char
            }
        }
        // Find out whether the two current characters are LV and T
        $sindex = $last - $this->_sbase;
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0) {
            $tindex = $char - $this->_tbase;
            // Both bounds are exclusive: an offset of 0 would drop $char while
            // leaving the syllable unchanged, and an offset of _tcount would land
            // on the next LV syllable (LV syllables are spaced _tcount apart).
            if (0 < $tindex && $tindex < $this->_tcount) {
                // create syllable of form LVT
                $last += $tindex;
                $result[count($result) - 1] = $last; // overwrite the LV already emitted
                continue; // discard char
            }
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
/**
* Returns the combining class of a certain wide char
* @param integer Wide char to check (32bit integer)
* @return integer Combining class if found, else 0
* @access private
*/
function _get_combining_class($char)
{
    // Code points missing from the 'norm_combcls' table are reported as class 0.
    if (isset($this->_np_['norm_combcls'][$char])) {
        return $this->_np_['norm_combcls'][$char];
    }
    return 0;
}
/**
* Applies the canonical ordering to a decomposed UCS4 sequence
* @param array Decomposed UCS4 sequence
* @return array Ordered UCS4 sequence
* @access private
*/
function _apply_cannonical_ordering($input)
{
    // Reorders the sequence so that characters with a non-zero combining class
    // are never preceded by a character with a higher combining class.
    $size = count($input);
    // Nothing to reorder; also avoids reading $input[0] on an empty array.
    if ($size < 2) return $input;
    $swap = true;
    // Repeat full passes until one completes without moving anything
    while ($swap) {
        $swap = false;
        $last = $this->_get_combining_class($input[0]);
        for ($i = 0; $i < $size - 1; ++$i) {
            $next = $this->_get_combining_class($input[$i + 1]);
            if ($next != 0 && $last > $next) {
                // Move item leftward until it fits
                for ($j = $i + 1; $j > 0; --$j) {
                    if ($this->_get_combining_class($input[$j - 1]) <= $next) break;
                    $t = $input[$j];
                    $input[$j] = $input[$j - 1];
                    $input[$j - 1] = $t;
                    $swap = true;
                }
                // The character formerly at $i now sits at $i + 1, so its class
                // stays the reference value for the next comparison.
                $next = $last;
            }
            $last = $next;
        }
    }
    return $input;
}
/**
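* Do composition of a sequence of starter and non-starter
* @param array UCS4 Decomposed sequence
* @return mixed Composed UCS4 array if Hangul composition shortened the input, otherwise the key of the matching replacement map entry, or false if nothing matched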
* @access private
*/
function _combine($input)
{
    $inp_len = count($input);
    // For anything but a single character, try Hangul composition first and
    // hand back its result as soon as it shortened the sequence.
    if ($inp_len != 1) {
        $hangul = $this->_hangul_compose($input);
        if (count($hangul) != $inp_len) {
            return $hangul; // This place is probably wrong
        }
    }
    // Otherwise look for a replacement map entry identical to the input
    foreach ($this->_np_['replacemaps'] as $np_src => $np_target) {
        // Cheap rejections: first element differs, or lengths differ
        if ($np_target[0] != $input[0] || count($np_target) != $inp_len) {
            continue;
        }
        $matched = false;
        foreach ($input as $pos => $code) {
            $matched = ($code == $np_target[$pos]);
            if (!$matched) {
                break;
            }
        }
        if ($matched) {
            return $np_src;
        }
    }
    return false;
}
/**
* This converts an UTF-8 encoded string to its UCS-4 representation
* By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
* each of the "chars". This is due to PHP not being able to handle strings with
* bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
* The following UTF-8 encodings are supported:
* bytes bits representation
* 1 7 0xxxxxxx
* 2 11 110xxxxx 10xxxxxx
* 3 16 1110xxxx 10xxxxxx 10xxxxxx
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* Each x represents a bit that can be used to store character data.
* The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
* @access private
*/
function _utf8_to_ucs4($input)
{
    // Decodes a UTF-8 byte string into an array of integer code points.
    // Returns false (after calling $this->_error()) on malformed input.
    // NOTE(review): a multi-byte sequence cut off by the end of the input is not
    // reported; the partially decoded value stays in the result.
    $output = array();
    $out_len = 0;
    $inp_len = strlen($input);
    $mode = 'next'; // 'next': expecting a start byte; 'add': expecting continuation bytes
    $test = 'none'; // 'range': the next continuation byte still needs the legal-range check
    for ($k = 0; $k < $inp_len; ++$k) {
        // Square-bracket offset: the curly-brace form is a parse error since PHP 8.0
        $v = ord($input[$k]); // Extract byte from input string
        if ($v < 128) { // We found an ASCII char - put into string as is
            if ('add' == $mode) {
                // An ASCII byte may not interrupt a multi-byte sequence
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            $output[$out_len] = $v;
            ++$out_len;
            continue;
        }
        if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
            $start_byte = $v;
            $mode = 'add';
            $test = 'range';
            // $next_byte tells how many times the following continuation byte
            // must be shifted left by 6 bits; it counts down to -1.
            if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
                $next_byte = 0;
                $v = ($v - 192) << 6;
            } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                $next_byte = 1;
                $v = ($v - 224) << 12;
            } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 2;
                $v = ($v - 240) << 18;
            } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 3;
                $v = ($v - 248) << 24;
            } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 4;
                $v = ($v - 252) << 30;
            } else {
                $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                return false;
            }
            // Store the start byte's payload; continuation bytes are added onto it
            $output[$out_len] = (int) $v;
            ++$out_len;
            continue;
        }
        if ('add' == $mode) {
            // Unless overlong forms are allowed, range-check the first
            // continuation byte after an E0, F0 or F4 start byte.
            if (!$this->_allow_overlong && $test == 'range') {
                $test = 'none';
                if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                    $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    return false;
                }
            }
            if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                $v = ($v - 128) << ($next_byte * 6);
                $output[($out_len - 1)] += $v;
                --$next_byte;
            } else {
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            if ($next_byte < 0) {
                // Sequence complete - look for a new start byte
                $mode = 'next';
            }
        }
    } // for
    return $output;
}
/**
* Convert UCS-4 string into UTF-8 string
* See _utf8_to_ucs4() for details
* @access private
*/
function _ucs4_to_utf8($input)
{
    // Encodes an array of integer code points as a UTF-8 byte string, using
    // 1 to 6 bytes per value. Returns false (after calling $this->_error())
    // for a value that does not fit into 31 bits.
    $output = '';
    // The key is needed so the error message can name the offending position
    foreach ($input as $k => $v) {
        if ($v < 128) { // 7bit are transferred literally
            $output .= chr($v);
        } elseif ($v < (1 << 11)) { // 2 bytes
            $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
        } elseif ($v < (1 << 16)) { // 3 bytes
            $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
        } elseif ($v < (1 << 21)) { // 4 bytes
            $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
                     . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
        } elseif ($v < (1 << 26)) { // 5 bytes
            $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
                     . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
                     . chr(128 + ($v & 63));
        } elseif ($v < (1 << 31)) { // 6 bytes
            $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
                     . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
                     . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
        } else {
            $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
            return false;
        }
    }
    return $output;
}
/**
* Convert UCS-4 array into UCS-4 string
*
* @access private
*/
function _ucs4_to_ucs4_string($input)
{
    // Serialises every array value as four bytes, most significant byte first.
    $bytes = '';
    foreach ($input as $code) {
        // Shift each of the four octets down in turn and mask it with 255 (&11111111)
        for ($shift = 24; $shift >= 0; $shift -= 8) {
            $bytes .= chr(($code >> $shift) & 255);
        }
    }
    return $bytes;
}
/**
* Convert UCS-4 string into UCS-4 array
*
* @access private
*/
function _ucs4_string_to_ucs4($input)
{
    // Splits a byte string into an array of integers, reading four bytes per
    // value with the most significant byte first. Returns false (after calling
    // $this->_error()) if the length is not a multiple of 4.
    $output = array();
    $inp_len = strlen($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    // Empty input - return empty output
    if (!$inp_len) return $output;
    for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
        // Increment output position every 4 input bytes
        if (!($i % 4)) {
            $out_len++;
            $output[$out_len] = 0;
        }
        // Square-bracket offset: the curly-brace form is a parse error since PHP 8.0
        $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4)));
    }
    return $output;
}
}
/**
* Adapter class for aligning the API of idna_convert with that of
* Net_IDNA
* @author Matthias Sommerfeld <mso@phlylabs.de>
*/
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values, as listed by the original author:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *         to allow this, set this parameter to true, else to false;
     *         default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *         on failures; false: loose mode, ideal for "wildlife" applications
     *         by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   mixed     Whatever the delegated set_parameters() call returns
     *                     (documented by the original author as true on success, false otherwise)
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // NOTE(review): no 'IC' property is defined in the visible part of this file;
        // this presumably should delegate to a method inherited from idna_convert.
        // Confirm the intended target before relying on this adapter.
        $outcome = $this->IC->set_parameters($option, $param);
        return $outcome;
    }
}
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -