📄 utf_normalizer.php
字号:
{ if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } $tmp .= str_repeat(UTF8_REPLACEMENT, $spn); $dump = $sort = 0; } else { $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn); } $pos += $spn; $tmp_pos = $starter_pos = $pos; $utf_sort = array(); $last_cc = 0; continue; } // STEP 1: Decide what to do with current char // Now, in that order: // - check if that character is decomposable // - check if that character is a non-starter // - check if that character requires extra checks to be performed if (isset($decomp_map[$utf_char])) { // Decompose the char $_pos = 0; $_len = strlen($decomp_map[$utf_char]); do { $c = $decomp_map[$utf_char][$_pos]; $_utf_len =& $utf_len_mask[$c & "\xF0"]; if (isset($_utf_len)) { $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len); $_pos += $_utf_len; if (isset($utf_combining_class[$_utf_char])) { // The character decomposed to a non-starter, buffer it for sorting $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char; if ($utf_combining_class[$_utf_char] < $last_cc) { // Not canonically ordered, will require sorting $sort = $dump = 1; } else { $dump = 1; $last_cc = $utf_combining_class[$_utf_char]; } } else { // This character decomposition contains a starter, dump the buffer and continue if ($dump) { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); // Dump combiners if (!empty($utf_sort)) { if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } $tmp .= $_utf_char; $dump = $sort = 0; } else { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char; } $tmp_pos = $starter_pos = $pos; $utf_sort = array(); $last_cc = 0; } } else { // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue ++$_pos; if ($dump) { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); // Dump combiners if (!empty($utf_sort)) { if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } $tmp .= $c; $dump = $sort = 0; } else { $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c; } $tmp_pos = $starter_pos = $pos; $utf_sort = array(); $last_cc = 0; } } while ($_pos < $_len); } else if (isset($utf_combining_class[$utf_char])) { // Combining character if ($utf_combining_class[$utf_char] < $last_cc) { // Not in canonical order $sort = $dump = 1; } else { $last_cc = $utf_combining_class[$utf_char]; } $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char; } else { // Non-decomposable starter, check out if it's a Hangul syllable if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST) { // Nope, regular UTF char, check that we have the correct number of trailing bytes if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len]) { // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char // has been encoded in a five- or six- byte sequence. // Move the cursor back to its original position then advance it to the position it should really be at $pos -= $utf_len; $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } // Add a replacement char then another replacement char for every trailing byte. // // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos); $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1); $dump = $sort = 0; $pos += $spn; $tmp_pos = $pos; continue; } if (isset($extra_check[$utf_char[0]])) { switch ($utf_char[0]) { // Note: 0xED is quite common in Korean case "\xED": if ($utf_char >= "\xED\xA0\x80") { // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF) $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; // Note: 0xEF is quite common in Japanese case "\xEF": if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF") { // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF) $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xC0": case "\xC1": if ($utf_char <= "\xC1\xBF") { // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xE0": if ($utf_char <= "\xE0\x9F\xBF") { // Unicode char U+0000..U+07FF encoded in 3 bytes $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; case "\xF0": if ($utf_char <= "\xF0\x8F\xBF\xBF") { // Unicode char U+0000..U+FFFF encoded in 4 bytes $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; default: if ($utf_char > UTF8_MAX) { // Out of the Unicode range $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); if (!empty($utf_sort)) { ksort($utf_sort); foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } $utf_sort = array(); } $tmp .= UTF8_REPLACEMENT; $dump = $sort = 0; $tmp_pos = $starter_pos = $pos; continue 2; } break; } } } else { // Hangul syllable $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE; // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase). // // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte if ($t_index = $idx % UNICODE_HANGUL_TCOUNT) { if ($t_index < 25) { $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00"; $utf_char[8] = chr(0xA7 + $t_index); } else { $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00"; $utf_char[8] = chr(0x67 + $t_index); } } else { $utf_char = "\xE1\x84\x00\xE1\x85\x00"; } $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT)); $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT)); // Just like other decompositions, the resulting Jamos must be dumped to the tmp string $dump = 1; } // Do we need to dump stuff to the tmp string? if ($dump) { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); // Dump combiners if (!empty($utf_sort)) { if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } $tmp .= $utf_char; $dump = $sort = 0; $tmp_pos = $pos; } $last_cc = 0; $utf_sort = array(); $starter_pos = $pos; } } else { // ASCII char, which happens to be a starter (as any other ASCII char) if ($dump) { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); // Dump combiners if (!empty($utf_sort)) { if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } $tmp .= $str[$pos]; $dump = $sort = 0; $tmp_pos = ++$pos; $pos += strspn($str, UTF8_ASCII_RANGE, $pos); } else { $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos); } $last_cc = 0; $utf_sort = array(); $starter_pos = $pos; } } while ($pos < $len); // Now is time to return the string if ($dump) { $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); // Dump combiners if (!empty($utf_sort)) { if ($sort) { ksort($utf_sort); } foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } } return $tmp; } else if ($tmp_pos) { // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version if ($tmp_pos == $len) { // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str return $tmp; } else { // The rightmost chunk of $str has not been appended to $tmp yet return $tmp . substr($str, $tmp_pos); } } // The string was already in normal form return $str; }}?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -