📄 utf_normalizer.php

📁 这些都是我以前学习时用到的源码
💻 PHP
📖 第 1 页 / 共 3 页
字号:
上一页 1 2 3
						{							if ($sort)							{								ksort($utf_sort);							}							foreach ($utf_sort as $utf_chars)							{								$tmp .= implode('', $utf_chars);							}						}						$tmp .= str_repeat(UTF8_REPLACEMENT, $spn);						$dump = $sort = 0;					}					else					{						$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);					}					$pos += $spn;					$tmp_pos = $starter_pos = $pos;					$utf_sort = array();					$last_cc = 0;					continue;				}				// STEP 1: Decide what to do with current char				// Now, in that order:				//  - check if that character is decomposable				//  - check if that character is a non-starter				//  - check if that character requires extra checks to be performed				if (isset($decomp_map[$utf_char]))				{					// Decompose the char					$_pos = 0;					$_len = strlen($decomp_map[$utf_char]);					do					{						$c = $decomp_map[$utf_char][$_pos];						$_utf_len =& $utf_len_mask[$c & "\xF0"];						if (isset($_utf_len))						{							$_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);							$_pos += $_utf_len;							if (isset($utf_combining_class[$_utf_char]))							{								// The character decomposed to a non-starter, buffer it for sorting								$utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;								if ($utf_combining_class[$_utf_char] < $last_cc)								{									// Not canonically ordered, will require sorting									$sort = $dump = 1;								}								else								{									$dump = 1;									$last_cc = $utf_combining_class[$_utf_char];								}							}							else							{								// This character decomposition contains a starter, dump the buffer and continue								if ($dump)								{									$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);									// Dump combiners									if (!empty($utf_sort))									{										if ($sort)										{											ksort($utf_sort);										}										foreach ($utf_sort as $utf_chars)										{											$tmp .= implode('', $utf_chars);										}									
}									$tmp .= $_utf_char;									$dump = $sort = 0;								}								else								{									$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;								}								$tmp_pos = $starter_pos = $pos;								$utf_sort = array();								$last_cc = 0;							}						}						else						{							// This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue							++$_pos;							if ($dump)							{								$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);								// Dump combiners								if (!empty($utf_sort))								{									if ($sort)									{										ksort($utf_sort);									}									foreach ($utf_sort as $utf_chars)									{										$tmp .= implode('', $utf_chars);									}								}								$tmp .= $c;								$dump = $sort = 0;							}							else							{								$tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;							}							$tmp_pos = $starter_pos = $pos;							$utf_sort = array();							$last_cc = 0;						}					}					while ($_pos < $_len);				}				else if (isset($utf_combining_class[$utf_char]))				{					// Combining character					if ($utf_combining_class[$utf_char] < $last_cc)					{						// Not in canonical order						$sort = $dump = 1;					}					else					{						$last_cc = $utf_combining_class[$utf_char];					}					$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;				}				else				{					// Non-decomposable starter, check out if it's a Hangul syllable					if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)					{						// Nope, regular UTF char, check that we have the correct number of trailing bytes						if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])						{							// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char							// has been encoded in a five- or six- byte sequence.							
// Move the cursor back to its original position then advance it to the position it should really be at							$pos -= $utf_len;							$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);							if (!empty($utf_sort))							{								ksort($utf_sort);								foreach ($utf_sort as $utf_chars)								{									$tmp .= implode('', $utf_chars);								}								$utf_sort = array();							}							// Add a replacement char then another replacement char for every trailing byte.							//							// @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this							$spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);							$tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);							$dump = $sort = 0;							$pos += $spn;							$tmp_pos = $pos;							continue;						}						if (isset($extra_check[$utf_char[0]]))						{							switch ($utf_char[0])							{								// Note: 0xED is quite common in Korean								case "\xED":									if ($utf_char >= "\xED\xA0\x80")									{										// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;								// Note: 0xEF is quite common in Japanese								case "\xEF":									if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")									{										// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= 
implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;								case "\xC0":								case "\xC1":									if ($utf_char <= "\xC1\xBF")									{										// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;								case "\xE0":									if ($utf_char <= "\xE0\x9F\xBF")									{										// Unicode char U+0000..U+07FF encoded in 3 bytes										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;								case "\xF0":									if ($utf_char <= "\xF0\x8F\xBF\xBF")									{										// Unicode char U+0000..U+FFFF encoded in 4 bytes										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;								
default:									if ($utf_char > UTF8_MAX)									{										// Out of the Unicode range										$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);										if (!empty($utf_sort))										{											ksort($utf_sort);											foreach ($utf_sort as $utf_chars)											{												$tmp .= implode('', $utf_chars);											}											$utf_sort = array();										}										$tmp .= UTF8_REPLACEMENT;										$dump = $sort = 0;										$tmp_pos = $starter_pos = $pos;										continue 2;									}								break;							}						}					}					else					{						// Hangul syllable						$idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;						// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).						//						// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte						if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)						{							if ($t_index < 25)							{								$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";								$utf_char[8] = chr(0xA7 + $t_index);							}							else							{								$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";								$utf_char[8] = chr(0x67 + $t_index);							}						}						else						{							$utf_char = "\xE1\x84\x00\xE1\x85\x00";						}						$utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));						$utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));						// Just like other decompositions, the resulting Jamos must be dumped to the tmp string						$dump = 1;					}					// Do we need to dump stuff to the tmp string?					
if ($dump)					{						$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);						// Dump combiners						if (!empty($utf_sort))						{							if ($sort)							{								ksort($utf_sort);							}							foreach ($utf_sort as $utf_chars)							{								$tmp .= implode('', $utf_chars);							}						}						$tmp .= $utf_char;						$dump = $sort = 0;						$tmp_pos = $pos;					}					$last_cc = 0;					$utf_sort = array();					$starter_pos = $pos;				}			}			else			{				// ASCII char, which happens to be a starter (as any other ASCII char)				if ($dump)				{					$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);					// Dump combiners					if (!empty($utf_sort))					{						if ($sort)						{							ksort($utf_sort);						}						foreach ($utf_sort as $utf_chars)						{							$tmp .= implode('', $utf_chars);						}					}					$tmp .= $str[$pos];					$dump = $sort = 0;					$tmp_pos = ++$pos;					$pos += strspn($str, UTF8_ASCII_RANGE, $pos);				}				else				{					$pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);				}				$last_cc = 0;				$utf_sort = array();				$starter_pos = $pos;			}		}		while ($pos < $len);		// Now is time to return the string		if ($dump)		{			$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);			// Dump combiners			if (!empty($utf_sort))			{				if ($sort)				{					ksort($utf_sort);				}				foreach ($utf_sort as $utf_chars)				{					$tmp .= implode('', $utf_chars);				}			}			return $tmp;		}		else if ($tmp_pos)		{			// If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version			if ($tmp_pos == $len)			{				// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str				return $tmp;			}			else			{				// The rightmost chunk of $str has not been appended to $tmp yet				return $tmp . substr($str, $tmp_pos);			}		}		// The string was already in normal form		return $str;	}}?>
上一页 1 2 3
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -