📄 utf_normalizer.php

📁 这些都是我以前学习时用到的源码
💻 PHP
📖 第 1 页 / 共 3 页
字号:
				}				else				{					// The char is not decomposable					$utf_seq = array($utf_char);				}				// STEP 2: Capture the starter				// Check out the combining class of the first character of the UTF sequence				$k = 0;				if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)				{					// Not a starter, inspect previous characters					// The last 8 characters are kept in a buffer so that we don't have to capture them everytime.					// This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,					// although it is slower than this method.					//					// In the following loop, $j starts at the previous buffered character ($i - 1, because current character is					// at offset $i) and process them in backward mode until we find a starter.					//					// $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more					// characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering					$starter_found = 0;					$j_min = max(1, $i - 7);					for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)					{						$utf_char = $buffer[$j & 7];						$lpos -= strlen($utf_char);						if (isset($decomp_map[$utf_char]))						{							// The char is a composite, decompose for storage							$decomp_seq = array();							$_pos = 0;							$_len = strlen($decomp_map[$utf_char]);							do							{								$c = $decomp_map[$utf_char][$_pos];								$_utf_len =& $utf_len_mask[$c & "\xF0"];								if (isset($_utf_len))								{									$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);									$_pos += $_utf_len;								}								else								{									$decomp_seq[] = $c;									++$_pos;								}							}							while ($_pos < $_len);							// Prepend the UTF sequence with our decomposed sequence							if (isset($decomp_seq[1]))							{								// The char expanded into several chars								$decomp_cnt = 
sizeof($decomp_seq);								foreach ($decomp_seq as $decomp_i => $decomp_char)								{									$utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;								}								$k -= $decomp_cnt;							}							else							{								// Decomposed to a single char, easier to prepend								$utf_seq[--$k] = $decomp_seq[0];							}						}						else						{							$utf_seq[--$k] = $utf_char;						}						if (!isset($utf_combining_class[$utf_seq[$k]]))						{							// We have found our starter							$starter_found = 1;							break;						}					}					if (!$starter_found && $lpos > $tmp_pos)					{						// The starter was not found in the buffer, let's rewind some more						do						{							// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.							$c = $str[--$lpos];							$c_mask = $c & "\xF0";							if (isset($utf_len_mask[$c_mask]))							{								// UTF byte								if ($utf_len = $utf_len_mask[$c_mask])								{									// UTF *leading* byte									$utf_char = substr($str, $lpos, $utf_len);									if (isset($decomp_map[$utf_char]))									{										// Decompose the character										$decomp_seq = array();										$_pos = 0;										$_len = strlen($decomp_map[$utf_char]);										do										{											$c = $decomp_map[$utf_char][$_pos];											$_utf_len =& $utf_len_mask[$c & "\xF0"];											if (isset($_utf_len))											{												$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);												$_pos += $_utf_len;											}											else											{												$decomp_seq[] = $c;												++$_pos;											}										}										while ($_pos < $_len);										// Prepend the UTF sequence with our decomposed sequence										if (isset($decomp_seq[1]))										{											// The char expanded into several chars											$decomp_cnt = sizeof($decomp_seq);											foreach ($decomp_seq as $decomp_i => $utf_char)											{												$utf_seq[$k + 
$decomp_i - $decomp_cnt] = $utf_char;											}											$k -= $decomp_cnt;										}										else										{											// Decomposed to a single char, easier to prepend											$utf_seq[--$k] = $decomp_seq[0];										}									}									else									{										$utf_seq[--$k] = $utf_char;									}								}							}							else							{								// ASCII char								$utf_seq[--$k] = $c;							}						}						while ($lpos > $tmp_pos);					}				}				// STEP 3: Capture following combining modifiers				while ($pos < $len)				{					$c_mask = $str[$pos] & "\xF0";					if (isset($utf_len_mask[$c_mask]))					{						if ($utf_len = $utf_len_mask[$c_mask])						{							$utf_char = substr($str, $pos, $utf_len);						}						else						{							// A trailing byte came out of nowhere							// Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop							// as if it was a starter (replacement chars ARE starters) and let the next loop replace it							break;						}						if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))						{							// Combining character, add it to the sequence and move the cursor							if (isset($decomp_map[$utf_char]))							{								// Decompose the character								$_pos = 0;								$_len = strlen($decomp_map[$utf_char]);								do								{									$c = $decomp_map[$utf_char][$_pos];									$_utf_len =& $utf_len_mask[$c & "\xF0"];									if (isset($_utf_len))									{										$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);										$_pos += $_utf_len;									}									else									{										$utf_seq[] = $c;										++$_pos;									}								}								while ($_pos < $_len);							}							else							{								$utf_seq[] = $utf_char;							}							$pos += $utf_len;						}						else						{							// Combining class 0 and no QC, break out of the loop							// Note: we do not know if that character is valid. 
If it's not, the next iteration will replace it							break;						}					}					else					{						// ASCII chars are starters						break;					}				}				// STEP 4: Sort and combine				// Here we sort...				$k_max = $k + sizeof($utf_seq);				if (!$k && $k_max == 1)				{					// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop						// Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases//						if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))//						{						$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];						$tmp_pos = $pos;//						}					continue;				}				// ...there we combine				if (isset($utf_combining_class[$utf_seq[$k]]))				{					$starter = $nf_seq = '';				}				else				{					$starter = $utf_seq[$k++];					$nf_seq = '';				}				$utf_sort = array();				// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine				// at the end of the string without altering it				$utf_seq[] = '';				do				{					$utf_char = $utf_seq[$k++];					if (isset($utf_combining_class[$utf_char]))					{						$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;					}					else					{						if (empty($utf_sort))						{							// No combining characters... check for a composite of the two starters							if (isset($utf_canonical_comp[$starter . $utf_char]))							{								// Good ol' composite character								$starter = $utf_canonical_comp[$starter . 
$utf_char];							}							else if (isset($utf_jamo_type[$utf_char]))							{								// Current char is a composable jamo								if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)								{									// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo									if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)									{										// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)										$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];										++$k;									}									else									{										// L+V jamos, combine to a LV Hangul syllable										$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];									}									$starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));								}								else								{									// Non-composable jamo, just add it to the sequence									$nf_seq .= $starter;									$starter = $utf_char;								}							}							else							{								// No composite, just add the first starter to the sequence then continue with the other one								$nf_seq .= $starter;								$starter = $utf_char;							}						}						else						{							ksort($utf_sort);							// For each class of combining characters							foreach ($utf_sort as $cc => $utf_chars)							{								$j = 0;								do								{									// Look for a composite									if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))									{										// Found a composite, replace the starter										$starter = $utf_canonical_comp[$starter . 
$utf_chars[$j]];										unset($utf_sort[$cc][$j]);									}									else									{										// No composite, all following characters in that class are blocked										break;									}								}								while (isset($utf_sort[$cc][++$j]));							}							// Add the starter to the normalized sequence, followed by non-starters in canonical order							$nf_seq .= $starter;							foreach ($utf_sort as $utf_chars)							{								if (!empty($utf_chars))								{									$nf_seq .= implode('', $utf_chars);								}							}							// Reset the array and go on							$utf_sort = array();							$starter = $utf_char;						}					}				}				while ($k <= $k_max);				$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;				$tmp_pos = $pos;			}			else			{				// Only a ASCII char can make the program get here				//				// First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().				//				// The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on				// multi-byte text (where the only ASCII chars are spaces and punctuation)				if (++$pos != $len)				{					if ($str[$pos] < "\x80")					{						$pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);						$buffer[++$i & 7] = $str[$pos - 1];					}					else					{						$buffer[++$i & 7] = $c;					}				}			}		}		while ($pos < $len);		// Now is time to return the string		if ($tmp_pos)		{			// If the $tmp_pos cursor is not at the beggining of the string then at least one character was not in normal form. Replace $str with the fixed version			if ($tmp_pos == $len)			{				// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str				return $tmp;			}			else			{				// The rightmost chunk of $str has not been appended to $tmp yet				return $tmp . 
substr($str, $tmp_pos);			}		}		// The string was already in normal form		return $str;	}	/**	* Decompose a UTF string	*	* @param	string	$str			UTF string	* @param	integer	$pos			Position of the first UTF char (in bytes)	* @param	integer	$len			Length of the string (in bytes)	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified	* @return	string					The string, decomposed and sorted canonically	*	* @access	private	*/	function decompose($str, $pos, $len, &$decomp_map)	{		global $utf_combining_class, $phpbb_root_path;		// Load some commonly-used tables		if (!isset($utf_combining_class))		{			include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.php');		}		// UTF char length array		$utf_len_mask = array(			// Leading bytes masks			"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,			// Trailing bytes masks			"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0		);		// Some extra checks are triggered on the first byte of a UTF sequence		$extra_check = array(			"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,			"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,			"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1		);		// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:		//   - 2-byte: 110? ???? 10?? ????		//   - 3-byte: 1110 ???? 10?? ???? 10?? ????		//   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????		
// Note that 5- and 6- byte sequences are automatically discarded		$utf_validation_mask = array(			2	=> "\xE0\xC0",			3	=> "\xF0\xC0\xC0",			4	=> "\xF8\xC0\xC0\xC0"		);		$utf_validation_check = array(			2	=> "\xC0\x80",			3	=> "\xE0\x80\x80",			4	=> "\xF0\x80\x80\x80"		);		$tmp = '';		$starter_pos = $pos;		$tmp_pos = $last_cc = $sort = $dump = 0;		$utf_sort = array();		// Main loop		do		{			// STEP 0: Capture the current char			$cur_mask = $str[$pos] & "\xF0";			if (isset($utf_len_mask[$cur_mask]))			{				if ($utf_len = $utf_len_mask[$cur_mask])				{					// Multibyte char					$utf_char = substr($str, $pos, $utf_len);					$pos += $utf_len;				}				else				{					// A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode					// replacement char and we will advance the cursor					$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);					if ($dump)					{						$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);						// Dump combiners						if (!empty($utf_sort))
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -