utfnormal.php

来自「php 开发的内容管理系统」· PHP 代码 · 共 793 行 · 第 1/2 页

PHP
793
字号
								$head = '';								continue;							}						} else {							# Slower, but rarer checks...							$n = ord( $head );							if(								# "Overlong sequences" are those that are syntactically								# correct but use more UTF-8 bytes than are necessary to								# encode a character. Na茂ve string comparisons can be								# tricked into failing to see a match for an ASCII								# character, for instance, which can be a security hole								# if blacklist checks are being used.							       ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)								|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)								|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.								|| ($n == 0xef &&									   ($sequence == UTF8_FFFE)									|| ($sequence == UTF8_FFFF) )								# Unicode has been limited to 21 bits; longer								# sequences are not allowed.								|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {								$replace[] = array( UTF8_REPLACEMENT,								                    $base + $i + 1 - strlen( $sequence ),								                    strlen( $sequence ) );								$head = '';								continue;							}						}					}					if( isset( $utfCheckOrCombining[$sequence] ) ) {						# If it's NO or MAYBE, we'll have to rip						# the string apart and put it back together.						# That's going to be mighty slow.						$looksNormal = false;					}					# The sequence is legal!					$head = '';				} elseif( $c < "\x80" ) {					# ASCII byte.					$head = '';				} elseif( $c < "\xc0" ) {					# Illegal tail bytes					if( $head == '' ) {						# Out of the blue!						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );					} else {						# Don't add if we're continuing a broken sequence;						# we already put a replacement character when we looked						# at the broken sequence.						$replace[] = array( '', $base + $i, 1 );					}				} else {					# Miscellaneous freaks.					
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					$head = '';
				}
			}
			$base += $chunk;
		}
		if( count( $replace ) ) {
			# There were illegal UTF-8 sequences we need to fix up.
			$out = '';
			$last = 0;
			foreach( $replace as $rep ) {
				list( $replacement, $start, $length ) = $rep;
				if( $last < $start ) {
					$out .= substr( $string, $last, $start - $last );
				}
				$out .= $replacement;
				$last = $start + $length;
			}
			if( $last < strlen( $string ) ) {
				$out .= substr( $string, $last );
			}
			$string = $out;
		}
		return $looksNormal;
	}

	# The four wrappers below run a normalization pass directly on the
	# given string. They skip validity checks and every shortcut, so the
	# input must already be VALID UTF-8!

	/**
	 * Normal form C: canonically decompose (NFD), then recompose.
	 *
	 * @param string $string Valid UTF-8 input
	 * @return string
	 * @private
	 */
	function NFC( $string ) {
		$decomposed = UtfNormal::NFD( $string );
		return UtfNormal::fastCompose( $decomposed );
	}

	/**
	 * Normal form D: decompose with the canonical decomposition map,
	 * then sort the combining characters.
	 *
	 * @param string $string Valid UTF-8 input
	 * @return string
	 * @private
	 */
	function NFD( $string ) {
		UtfNormal::loadData();
		global $utfCanonicalDecomp;
		$decomposed = UtfNormal::fastDecompose( $string, $utfCanonicalDecomp );
		return UtfNormal::fastCombiningSort( $decomposed );
	}

	/**
	 * Normal form KC: compatibility-decompose (NFKD), then recompose.
	 *
	 * @param string $string Valid UTF-8 input
	 * @return string
	 * @private
	 */
	function NFKC( $string ) {
		$decomposed = UtfNormal::NFKD( $string );
		return UtfNormal::fastCompose( $decomposed );
	}

	/**
	 * Normal form KD: decompose with the compatibility decomposition map,
	 * then sort the combining characters. The compatibility table is
	 * pulled in from UtfNormalDataK.inc the first time it is needed.
	 *
	 * @param string $string Valid UTF-8 input
	 * @return string
	 * @private
	 */
	function NFKD( $string ) {
		global $utfCompatibilityDecomp;
		if( !isset( $utfCompatibilityDecomp ) ) {
			# Table not loaded yet; fetch it on demand.
			require_once( 'UtfNormalDataK.inc' );
		}
		$decomposed = UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp );
		return UtfNormal::fastCombiningSort( $decomposed );
	}

	/**
	 * Perform decomposition of a UTF-8 string into either D or KD form
	 * (depending on which decomposition map is passed to us).
	 * Input is assumed to be *valid* UTF-8. Invalid code will break.
* @private
	 * @param string $string Valid UTF-8 string
	 * @param array $map hash of expanded decomposition map
	 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
	 */
	function fastDecompose( $string, &$map ) {
		UtfNormal::loadData();
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			# Square-bracket offsets are used throughout: the curly-brace
			# form ($string{$i}) is a parse error as of PHP 8.0.
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# ASCII chars never decompose
				$out .= $c;
				continue;
			} elseif( $n >= 0xf0 ) {
				# Lead byte of a four-byte sequence
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				# Lead byte of a three-byte sequence
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				# Lead byte of a two-byte sequence
				$c = substr( $string, $i, 2 );
				$i++;
			}

			if( isset( $map[$c] ) ) {
				# Table-driven decomposition
				$out .= $map[$c];
				continue;
			}

			if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
				# Decompose a hangul syllable into jamo;
				# hardcoded for three-byte UTF-8 sequence.
				# A lookup table would be slightly faster,
				# but adds a lot of memory & disk needs.
				#
				# Rebuild the code point from the three bytes, then
				# split it into lead (l), vowel (v) and tail (t) indexes.
				$index = ( (ord( $c[0] ) & 0x0f) << 12
				         | (ord( $c[1] ) & 0x3f) <<  6
				         | (ord( $c[2] ) & 0x3f) )
				       - UNICODE_HANGUL_FIRST;
				$l = intval( $index / UNICODE_HANGUL_NCOUNT );
				$v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
				$t = $index % UNICODE_HANGUL_TCOUNT;
				$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
				if( $t >= 25 ) {
					# Tail index 25 and up spills into the next
					# UTF-8 middle byte (\x87 instead of \x86).
					$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
				} elseif( $t ) {
					$out .= "\xe1\x86" . chr( 0xa7 + $t );
				}
				continue;
			}

			# Nothing to decompose; pass the sequence through untouched.
			$out .= $c;
		}
		return $out;
	}

	/**
	 * Sorts combining characters into canonical order. This is the
	 * final step in creating decomposed normal forms D and KD.
	 * @private
	 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
* @return string a UTF-8 string with combining characters sorted in canonical order
	 */
	function fastCombiningSort( $string ) {
		UtfNormal::loadData();
		global $utfCombiningClass;
		$len = strlen( $string );
		$out = '';
		# Pending combining characters, bucketed by combining class.
		$combiners = array();
		$lastClass = -1;
		for( $i = 0; $i < $len; $i++ ) {
			# Square-bracket offsets: the curly-brace form ($string{$i})
			# is a parse error as of PHP 8.0.
			$c = $string[$i];
			$n = ord( $c );
			if( $n >= 0x80 ) {
				if( $n >= 0xf0 ) {
					$c = substr( $string, $i, 4 );
					$i += 3;
				} elseif( $n >= 0xe0 ) {
					$c = substr( $string, $i, 3 );
					$i += 2;
				} elseif( $n >= 0xc0 ) {
					$c = substr( $string, $i, 2 );
					$i++;
				}
				if( isset( $utfCombiningClass[$c] ) ) {
					$lastClass = $utfCombiningClass[$c];
					# Append to this class's bucket, creating it on first
					# use rather than suppressing the undefined-index notice.
					if( isset( $combiners[$lastClass] ) ) {
						$combiners[$lastClass] .= $c;
					} else {
						$combiners[$lastClass] = $c;
					}
					continue;
				}
			}
			if( $lastClass ) {
				# Flush the pending combiners in class order before
				# emitting the next non-combining character.
				ksort( $combiners );
				$out .= implode( '', $combiners );
				$combiners = array();
			}
			$out .= $c;
			$lastClass = 0;
		}
		if( $lastClass ) {
			# Flush whatever is still pending at end of string.
			ksort( $combiners );
			$out .= implode( '', $combiners );
		}
		return $out;
	}

	/**
	 * Produces canonically composed sequences, i.e. normal form C or KC.
	 *
	 * @private
	 * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
	 * @return string a UTF-8 string with canonical precomposed characters used where possible
	 */
	function fastCompose( $string ) {
		UtfNormal::loadData();
		global $utfCanonicalComp, $utfCombiningClass;
		$len = strlen( $string );
		$out = '';
		$lastClass = -1;
		$lastHangul = 0;
		$startChar = '';
		$combining = '';
		# First-byte range used as a cheap pre-filter before the
		# full string comparisons in the Hangul branch below.
		$x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
		$x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# No combining characters here...
				$out .= $startChar;
				$out .= $combining;
				$startChar = $c;
				$combining = '';
				$lastClass = 0;
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			$pair = $startChar . $c;
			if( $n > 0x80 ) {
				if( isset( $utfCombiningClass[$c] ) ) {
					# A combining char; see what we can do with it
					$class = $utfCombiningClass[$c];
					# Compare against '' explicitly: empty() also treats
					# the one-character string "0" as empty.
					if( $startChar !== '' &&
						$lastClass < $class &&
						$class > 0 &&
						isset( $utfCanonicalComp[$pair] ) ) {
						$startChar = $utfCanonicalComp[$pair];
						$class = 0;
					} else {
						$combining .= $c;
					}
					$lastClass = $class;
					$lastHangul = 0;
					continue;
				}
			}
			# New start char
			if( $lastClass == 0 ) {
				if( isset( $utfCanonicalComp[$pair] ) ) {
					$startChar = $utfCanonicalComp[$pair];
					$lastHangul = 0;
					continue;
				}
				if( $n >= $x1 && $n <= $x2 ) {
					# WARNING: Hangul code is painfully slow.
					# The arithmetic is inlined on purpose: calling out to
					# helper functions is slower still, and lookup tables
					# are only marginally faster while needing a lot of space.
					#
					if( $c >= UTF8_HANGUL_VBASE &&
						$c <= UTF8_HANGUL_VEND &&
						$startChar >= UTF8_HANGUL_LBASE &&
						$startChar <= UTF8_HANGUL_LEND ) {
						# Lead jamo + vowel jamo -> syllable.
						#
						#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
						#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
						$lIndex = ord( $startChar[2] ) - 0x80;
						$vIndex = ord( $c[2]         ) - 0xa1;
						$hangulPoint = UNICODE_HANGUL_FIRST +
							UNICODE_HANGUL_TCOUNT *
							(UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
						# Hardcode the limited-range UTF-8 conversion:
						$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
									 chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
									 chr( $hangulPoint       & 0x3f | 0x80 );
						$lastHangul = 0;
						continue;
					} elseif( $c >= UTF8_HANGUL_TBASE &&
							  $c <= UTF8_HANGUL_TEND &&
							  $startChar >= UTF8_HANGUL_FIRST &&
							  $startChar <= UTF8_HANGUL_LAST &&
							  !$lastHangul ) {
						# Syllable + tail jamo -> syllable with tail.
						#
						# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
						$tIndex = ord( $c[2] ) - 0xa7;
						if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);
						# Increment the code point by $tIndex, without
						# the function overhead of decoding and recoding UTF-8
						#
						$tail = ord( $startChar[2] ) + $tIndex;
						if( $tail > 0xbf ) {
							# Carry out of the last byte into the middle byte,
							# and from there into the lead byte if needed.
							$tail -= 0x40;
							$mid = ord( $startChar[1] ) + 1;
							if( $mid > 0xbf ) {
								$startChar[0] = chr( ord( $startChar[0] ) + 1 );
								$mid -= 0x40;
							}
							$startChar[1] = chr( $mid );
						}
						$startChar[2] = chr( $tail );
						# If there's another jamo char after this, *don't* try to merge it.
						$lastHangul = 1;
						continue;
					}
				}
			}
			# Nothing composed: flush the previous starter with its
			# combiners and begin a new run with this character.
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			$lastHangul = 0;
		}
		$out .= $startChar . $combining;
		return $out;
	}

	/**
	 * This is just used for the benchmark, comparing how long it takes to
	 * iterate through a string without really doing anything of substance.
	 * @param string $string
	 * @return string a byte-for-byte copy of the input
	 */
	function placebo( $string ) {
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			$out .= $string[$i];
		}
		return $out;
	}
}
?>

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?