sanitizer.php

来自「php 开发的内容管理系统」· PHP 代码 · 共 1,185 行 · 第 1/3 页

PHP
1,185
字号
								# and see if we find a match below them								$optstack = array();								array_push ($optstack, $ot);								while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&												in_array($ot, $htmlsingleallowed) ) {									array_push ($optstack, $ot);								}								if ( $t != $ot ) {									# No match. Push the optinal elements back again									$badtag = 1;									while ( $ot = @array_pop( $optstack ) ) {										array_push( $tagstack, $ot );									}								}							} else {								@array_push( $tagstack, $ot );								# <li> can be nested in <ul> or <ol>, skip those cases:								if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {									$badtag = 1;								}							}						} else {							if ( $t == 'table' ) {								$tagstack = array_pop( $tablestack );							}						}						$newparams = '';					} else {						# Keep track for later						if ( in_array( $t, $tabletags ) &&						! in_array( 'table', $tagstack ) ) {							$badtag = 1;						} else if ( in_array( $t, $tagstack ) &&						! in_array ( $t , $htmlnest ) ) {							$badtag = 1 ;						#聽Is it a self closed htmlpair ? (bug 5487)						} else if( $brace == '/>' &&						in_array($t, $htmlpairs) ) {							$badtag = 1;						} elseif( in_array( $t, $htmlsingleonly ) ) {							# Hack to force empty tag for uncloseable elements							$brace = '/>';						} else if( in_array( $t, $htmlsingle ) ) {							# Hack to not close $htmlsingle tags							$brace = NULL;						} else {							if ( $t == 'table' ) {								array_push( $tablestack, $tagstack );								$tagstack = array();							}							array_push( $tagstack, $t );						}						# Replace any variables or template parameters with						# plaintext results.						if( is_callable( $processCallback ) ) {							call_user_func_array( $processCallback, array( &$params, $args ) );						}						# Strip non-approved attributes from the tag						$newparams = Sanitizer::fixTagAttributes( $params, $t );					}					if ( ! 
$badtag ) {						$rest = str_replace( '>', '&gt;', $rest );						$close = ( $brace == '/>' ) ? ' /' : '';						$text .= "<$slash$t$newparams$close>$rest";						continue;					}				}				$text .= '&lt;' . str_replace( '>', '&gt;', $x);			}			# Close off any remaining tags			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {				$text .= "</$t>\n";				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }			}		} else {			# this might be possible using tidy itself			foreach ( $bits as $x ) {				preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',				$x, $regs );				@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {					if( is_callable( $processCallback ) ) {						call_user_func_array( $processCallback, array( &$params, $args ) );					}					$newparams = Sanitizer::fixTagAttributes( $params, $t );					$rest = str_replace( '>', '&gt;', $rest );					$text .= "<$slash$t$newparams$brace$rest";				} else {					$text .= '&lt;' . str_replace( '>', '&gt;', $x);				}			}		}		wfProfileOut( $fname );		return $text;	}	/**	 * Remove '<!--', '-->', and everything between.	 * To avoid leaving blank lines, when a comment is both preceded	 * and followed by a newline (ignoring spaces), trim leading and	 * trailing spaces and one of the newlines.	 
*
	 * The scan restarts from the beginning of the text after every removal.
	 * It stops at the first comment opener that has no matching '-->' and
	 * leaves that unterminated comment, and everything after it, untouched.
	 *
	 * @private
	 * @param string $text
	 * @return string
	 */
	function removeHTMLcomments( $text ) {
		# NOTE(review): the profiling label says 'Parser' although the other
		# methods here are addressed as Sanitizer::...; kept as is so that
		# existing profiler keys do not change.
		$fname='Parser::removeHTMLcomments';
		wfProfileIn( $fname );
		while (($start = strpos($text, '<!--')) !== false) {
			$end = strpos($text, '-->', $start + 4);
			if ($end === false) {
				# Unterminated comment; bail out
				break;
			}

			# Make $end point just past the closing '-->'
			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline.
			# $spaceStart begins on the character right before the comment
			# and walks left over spaces; $spaceLen grows so that
			# $spaceStart + $spaceLen is the first character after the
			# comment and any spaces trailing it.
			$spaceStart = max($start - 1, 0);
			$spaceLen = $end - $spaceStart;
			while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
				$spaceStart--;
				$spaceLen++;
			}
			while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
				$spaceLen++;
			if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				# ($spaceLen + 1 also swallows the trailing newline.)
				$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
			}
			else {
				# Remove just the comment.
				$text = substr_replace($text, '', $start, $end - $start);
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Take an array of attribute names and values and normalize or discard
	 * illegal values for the given element type.
	 *
	 * - Discards attributes not on a whitelist for the given element
	 * - Unsafe style attributes are discarded
	 * - 'id' values are passed through Sanitizer::escapeId()
	 *
	 * Declared static because it is invoked as
	 * Sanitizer::validateTagAttributes() and never touches $this.
	 *
	 * @static
	 * @param array $attribs Map of attribute name => value
	 * @param string $element Element name used to look up the whitelist
	 * @return array Map of the surviving attribute names => cleaned values
	 *
	 * @todo Check for legal values where the DTD limits things.
	 * @todo Check for unique id attribute :P
	 */
	static function validateTagAttributes( $attribs, $element ) {
		# Flip so that membership is a cheap isset() on the attribute name
		$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
		$out = array();
		foreach( $attribs as $attribute => $value ) {
			if( !isset( $whitelist[$attribute] ) ) {
				continue;
			}
			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if( $attribute === 'style' ) {
				$value = Sanitizer::checkCss( $value );
				if( $value === false ) {
					# haxx0r
					continue;
				}
			}

			if ( $attribute === 'id' )
				$value = Sanitizer::escapeId( $value );

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}
		return $out;
	}

	/**
	 * Pick apart some CSS and check it for forbidden or unsafe structures.
	 * Returns a sanitized string, or false if it was just too evil.
	 *
	 * Currently URL references, 'expression', 'tps' are forbidden.
	 *
	 * The returned string is the input with character references decoded
	 * and CSS comments replaced by a single space. The forbidden-pattern
	 * test runs on a further-decoded copy (CSS hex escapes resolved, stray
	 * backslashes dropped) that is never returned.
	 *
	 * @param string $value
	 * @return mixed Sanitized string, or boolean false when rejected
	 */
	static function checkCss( $value ) {
		$stripped = Sanitizer::decodeCharReferences( $value );

		// Remove any comments; IE gets token splitting wrong.
		// The 's' modifier lets '.' match newlines, so a comment that
		// contains a line break is removed too instead of surviving and
		// hiding a split keyword such as ex/*<newline>*/pression.
		$stripped = preg_replace( '!/\\*.*?\\*/!sS', ' ', $stripped );

		$value = $stripped;

		// ... and continue checks
		// Resolve CSS hex escapes: backslash, 1-6 hex digits, one optional
		// whitespace terminator. Done with a callback because the /e
		// (eval) pattern modifier no longer exists in PHP 7 and later.
		$stripped = preg_replace_callback(
			'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
			array( 'Sanitizer', 'cssDecodeCallback' ),
			$stripped );
		$stripped = str_replace( '\\', '', $stripped );
		if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
				$stripped ) ) {
			# haxx0r
			return false;
		}

		return $value;
	}

	/**
	 * Regex replace callback for checkCss(): turn one matched CSS hex
	 * escape into the UTF-8 text of that code point.
	 *
	 * @private
	 * @param array $matches $matches[1] holds the hex digits of the escape
	 * @return string
	 */
	static function cssDecodeCallback( $matches ) {
		return codepointToUtf8( hexdec( $matches[1] ) );
	}

	/**
	 * Take a tag soup fragment listing an HTML element's attributes
	 * and normalize it to well-formed XML, discarding unwanted attributes.
	 * Output is safe for further wikitext processing, with escaping of
	 * values that could trigger problems.
	 *
	 * - Normalizes attribute names to lowercase
	 * - Discards attributes not on a whitelist for the given element
	 * - Turns broken or invalid entities into plaintext
	 * - Double-quotes all attribute values
	 * - Attributes without values are given the name as attribute
	 * - Double attributes are discarded
	 * - Unsafe style attributes are discarded
	 * - Prepends space if there are attributes.
*
	 * Declared static because it is invoked as Sanitizer::fixTagAttributes()
	 * and never touches $this.
	 *
	 * @static
	 * @param string $text Raw attribute portion of a tag
	 * @param string $element Element name the attributes belong to
	 * @return string Empty string, or the attributes preceded by one space
	 */
	static function fixTagAttributes( $text, $element ) {
		if( trim( $text ) == '' ) {
			return '';
		}

		# Parse the raw text into name => value pairs, then drop or clean
		# whatever is not acceptable for this element.
		$stripped = Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ), $element );

		$attribs = array();
		foreach( $stripped as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Encode an attribute value for HTML output.
	 *
	 * Declared static because it is invoked as Sanitizer::encodeAttribute().
	 *
	 * @static
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Encode an attribute value for HTML tags, with extra armoring
	 * against further wiki processing.
	 *
	 * Declared static because it is invoked as
	 * Sanitizer::safeEncodeAttribute().
	 *
	 * @static
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		# Every match of the wfUrlProtocols() pattern gets its colons
		# replaced by armorLinksCallback().
		$encValue = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Given a value escape it so that it can be used in an id attribute and
	 * return it, this does not validate the value however (see first link)
	 *
	 * Steps, in order: spaces become underscores, character references
	 * are decoded, the result is URL-encoded, then '%3A' is turned back
	 * into ':' and every remaining '%' becomes '.'.
	 *
	 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
	 *                                                          in the id and
	 *                                                          name attributes
	 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
	 *
	 * @bug 4461
	 *
	 * @static
	 *
	 * @param string $id
	 * @return string
	 */
	static function escapeId( $id ) {
		# Order matters: '%3A' must be handled before the bare '%'
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );

		return str_replace( array_keys( $replace ), array_values( $replace ), $id );
	}

	/**
	 * Regex replace callback for armoring links against further processing.
	 *
	 * Declared static because it is registered as the static callable
	 * array( 'Sanitizer', 'armorLinksCallback' ).
	 *
	 * @static
	 * @param array $matches $matches[1] is the captured text
	 * @return string The captured text with each ':' replaced by '&#58;'
	 * @private
	 */
	static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Return an associative array of attribute names and values from
	 * a partial tag string. Attribute names are forced to lowercase,
	 * character references are decoded to UTF-8 text.
*
	 * Runs of tab, CR, LF and space inside a value are collapsed to one
	 * space and the value is trimmed before character references are
	 * decoded. When the same attribute name occurs more than once, the
	 * last occurrence wins.
	 *
	 * Declared static because it is invoked as
	 * Sanitizer::decodeTagAttributes() and never touches $this.
	 *
	 * @static
	 * @param string $text
	 * @return array Map of lowercased attribute name => decoded value
	 */
	static function decodeTagAttributes( $text ) {
		$attribs = array();

		if( trim( $text ) == '' ) {
			return $attribs;
		}

		$pairs = array();
		# No match at all (or a regex failure) yields an empty result
		if( !preg_match_all(
			MW_ATTRIBS_REGEX,
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Pick the appropriate attribute value from a match set from the
	 * MW_ATTRIBS_REGEX matches.
	 *
	 * NOTE(review): decodeTagAttributes() invokes this as
	 * Sanitizer::getTagAttributeCallback(); it presumably should be
	 * declared static as well. Confirm that the remainder of the body,
	 * which continues past this excerpt, does not use $this.
	 *
	 * @param array $set
	 * @return string
	 * @private
	 */
	function getTagAttributeCallback( $set ) {
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif( isset( $set[4] ) ) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?