sanitizer.php
来自「php 开发的内容管理系统」· PHP 代码 · 共 1,185 行 · 第 1/3 页
PHP
1,185 行
# and see if we find a match below them
								$optstack = array();
								array_push ($optstack, $ot);
								while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
										in_array($ot, $htmlsingleallowed) ) {
									array_push ($optstack, $ot);
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = 1;
									while ( $ot = @array_pop( $optstack ) ) {
										array_push( $tagstack, $ot );
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
									$badtag = 1;
								}
							}
						} else {
							if ( $t == 'table' ) {
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( in_array( $t, $tabletags ) &&
						! in_array( 'table', $tagstack ) ) {
							$badtag = 1;
						} else if ( in_array( $t, $tagstack ) &&
						! in_array ( $t , $htmlnest ) ) {
							$badtag = 1 ;
						# Is it a self closed htmlpair ? (bug 5487)
						} else if( $brace == '/>' &&
						in_array($t, $htmlpairs) ) {
							$badtag = 1;
						} elseif( in_array( $t, $htmlsingleonly ) ) {
							# Hack to force empty tag for uncloseable elements
							$brace = '/>';
						} else if( in_array( $t, $htmlsingle ) ) {
							# Hack to not close $htmlsingle tags
							$brace = NULL;
						} else {
							if ( $t == 'table' ) {
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( ! $badtag ) {
						# A stray '>' in the trailing text must not be read as markup.
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Rejected or unknown tag: emit it escaped so it shows as literal text.
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) {
					$tagstack = array_pop( $tablestack );
				}
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
				@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
				} else {
					# Unknown element: emit it escaped so it shows as literal text.
					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
				}
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Remove '<!--', '-->', and everything between.
	 * To avoid leaving blank lines, when a comment is both preceded
	 * and followed by a newline (ignoring spaces), trim leading and
	 * trailing spaces and one of the newlines.
	 *
	 * An unterminated comment stops processing; the remaining text is
	 * returned as it stands.
	 *
	 * @private
	 * @param string $text
	 * @return string
	 */
	function removeHTMLcomments( $text ) {
		$fname='Parser::removeHTMLcomments';
		wfProfileIn( $fname );
		while (($start = strpos($text, '<!--')) !== false) {
			$end = strpos($text, '-->', $start + 4);
			if ($end === false) {
				# Unterminated comment; bail out
				break;
			}

			# Step past the closing '-->'
			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max($start - 1, 0);
			$spaceLen = $end - $spaceStart;
			while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
				$spaceStart--;
				$spaceLen++;
			}
			while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
				$spaceLen++;
			if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
			} else {
				# Remove just the comment.
				$text = substr_replace($text, '', $start, $end - $start);
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Take an array of attribute names and values and normalize or discard
	 * illegal values for the given element type.
	 *
	 * - Discards attributes not on a whitelist for the given element
	 * - Unsafe style attributes are discarded
	 * - 'id' values are passed through Sanitizer::escapeId()
	 *
	 * @param array $attribs
	 * @param string $element
	 * @return array
	 *
	 * @todo Check for legal values where the DTD limits things.
	 * @todo Check for unique id attribute :P
	 */
	function validateTagAttributes( $attribs, $element ) {
		# Flip so that whitelist membership is a key lookup
		$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
		$out = array();
		foreach( $attribs as $attribute => $value ) {
			if( !isset( $whitelist[$attribute] ) ) {
				continue;
			}
			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
				if( $value === false ) {
					# haxx0r
					continue;
				}
			}

			if ( $attribute === 'id' )
				$value = Sanitizer::escapeId( $value );

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}
		return $out;
	}

	/**
	 * Pick apart some CSS and check it for forbidden or unsafe structures.
	 * Returns a sanitized string, or false if it was just too evil.
	 *
	 * Currently URL references, 'expression', 'tps' are forbidden.
	 *
	 * The returned string has character references decoded and CSS comments
	 * replaced by a space; CSS backslash escapes are only decoded on a
	 * scratch copy used for the forbidden-pattern check.
	 *
	 * @param string $value
	 * @return mixed
	 */
	static function checkCss( $value ) {
		$stripped = Sanitizer::decodeCharReferences( $value );

		// Remove any comments; IE gets token splitting wrong
		$stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
		$value = $stripped;

		// ... and continue checks
		// Decode CSS hex escapes (\HHHHHH plus one optional whitespace char).
		// A callback is used because the /e (eval) modifier no longer exists
		// in current PHP versions.
		$stripped = preg_replace_callback(
			'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
			array( 'Sanitizer', 'cssDecodeCallback' ),
			$stripped );
		$stripped = str_replace( '\\', '', $stripped );
		if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
				$stripped ) ) {
			# haxx0r
			return false;
		}

		return $value;
	}

	/**
	 * Regex replace callback for checkCss(): turn the hex code point
	 * captured from a CSS backslash escape into its UTF-8 character.
	 *
	 * @param array $matches
	 * @return string
	 * @private
	 */
	static function cssDecodeCallback( $matches ) {
		return codepointToUtf8( hexdec( $matches[1] ) );
	}

	/**
	 * Take a tag soup fragment listing an HTML element's attributes
	 * and normalize it to well-formed XML, discarding unwanted attributes.
	 * Output is safe for further wikitext processing, with escaping of
	 * values that could trigger problems.
	 *
	 * - Normalizes attribute names to lowercase
	 * - Discards attributes not on a whitelist for the given element
	 * - Turns broken or invalid entities into plaintext
	 * - Double-quotes all attribute values
	 * - Attributes without values are given the name as attribute
	 * - Double attributes are discarded
	 * - Unsafe style attributes are discarded
	 * - Prepends space if there are attributes.
	 *
	 * @param string $text
	 * @param string $element
	 * @return string
	 */
	function fixTagAttributes( $text, $element ) {
		if( trim( $text ) == '' ) {
			return '';
		}

		$stripped = Sanitizer::validateTagAttributes(
			Sanitizer::decodeTagAttributes( $text ), $element );

		$attribs = array();
		foreach( $stripped as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Encode an attribute value for HTML output.
	 * @param $text
	 * @return HTML-encoded text fragment
	 */
	function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Encode an attribute value for HTML tags, with extra armoring
	 * against further wiki processing.
	 * @param $text
	 * @return HTML-encoded text fragment
	 */
	function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Given a value escape it so that it can be used in an id attribute and
	 * return it, this does not validate the value however (see first link)
	 *
	 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
	 *                                                          in the id and
	 *                                                          name attributes
	 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
	 *
	 * @bug 4461
	 *
	 * @static
	 *
	 * @param string $id
	 * @return string
	 */
	function escapeId( $id ) {
		# Applied to the urlencoded id: '%3A' goes back to ':' and every
		# remaining '%' becomes '.'
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );

		return str_replace( array_keys( $replace ), array_values( $replace ), $id );
	}

	/**
	 * Regex replace callback for armoring links against further processing.
	 * Replaces each ':' in the matched URL protocol with its numeric
	 * character reference.
	 *
	 * @param array $matches
	 * @return string
	 * @private
	 */
	function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Return an associative array of attribute names and values from
	 * a partial tag string. Attribute names are forced to lowercase,
	 * character references are decoded to UTF-8 text.
	 *
	 * @param string
	 * @return array
	 */
	function decodeTagAttributes( $text ) {
		$attribs = array();

		if( trim( $text ) == '' ) {
			return $attribs;
		}

		$pairs = array();
		if( !preg_match_all(
			MW_ATTRIBS_REGEX,
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			// A repeated attribute name overwrites the earlier entry.
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Pick the appropriate attribute value from a match set from the
	 * MW_ATTRIBS_REGEX matches.
	 *
	 * @param array $set
	 * @return string
	 * @private
	 */
	function getTagAttributeCallback( $set ) {
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif( isset( $set[4] ) ) {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?