sanitizer.php
来自「php 开发的内容管理系统」· PHP 代码 · 共 1,185 行 · 第 1/3 页
PHP
1,185 行
# Single-quoted
			return $set[4];
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize whitespace and character references in an XML source-
	 * encoded text for an attribute value.
	 *
	 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
	 * but note that we're not returning the value, but are returning
	 * XML source fragments that will be slapped into output.
	 *
	 * Each whitespace character (a CR/LF pair counts as one) becomes a
	 * single space, and literal double quotes are emitted as &quot; so
	 * the result cannot terminate a double-quoted attribute early.
	 *
	 * @param string $text
	 * @return string
	 * @private
	 */
	function normalizeAttributeValue( $text ) {
		$normalized = Sanitizer::normalizeCharReferences( $text );
		$normalized = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $normalized );
		# The replacement must be the entity, not a bare quote; replacing
		# '"' with '"' would leave the quote unescaped.
		return str_replace( '"', '&quot;', $normalized );
	}

	/**
	 * Ensure that any entities and character references are legal
	 * for XML and XHTML specifically. Any stray bits will be
	 * &-escaped to result in a valid text fragment.
	 *
	 * a. any named char refs must be known in XHTML
	 * b. any numeric char refs must be legal chars, not invalid or forbidden
	 * c. use &#x, not &#X
	 * d. fix or reject non-valid attributes
	 *
	 * @param string $text
	 * @return string
	 * @private
	 */
	function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			MW_CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * Group 1 is handed to normalizeEntity(), group 2 to
	 * decCharReference(), and groups 3 and 4 to hexCharReference().
	 * When no group yields a valid reference, the whole match is
	 * HTML-escaped instead.
	 *
	 * @param array $matches Match groups from MW_CHAR_REFS_REGEX
	 * @return string
	 */
	function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		} elseif( $matches[4] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[4] );
		}
		if( is_null( $ret ) ) {
			# Not a usable reference: escape it so it is shown literally.
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the named entity reference as is. Otherwise, returns
	 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
	 *
	 * @param string $name
	 * @return string
	 */
	function normalizeEntity( $name ) {
		global $wgHtmlEntities;
		if( isset( $wgHtmlEntities[$name] ) ) {
			return "&$name;";
		} else {
			# Unknown name: escape the ampersand so the output stays a
			# valid text fragment instead of an undefined entity.
			return "&amp;$name;";
		}
	}

	/**
	 * Re-emit a decimal character reference in canonical form.
	 *
	 * @param string $codepoint Decimal digits of the reference
	 * @return string|null The reference (eg &#65;), or null when the
	 *   code point fails validateCodepoint()
	 */
	function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Re-emit a hexadecimal character reference in canonical
	 * lower-case form.
	 *
	 * @param string $codepoint Hex digits of the reference
	 * @return string|null The reference (eg &#x41;), or null when the
	 *   code point fails validateCodepoint()
	 */
	function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Returns true if a given Unicode codepoint is a valid character in XML.
	 *
	 * Accepts tab, LF, CR, U+0020..U+D7FF, U+E000..U+FFFD and
	 * U+10000..U+10FFFF; everything else is rejected.
	 *
	 * @param int $codepoint
	 * @return bool
	 */
	function validateCodepoint( $codepoint ) {
		return ($codepoint == 0x09)
			|| ($codepoint == 0x0a)
			|| ($codepoint == 0x0d)
			|| ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
			|| ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
			|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
	}

	/**
	 * Decode any character references, numeric or named entities,
	 * in the text and return a UTF-8 string.
	 *
	 * @param string $text
	 * @return string
	 * @public
	 */
	function decodeCharReferences( $text ) {
		return preg_replace_callback(
			MW_CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for decodeCharReferences().
	 *
	 * Group 1 is handed to decodeEntity(), group 2 is decoded as a
	 * decimal code point, and groups 3 and 4 as hexadecimal ones.
	 *
	 * @param array $matches Match groups from MW_CHAR_REFS_REGEX
	 * @return string
	 */
	function decodeCharReferencesCallback( $matches ) {
		if( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			return Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif( $matches[3] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
		} elseif( $matches[4] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[4] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Return UTF-8 string for a codepoint if that is a valid
	 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
	 *
	 * @param int $codepoint
	 * @return string
	 * @private
	 */
	function decodeChar( $codepoint ) {
		if( Sanitizer::validateCodepoint( $codepoint ) ) {
			return codepointToUtf8( $codepoint );
		} else {
			return UTF8_REPLACEMENT;
		}
	}

	/**
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the UTF-8 encoding of that character. Otherwise, returns
	 * pseudo-entity source (eg &foo;)
	 *
	 * @param string $name
	 * @return string
	 */
	function decodeEntity( $name ) {
		global $wgHtmlEntities;
		if( isset( $wgHtmlEntities[$name] ) ) {
			return codepointToUtf8( $wgHtmlEntities[$name] );
		} else {
			return "&$name;";
		}
	}

	/**
	 * Fetch the whitelist of acceptable attributes for a given
	 * element name.
	 *
	 * The full table is built once by setupAttributeWhitelist() and
	 * kept in a static variable for later calls.
	 *
	 * @param string $element
	 * @return array Attribute names; empty for an unlisted element
	 */
	function attributeWhitelist( $element ) {
		static $list;
		if( !isset( $list ) ) {
			$list = Sanitizer::setupAttributeWhitelist();
		}
		return isset( $list[$element] )
			? $list[$element]
			: array();
	}

	/**
	 * Build the map of element name => list of permitted attribute names.
	 *
	 * @return array
	 */
	function setupAttributeWhitelist() {
		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
		$block = array_merge( $common, array( 'align' ) );
		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
		$tablecell = array(
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor' # deprecated
		);

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: http://www.w3.org/TR/html4/
		$whitelist = array (
			# 7.5.4
			'div' => $block,
			'center' => $common, # deprecated
			'span' => $block, # ??

			# 7.5.5
			'h1' => $block,
			'h2' => $block,
			'h3' => $block,
			'h4' => $block,
			'h5' => $block,
			'h6' => $block,

			# 7.5.6
			# address

			# 8.2.4
			# bdo

			# 9.2.1
			'em' => $common,
			'strong' => $common,
			'cite' => $common,
			# dfn
			'code' => $common,
			# samp
			# kbd
			'var' => $common,
			# abbr
			# acronym

			# 9.2.2
			'blockquote' => array_merge( $common, array( 'cite' ) ),
			# q

			# 9.2.3
			'sub' => $common,
			'sup' => $common,

			# 9.3.1
			'p' => $block,

			# 9.3.2
			'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

			# 9.3.4
			'pre' => array_merge( $common, array( 'width' ) ),

			# 9.4
			'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
			'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

			# 10.2
			'ul' => array_merge( $common, array( 'type' ) ),
			'ol' => array_merge( $common, array( 'type', 'start' ) ),
			'li' => array_merge( $common, array( 'type', 'value' ) ),

			# 10.3
			'dl' => $common,
			'dd' => $common,
			'dt' => $common,

			# 11.2.1
			'table' => array_merge( $common,
				array( 'summary', 'width', 'border', 'frame',
					'rules', 'cellspacing', 'cellpadding',
					'align', 'bgcolor' ) ),

			# 11.2.2
			'caption' => array_merge( $common, array( 'align' ) ),

			# 11.2.3
			'thead' => array_merge( $common, $tablealign ),
			'tfoot' => array_merge( $common, $tablealign ),
			'tbody' => array_merge( $common, $tablealign ),

			# 11.2.4
			'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
			'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

			# 11.2.5
			'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

			# 11.2.6
			'td' => array_merge( $common, $tablecell, $tablealign ),
			'th' => array_merge( $common, $tablecell, $tablealign ),

			# 15.2.1
			'tt' => $common,
			'b' => $common,
			'i' => $common,
			'big' => $common,
			'small' => $common,
			'strike' => $common,
			's' => $common,
			'u' => $common,

			# 15.2.2
			'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
			# basefont

			# 15.3
			'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

			# XHTML Ruby annotation text module, simple ruby only.
			# http://www.w3c.org/TR/ruby/
			'ruby' => $common,
			# rbc
			# rtc
			'rb' => $common,
			'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
			'rp' => $common,
		);
		return $whitelist;
	}

	/**
	 * Take a fragment of (potentially invalid) HTML and return
	 * a version with any tags removed, encoded suitably for literal
	 * inclusion in an attribute value.
	 *
	 * @param string $text HTML fragment
	 * @return string
	 */
	function stripAllTags( $text ) {
		# Actual <tags>
		$text = preg_replace( '/ < .*? > /x', '', $text );

		# Normalize &entities and whitespace
		$text = Sanitizer::normalizeAttributeValue( $text );

		# Will be placed into "double-quoted" attributes,
		# make sure remaining bits are safe.
		$text = str_replace(
			array( '<', '>', '"' ),
			array( '&lt;', '&gt;', '&quot;' ),
			$text );

		return $text;
	}

	/**
	 * Hack up a private DOCTYPE with HTML's standard entity declarations.
	 * PHP 4 seemed to know these if you gave it an HTML doctype, but
	 * PHP 5.1 doesn't.
	 *
	 * Use for passing XHTML fragments to PHP's XML parsing functions
	 *
	 * @return string
	 * @static
	 */
	function hackDocType() {
		global $wgHtmlEntities;
		$out = "<!DOCTYPE html [\n";
		# One <!ENTITY> declaration per known entity, defined as its
		# numeric character reference.
		foreach( $wgHtmlEntities as $entity => $codepoint ) {
			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
		}
		$out .= "]>\n";
		return $out;
	}
}
?>
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?