📄 sanitizer.php

📁 汉字转拼音jar包
💻 PHP
字号:
<?php

/**
 * Regular expression matching the character references handled by
 * Sanitizer::decodeCharReferences (the original MediaWiki comment also
 * names Sanitizer::normalizeCharReferences).
 *
 * The pattern is compiled in extended mode (/x): whitespace and line breaks
 * inside it are ignored, which is why each literal '#' is escaped as '\#'.
 *
 * Capture groups, in order of the alternatives:
 *   1 - name of a named entity, e.g. "amp" in "&amp;"
 *   2 - digits of a decimal reference, e.g. "65" in "&#65;"
 *   3 - body of a hexadecimal reference written with a lowercase "x"
 *   4 - body of a hexadecimal reference written with an uppercase "X"
 *   5 - a lone ampersand that starts none of the above
 *
 * Note that groups 3 and 4 accept any ASCII letter or digit, not only
 * hexadecimal digits.
 */
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Za-z]+);
	 |&\#X([0-9A-Za-z]+);
	 |(&)/x' );

/**
 * UTF-8 byte sequence (EF BF BD) of U+FFFD REPLACEMENT CHARACTER, i.e. the
 * value of codepointToUtf8( UNICODE_REPLACEMENT ).
 *
 * Sanitizer::decodeChar returns it in place of a numeric character
 * reference whose code point fails Sanitizer::validateCodepoint.
 */
define( 'UTF8_REPLACEMENT', "\xef\xbf\xbd");

/**
 * Class borrowed from Mediawiki, based on the following files:
 * Sanitizer.php, SpecialUpload.php, UtfNormal.php, UtfNormalUtil.php
 *
 */
class Sanitizer {

	/**
	 * Named character entities defined in HTML 4.01, mapped to the Unicode
	 * code point each one stands for.
	 * http://www.w3.org/TR/html4/sgml/entities.html
	 *
	 * @var array entity name (case-sensitive) => integer code point
	 */
	public $htmlEntities = array(
		'Aacute'   => 193,
		'aacute'   => 225,
		'Acirc'    => 194,
		'acirc'    => 226,
		'acute'    => 180,
		'AElig'    => 198,
		'aelig'    => 230,
		'Agrave'   => 192,
		'agrave'   => 224,
		'alefsym'  => 8501,
		'Alpha'    => 913,
		'alpha'    => 945,
		'amp'      => 38,
		'and'      => 8743,
		'ang'      => 8736,
		'Aring'    => 197,
		'aring'    => 229,
		'asymp'    => 8776,
		'Atilde'   => 195,
		'atilde'   => 227,
		'Auml'     => 196,
		'auml'     => 228,
		'bdquo'    => 8222,
		'Beta'     => 914,
		'beta'     => 946,
		'brvbar'   => 166,
		'bull'     => 8226,
		'cap'      => 8745,
		'Ccedil'   => 199,
		'ccedil'   => 231,
		'cedil'    => 184,
		'cent'     => 162,
		'Chi'      => 935,
		'chi'      => 967,
		'circ'     => 710,
		'clubs'    => 9827,
		'cong'     => 8773,
		'copy'     => 169,
		'crarr'    => 8629,
		'cup'      => 8746,
		'curren'   => 164,
		'dagger'   => 8224,
		'Dagger'   => 8225,
		'darr'     => 8595,
		'dArr'     => 8659,
		'deg'      => 176,
		'Delta'    => 916,
		'delta'    => 948,
		'diams'    => 9830,
		'divide'   => 247,
		'Eacute'   => 201,
		'eacute'   => 233,
		'Ecirc'    => 202,
		'ecirc'    => 234,
		'Egrave'   => 200,
		'egrave'   => 232,
		'empty'    => 8709,
		'emsp'     => 8195,
		'ensp'     => 8194,
		'Epsilon'  => 917,
		'epsilon'  => 949,
		'equiv'    => 8801,
		'Eta'      => 919,
		'eta'      => 951,
		'ETH'      => 208,
		'eth'      => 240,
		'Euml'     => 203,
		'euml'     => 235,
		'euro'     => 8364,
		'exist'    => 8707,
		'fnof'     => 402,
		'forall'   => 8704,
		'frac12'   => 189,
		'frac14'   => 188,
		'frac34'   => 190,
		'frasl'    => 8260,
		'Gamma'    => 915,
		'gamma'    => 947,
		'ge'       => 8805,
		'gt'       => 62,
		'harr'     => 8596,
		'hArr'     => 8660,
		'hearts'   => 9829,
		'hellip'   => 8230,
		'Iacute'   => 205,
		'iacute'   => 237,
		'Icirc'    => 206,
		'icirc'    => 238,
		'iexcl'    => 161,
		'Igrave'   => 204,
		'igrave'   => 236,
		'image'    => 8465,
		'infin'    => 8734,
		'int'      => 8747,
		'Iota'     => 921,
		'iota'     => 953,
		'iquest'   => 191,
		'isin'     => 8712,
		'Iuml'     => 207,
		'iuml'     => 239,
		'Kappa'    => 922,
		'kappa'    => 954,
		'Lambda'   => 923,
		'lambda'   => 955,
		'lang'     => 9001,
		'laquo'    => 171,
		'larr'     => 8592,
		'lArr'     => 8656,
		'lceil'    => 8968,
		'ldquo'    => 8220,
		'le'       => 8804,
		'lfloor'   => 8970,
		'lowast'   => 8727,
		'loz'      => 9674,
		'lrm'      => 8206,
		'lsaquo'   => 8249,
		'lsquo'    => 8216,
		'lt'       => 60,
		'macr'     => 175,
		'mdash'    => 8212,
		'micro'    => 181,
		'middot'   => 183,
		'minus'    => 8722,
		'Mu'       => 924,
		'mu'       => 956,
		'nabla'    => 8711,
		'nbsp'     => 160,
		'ndash'    => 8211,
		'ne'       => 8800,
		'ni'       => 8715,
		'not'      => 172,
		'notin'    => 8713,
		'nsub'     => 8836,
		'Ntilde'   => 209,
		'ntilde'   => 241,
		'Nu'       => 925,
		'nu'       => 957,
		'Oacute'   => 211,
		'oacute'   => 243,
		'Ocirc'    => 212,
		'ocirc'    => 244,
		'OElig'    => 338,
		'oelig'    => 339,
		'Ograve'   => 210,
		'ograve'   => 242,
		'oline'    => 8254,
		'Omega'    => 937,
		'omega'    => 969,
		'Omicron'  => 927,
		'omicron'  => 959,
		'oplus'    => 8853,
		'or'       => 8744,
		'ordf'     => 170,
		'ordm'     => 186,
		'Oslash'   => 216,
		'oslash'   => 248,
		'Otilde'   => 213,
		'otilde'   => 245,
		'otimes'   => 8855,
		'Ouml'     => 214,
		'ouml'     => 246,
		'para'     => 182,
		'part'     => 8706,
		'permil'   => 8240,
		'perp'     => 8869,
		'Phi'      => 934,
		'phi'      => 966,
		'Pi'       => 928,
		'pi'       => 960,
		'piv'      => 982,
		'plusmn'   => 177,
		'pound'    => 163,
		'prime'    => 8242,
		'Prime'    => 8243,
		'prod'     => 8719,
		'prop'     => 8733,
		'Psi'      => 936,
		'psi'      => 968,
		'quot'     => 34,
		'radic'    => 8730,
		'rang'     => 9002,
		'raquo'    => 187,
		'rarr'     => 8594,
		'rArr'     => 8658,
		'rceil'    => 8969,
		'rdquo'    => 8221,
		'real'     => 8476,
		'reg'      => 174,
		'rfloor'   => 8971,
		'Rho'      => 929,
		'rho'      => 961,
		'rlm'      => 8207,
		'rsaquo'   => 8250,
		'rsquo'    => 8217,
		'sbquo'    => 8218,
		'Scaron'   => 352,
		'scaron'   => 353,
		'sdot'     => 8901,
		'sect'     => 167,
		'shy'      => 173,
		'Sigma'    => 931,
		'sigma'    => 963,
		'sigmaf'   => 962,
		'sim'      => 8764,
		'spades'   => 9824,
		'sub'      => 8834,
		'sube'     => 8838,
		'sum'      => 8721,
		'sup'      => 8835,
		'sup1'     => 185,
		'sup2'     => 178,
		'sup3'     => 179,
		'supe'     => 8839,
		'szlig'    => 223,
		'Tau'      => 932,
		'tau'      => 964,
		'there4'   => 8756,
		'Theta'    => 920,
		'theta'    => 952,
		'thetasym' => 977,
		'thinsp'   => 8201,
		'THORN'    => 222,
		'thorn'    => 254,
		'tilde'    => 732,
		'times'    => 215,
		'trade'    => 8482,
		'Uacute'   => 218,
		'uacute'   => 250,
		'uarr'     => 8593,
		'uArr'     => 8657,
		'Ucirc'    => 219,
		'ucirc'    => 251,
		'Ugrave'   => 217,
		'ugrave'   => 249,
		'uml'      => 168,
		'upsih'    => 978,
		'Upsilon'  => 933,
		'upsilon'  => 965,
		'Uuml'     => 220,
		'uuml'     => 252,
		'weierp'   => 8472,
		'Xi'       => 926,
		'xi'       => 958,
		'Yacute'   => 221,
		'yacute'   => 253,
		'yen'      => 165,
		'Yuml'     => 376,
		'yuml'     => 255,
		'Zeta'     => 918,
		'zeta'     => 950,
		'zwj'      => 8205,
		'zwnj'     => 8204 );

	/**
	 * Return the UTF-8 byte sequence for a given Unicode code point.
	 *
	 * Code points below 0x110000 are encoded as 1 to 4 bytes. No validity
	 * check is made here (surrogates are encoded like any other value); use
	 * validateCodepoint() or decodeChar() when the input is untrusted.
	 *
	 * @param int $codepoint
	 * @return string|int the encoded character; for a value of 0x110000 or
	 *  above the argument itself is returned unchanged
	 */
	public function codepointToUtf8( $codepoint ) {
		if ( $codepoint < 0x80 ) {
			return chr( $codepoint );
		}
		if ( $codepoint < 0x800 ) {
			return chr( ( $codepoint >> 6 & 0x3f ) | 0xc0 ) .
				chr( ( $codepoint & 0x3f ) | 0x80 );
		}
		if ( $codepoint < 0x10000 ) {
			return chr( ( $codepoint >> 12 & 0x0f ) | 0xe0 ) .
				chr( ( $codepoint >> 6 & 0x3f ) | 0x80 ) .
				chr( ( $codepoint & 0x3f ) | 0x80 );
		}
		if ( $codepoint < 0x110000 ) {
			return chr( ( $codepoint >> 18 & 0x07 ) | 0xf0 ) .
				chr( ( $codepoint >> 12 & 0x3f ) | 0x80 ) .
				chr( ( $codepoint >> 6 & 0x3f ) | 0x80 ) .
				chr( ( $codepoint & 0x3f ) | 0x80 );
		}

		# Beyond the Unicode range: nothing to encode, hand the value back.
		return $codepoint;
	}

	/**
	 * Decode any character references, numeric or named entities,
	 * in the text and return a UTF-8 string.
	 *
	 * @param string $text
	 * @return string the decoded text; if the regex engine reports an error
	 *  the input is returned unchanged
	 */
	public function decodeCharReferences( $text ) {
		$decoded = preg_replace_callback(
			MW_CHAR_REFS_REGEX,
			array( $this, 'decodeCharReferencesCallback' ),
			$text
		);

		# preg_replace_callback() yields null on a PCRE failure; callers
		# expect a string, so fall back to the undecoded input.
		return $decoded === null ? $text : $decoded;
	}

	/**
	 * Callback for decodeCharReferences(): turn one MW_CHAR_REFS_REGEX match
	 * into its replacement text.
	 *
	 * @param array $matches match array as passed by preg_replace_callback():
	 *  [1] entity name, [2] decimal digits, [3]/[4] hexadecimal body
	 *  (lowercase / uppercase "x" form), [5] lone ampersand
	 * @return string
	 */
	public function decodeCharReferencesCallback( $matches ) {
		if ( isset( $matches[1] ) && $matches[1] !== '' ) {
			return $this->decodeEntity( $matches[1] );
		}
		if ( isset( $matches[2] ) && $matches[2] !== '' ) {
			return $this->decodeChar( intval( $matches[2] ) );
		}
		foreach ( array( 3, 4 ) as $group ) {
			if ( isset( $matches[$group] ) && $matches[$group] !== '' ) {
				$hex = $matches[$group];
				# The regex lets any letter through. hexdec() would silently
				# skip the bad ones (so "4g1" would decode as 0x41), hence a
				# malformed reference is left exactly as it was written.
				if ( strspn( $hex, '0123456789abcdefABCDEF' ) !== strlen( $hex ) ) {
					return $matches[0];
				}
				return $this->decodeChar( hexdec( $hex ) );
			}
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Return UTF-8 string for a codepoint if that is a valid
	 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
	 *
	 * @param int $codepoint
	 * @return string
	 */
	public function decodeChar( $codepoint ) {
		if ( $this->validateCodepoint( $codepoint ) ) {
			return $this->codepointToUtf8( $codepoint );
		}
		return UTF8_REPLACEMENT;
	}

	/**
	 * If the named entity is listed in $htmlEntities (HTML 4.01), return
	 * the UTF-8 encoding of that character. Otherwise, return the
	 * pseudo-entity source (eg &foo;).
	 *
	 * @param string $name entity name without the leading "&" and trailing ";"
	 * @return string
	 */
	public function decodeEntity( $name ) {
		# Must read the htmlEntities property itself; "$this->$htmlEntities"
		# would be a variable-variable lookup on an undefined local.
		if ( isset( $this->htmlEntities[$name] ) ) {
			return $this->codepointToUtf8( $this->htmlEntities[$name] );
		}
		return "&$name;";
	}

	/**
	 * Returns true if a given Unicode codepoint is a valid character in XML.
	 *
	 * Accepted: tab, line feed, carriage return, 0x20-0xD7FF, 0xE000-0xFFFD
	 * and 0x10000-0x10FFFF.
	 *
	 * @param int $codepoint
	 * @return bool
	 */
	public function validateCodepoint( $codepoint ) {
		return ( $codepoint == 0x09 )
			|| ( $codepoint == 0x0a )
			|| ( $codepoint == 0x0d )
			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Heuristic for detecting files that *could* contain JavaScript
	 * instructions or things that may look like HTML to a browser and are
	 * thus potentially harmful. The present implementation will produce
	 * false positives in some situations.
	 *
	 * Only the first 1024 bytes of the file are inspected.
	 *
	 * @param string $file Pathname to the file
	 * @return bool true if the file contains something looking like embedded
	 *  scripts; false otherwise, including when the file is empty or cannot
	 *  be opened or read
	 */
	public function detectScript( $file ) {

		# For binary files, just check the first K.
		$fp = fopen( $file, 'rb' );
		if ( $fp === false ) {
			# fopen() has already raised its warning; there is nothing to scan.
			return false;
		}
		$chunk = fread( $fp, 1024 );
		fclose( $fp );

		if ( $chunk === false || $chunk === '' ) {
			return false;
		}

		# Decode from UTF-16 if needed (could be used for obfuscation).
		$bom = substr( $chunk, 0, 2 );
		if ( $bom == "\xfe\xff" ) {
			$enc = "UTF-16BE";
		} elseif ( $bom == "\xff\xfe" ) {
			$enc = "UTF-16LE";
		} else {
			$enc = NULL;
		}

		if ( $enc ) {
			# Best effort: keep the raw bytes when the conversion fails.
			$chunk_tmp = @iconv( $enc, "ASCII//IGNORE", $chunk );
			if ( $chunk_tmp ) {
				$chunk = $chunk_tmp;
			}
		}

		# Lower-case once the text is in its final encoding, so every
		# comparison below can use lowercase needles.
		$chunk = trim( strtolower( $chunk ) );

		# Check for HTML doctype (preg_match replaces eregi(), which no
		# longer exists as of PHP 7).
		if ( preg_match( '/<!DOCTYPE *X?HTML/i', $chunk ) ) {
			return true;
		}

		/**
		* Internet Explorer for Windows performs some really stupid file type
		* autodetection which can cause it to interpret valid image files as HTML
		* and potentially execute JavaScript, creating a cross-site scripting
		* attack vector.
		*
		* Apple's Safari browser also performs some unsafe file type autodetection
		* which can cause legitimate files to be interpreted as HTML if the
		* web server is not correctly configured to send the right content-type
		* (or if you're really uploading plain text and octet streams!)
		*
		* Returns true if IE is likely to mistake the given file for HTML.
		* Also returns true if Safari would mistake the given file for HTML
		* when served with a generic content-type.
		*/

		$tags = array(
			'<body',
			'<head',
			'<html',   #also in safari
			'<img',
			'<pre',
			'<script', #also in safari
			'<table',
			'<title'
		);

		foreach ( $tags as $tag ) {
			if ( false !== strpos( $chunk, $tag ) ) {
				return true;
			}
		}

		/*
		* Look for javascript
		*/

		# Resolve entity-refs to look at attributes. May be harsh on big files... cache result?
		$chunk = $this->decodeCharReferences( $chunk );

		# Look for script-types
		if ( preg_match( '!type\s*=\s*[\'"]?\s*(?:\w*/)?(?:ecma|java)!sim', $chunk ) ) {
			return true;
		}

		# Look for html-style script-urls
		if ( preg_match( '!(?:href|src|data)\s*=\s*[\'"]?\s*(?:ecma|java)script:!sim', $chunk ) ) {
			return true;
		}

		# Look for css-style script-urls
		if ( preg_match( '!url\s*\(\s*[\'"]?\s*(?:ecma|java)script:!sim', $chunk ) ) {
			return true;
		}

		return false;
	}
}
?>
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -