📄 ph5p.php
字号:
}
}
private function doctypeNameState() {
    // Move the cursor forward one position and read the character there.
    $this->char++;
    $current = $this->char();

    switch (true) {
        // Tab, LF, VT, FF or space: the name is finished.
        case (bool) preg_match('/^[\t\n\x0b\x0c ]$/', $current):
            $this->state = 'AfterDoctypeName';
            break;

        // '>' closes the DOCTYPE: hand the token over and resume normal data.
        case $current === '>':
            $this->emitToken($this->token);
            $this->state = 'data';
            break;

        // Lowercase ASCII letters are stored in uppercase.
        case (bool) preg_match('/^[a-z]$/', $current):
            $this->token['name'] .= strtoupper($current);
            break;

        // Cursor reached $this->EOF: emit what we have, step the cursor
        // back by one and let the data state deal with the end of input.
        case $this->char === $this->EOF:
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        // Any other character is appended to the name unchanged.
        default:
            $this->token['name'] .= $current;
    }

    // The token is flagged as being in error unless its name is exactly
    // "HTML". This runs on every pass, including after a token was emitted.
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
private function afterDoctypeNameState() {
    // Move the cursor forward one position and read the character there.
    $this->char++;
    $current = $this->char();

    // Tab, LF, VT, FF or space: nothing to do, remain in the
    // after-DOCTYPE-name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    $closed = ($current === '>');
    if ($closed || $this->char === $this->EOF) {
        // Either the DOCTYPE was closed or the cursor reached $this->EOF:
        // emit the token and go back to the data state.
        $this->emitToken($this->token);
        if (!$closed) {
            // End of input: step the cursor back by one so the data state
            // sees the end of input itself.
            $this->char--;
        }
        $this->state = 'data';
        return;
    }

    // Anything else after the name marks the DOCTYPE as being in error;
    // the remainder is handled by the bogus DOCTYPE state.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
private function bogusDoctypeState() {
    // Move the cursor forward one position and inspect the character there.
    $this->char++;
    $closed = ($this->char() === '>');
    $at_end = !$closed && $this->char === $this->EOF;

    // Every other character is skipped: stay in the bogus DOCTYPE state.
    if (!$closed && !$at_end) {
        return;
    }

    // '>' or end of input finishes the DOCTYPE: emit it and return to data.
    $this->emitToken($this->token);
    if ($at_end) {
        // Step the cursor back by one so the data state sees the end of
        // input itself.
        $this->char--;
    }
    $this->state = 'data';
}
/**
 * Tries to consume a character reference ("entity") whose ampersand is at
 * the current cursor position ($this->char). Used for entities in text and
 * in attribute values.
 *
 * On success the cursor is left on the last character that was consumed
 * (the trailing semicolon when one is present) and the decoded text is
 * returned. On failure nothing is consumed: the cursor is restored to the
 * ampersand and false is returned.
 *
 * @return string|false Decoded replacement text, or false if no entity
 *                      could be matched.
 */
private function entity() {
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the character immediately after the
    // U+0026 AMPERSAND.
    if ($this->character($start + 1) === '#') {
        // Numeric reference. The character AFTER the number sign (two
        // positions past the ampersand) selects the digit range: 'x' or 'X'
        // means hexadecimal (0-9, A-F, a-f), anything else means decimal.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');
        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';

        // First digit sits after "&#" (decimal) or "&#x" (hexadecimal).
        $digits_at = $start + ($is_hex ? 3 : 2);
        $digits = $this->characters($char_class, $digits_at);

        // characters() is defined outside this block, so its result is
        // validated here rather than trusted when nothing matches.
        if (is_string($digits)
            && preg_match('/^[' . $char_class . ']+\z/', $digits)) {
            $entity = ($is_hex ? '#x' : '#') . $digits;

            // Consume the digits, leaving the cursor on the last one...
            $this->char = $digits_at + strlen($digits) - 1;

            // ...and swallow the terminating semicolon if there is one.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }
    } else {
        // Named reference. Take the run of name characters that follows
        // the ampersand and look for the shortest prefix that is listed in
        // $this->entities, advancing the cursor one character per prefix
        // tried.
        $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
        $len = strlen($e_name);

        for ($c = 1; $c <= $len; $c++) {
            $id = substr($e_name, 0, $c);
            $this->char++;

            if (in_array($id, $this->entities, true)) {
                // A name matched without its semicolon: consume the
                // semicolon too when it directly follows.
                if ($e_name[$c - 1] !== ';'
                    && $c < $len
                    && $e_name[$c] === ';') {
                    $this->char++;
                }

                $entity = $id;
                break;
            }
        }
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // A matched name may already end in ';' (see the check above), so strip
    // it before adding exactly one; otherwise a stray ';' would follow the
    // decoded character.
    return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
private function emitToken($token) {
    // Forward the token to the tree constructor and look at its answer.
    $result = $this->tree->emitToken($token);

    // An integer answer is taken as the content model to use from now on.
    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    // Otherwise, an end tag puts the content model back to PCDATA.
    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
private function EOF() {
    // Clear the tokenizer state, then notify the tree constructor with an
    // end-of-file token.
    $this->state = null;

    $eof_token = array('type' => self::EOF);
    $this->tree->emitToken($eof_token);
}
}
class HTML5TreeConstructer {
// NOTE(review): only $phase, $mode and $dom are touched by the constructor
// and the emitToken() dispatcher of this class; the roles noted for the other
// members are inferred from their names — confirm against their usage.

// Publicly readable array, initially empty (presumably the stack of open
// elements — confirm).
public $stack = array();
// Current tree-construction phase; holds one of the *_PHASE constants and
// decides which handler emitToken() calls.
private $phase;
// Current insertion mode; the constructor sets it to BEFOR_HEAD.
private $mode;
// The DOMDocument that is being built.
private $dom;
// Presumably the foster parent element used for misplaced table content — TODO confirm.
private $foster_parent = null;
// Presumably the list of active formatting elements — TODO confirm.
private $a_formatting = array();
// Presumably references to the current <head> and <form> elements — TODO confirm.
private $head_pointer = null;
private $form_pointer = null;
// Tag names grouped by element category (see the SPECIAL / SCOPING /
// FORMATTING constants further down).
private $scoping = array('button','caption','html','marquee','object','table','td','th');
private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
private $special = array('address','area','base','basefont','bgsound',
'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
'h6','head','hr','iframe','image','img','input','isindex','li','link',
'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
'option','p','param','plaintext','pre','script','select','spacer','style',
'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
// The different phases (values stored in $phase).
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE = 3;
// The different insertion modes for the main phase (values stored in $mode).
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;
// The different types of elements.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;
// NOTE(review): MARKER has the same value (0) as SPECIAL, so the two cannot
// be told apart by value alone — confirm they are never compared against the
// same variable.
const MARKER = 0;
public function __construct() {
    // Prepare the target document: UTF-8, whitespace kept, entities
    // substituted, strict DOM error checking switched off.
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;
    $this->dom = $document;

    // Tree construction starts in the initial phase, with the insertion
    // mode set to BEFOR_HEAD.
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
// Process tag tokens
public function emitToken($token) {
    // Route the token to the handler of the current phase and pass the
    // handler's return value back to the caller.
    $phase = $this->phase;

    if ($phase == self::INIT_PHASE) {
        return $this->initPhase($token);
    }
    if ($phase == self::ROOT_PHASE) {
        return $this->rootElementPhase($token);
    }
    if ($phase == self::MAIN_PHASE) {
        return $this->mainPhase($token);
    }
    if ($phase == self::END_PHASE) {
        return $this->trailingEndPhase($token);
    }

    // Unknown phase: nothing is done and nothing is returned.
    return null;
}
/**
 * Handles a token while the tree constructor is in the initial phase.
 *
 * - A DOCTYPE token marked as being in error, a comment, a start tag, an end
 *   tag, an end-of-file token or a character token containing anything other
 *   than whitespace switches to the root element phase and is reprocessed
 *   there.
 * - A DOCTYPE token marked as correct appends a DocumentType node to the
 *   document and switches to the root element phase.
 * - A whitespace-only character token is appended to the document as text.
 *
 * @param array $token Token emitted by the tokenisation stage.
 * @return mixed Whatever rootElementPhase() returns when the token is
 *               reprocessed there; null otherwise.
 */
private function initPhase($token) {
    // U+0009 TAB, U+000A LF, U+000B VT, U+000C FF and U+0020 SPACE only.
    $whitespace_only = '/^[\t\n\x0b\x0c ]+$/';

    if((isset($token['error']) && $token['error']) ||
    $token['type'] === HTML5::COMMENT ||
    $token['type'] === HTML5::STARTTAG ||
    $token['type'] === HTML5::ENDTAG ||
    $token['type'] === HTML5::EOF ||
    ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
    !preg_match($whitespace_only, $token['data']))) {
        /* This specification does not define how to handle this case. In
        particular, user agents may ignore the entirety of this specification
        altogether for such documents, and instead invoke special parse modes
        with a greater emphasis on backwards compatibility. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);

    /* A DOCTYPE token marked as being correct */
    } elseif(isset($token['error']) && !$token['error']) {
        /* Append a DocumentType node to the Document node, with the name
        attribute set to the name given in the DOCTYPE token (which will be
        "HTML"), and the other attributes specific to DocumentType objects
        set to null, empty lists, or the empty string as appropriate. */
        // A DocumentType node has to come from DOMImplementation; it must
        // also actually be appended, otherwise the DOCTYPE is lost.
        $implementation = new DOMImplementation();
        $doctype = $implementation->createDocumentType('HTML', '', '');
        $this->dom->appendChild($doctype);

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif(isset($token['data']) &&
    preg_match($whitespace_only, $token['data'])) {
        /* Append that character to the Document node. */
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
private function rootElementPhase($token) {
/* After the initial phase, as each token is emitted from the tokenisation
stage, it must be processed as described in this section. */
/* A DOCTYPE token */
if($token['type'] === HTML5::DOCTYPE) {
// Parse error. Ignore the token.
/* A comment token */
} elseif($token['type'] === HTML5::COMMENT) {
/* Append a Comment node to the Document object with the data
attribute set to the data given in the comment token. */
$comment = $this->dom->createComment($token['data']);
$this->dom->appendChild($comment);
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
or U+0020 SPACE */
} elseif($token['type'] === HTML5::CHARACTR &&
preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
/* Append that character to the Document node. */
$text = $this->dom->createTextNode($token['data']);
$this->dom->appendChild($text);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -