📄 htmltokenizer.cpp
字号:
return state; // Finished parsing server include } src.advance(m_lineNumber); } return state;}HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state){ UChar oldchar = 0; while (!src.isEmpty()) { UChar chbegin = *src; if (chbegin == '\'') tquote = tquote == SingleQuote ? NoQuote : SingleQuote; else if (chbegin == '\"') tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; // Look for '?>' // Some crappy sites omit the "?" before it, so // we look for an unquoted '>' instead. (IE compatible) else if (chbegin == '>' && (!tquote || oldchar == '?')) { // We got a '?>' sequence state.setInProcessingInstruction(false); src.advancePastNonNewline(); state.setDiscardLF(true); return state; // Finished parsing comment! } src.advance(m_lineNumber); oldchar = chbegin; } return state;}HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state){ while (!src.isEmpty()) { UChar cc = *src; if (state.skipLF()) { state.setSkipLF(false); if (cc == '\n') { src.advancePastNewline(m_lineNumber); continue; } } // do we need to enlarge the buffer? checkBuffer(); if (cc == '\r') { state.setSkipLF(true); *m_dest++ = '\n'; } else *m_dest++ = cc; src.advance(m_lineNumber); } return state;}HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag){ if (start) { cBufferPos = 0; state.setEntityState(SearchEntity); EntityUnicodeValue = 0; } while(!src.isEmpty()) { UChar cc = *src; switch(state.entityState()) { case NoEntity: ASSERT(state.entityState() != NoEntity); return state; case SearchEntity: if (cc == '#') { m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); state.setEntityState(NumericSearch); } else state.setEntityState(EntityName); break; case NumericSearch: if (cc == 'x' || cc == 'X') { m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); state.setEntityState(Hexadecimal); } else if (cc >= '0' && cc <= '9') state.setEntityState(Decimal); else state.setEntityState(SearchSemicolon); break; case Hexadecimal: { int ll = min(src.length(), 10 - cBufferPos); while (ll--) { cc = *src; if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) { state.setEntityState(SearchSemicolon); break; } int digit; if (cc < 'A') digit = cc - '0'; else digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch EntityUnicodeValue = EntityUnicodeValue * 16 + digit; m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); } if (cBufferPos == 10) state.setEntityState(SearchSemicolon); break; } case Decimal: { int ll = min(src.length(), 9-cBufferPos); while(ll--) { cc = *src; if (!(cc >= '0' && cc <= '9')) { state.setEntityState(SearchSemicolon); break; } EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0'); m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); } if (cBufferPos == 9) state.setEntityState(SearchSemicolon); break; } case EntityName: { int ll = min(src.length(), 9-cBufferPos); while(ll--) { cc = *src; if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) { state.setEntityState(SearchSemicolon); break; } m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); } if (cBufferPos == 9) state.setEntityState(SearchSemicolon); if (state.entityState() == SearchSemicolon) { if(cBufferPos > 1) { // Since the maximum length of entity name is 9, // so a single char array which is allocated on // the stack, its length is 10, should be OK. // Also if we have an illegal character, we treat it // as illegal entity name. unsigned testedEntityNameLen = 0; char tmpEntityNameBuffer[10]; ASSERT(cBufferPos < 10); for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) { if (m_cBuffer[testedEntityNameLen] > 0x7e) break; tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen]; } const Entity *e; if (testedEntityNameLen == cBufferPos) e = findEntity(tmpEntityNameBuffer, cBufferPos); else e = 0; if(e) EntityUnicodeValue = e->code; // be IE compatible if(parsingTag && EntityUnicodeValue > 255 && *src != ';') EntityUnicodeValue = 0; } } else break; } case SearchSemicolon: // Don't allow values that are more than 21 bits. if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) { if (!inViewSourceMode()) { if (*src == ';') src.advancePastNonNewline(); if (EntityUnicodeValue <= 0xFFFF) { checkBuffer(); src.push(fixUpChar(EntityUnicodeValue)); } else { // Convert to UTF-16, using surrogate code points. checkBuffer(2); src.push(U16_LEAD(EntityUnicodeValue)); src.push(U16_TRAIL(EntityUnicodeValue)); } } else { // FIXME: We should eventually colorize entities by sending them as a special token. checkBuffer(11); *dest++ = '&'; for (unsigned i = 0; i < cBufferPos; i++) dest[i] = m_cBuffer[i]; dest += cBufferPos; if (*src == ';') { *dest++ = ';'; src.advancePastNonNewline(); } } } else { checkBuffer(10); // ignore the sequence, add it to the buffer as plaintext *dest++ = '&'; for (unsigned i = 0; i < cBufferPos; i++) dest[i] = m_cBuffer[i]; dest += cBufferPos; } state.setEntityState(NoEntity); return state; } } return state;}HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state){ ASSERT(state.inDoctype()); while (!src.isEmpty() && state.inDoctype()) { UChar c = *src; bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' '; switch (m_doctypeToken.state()) { case DoctypeBegin: { m_doctypeToken.setState(DoctypeBeforeName); if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeBeforeName: { if (c == '>') { // Malformed. Just exit. src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeName); break; } case DoctypeName: { if (c == '>') { // Valid doctype. Emit it. src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (isWhitespace) { m_doctypeSearchCount = 0; // Used now to scan for PUBLIC m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM m_doctypeToken.setState(DoctypeAfterName); src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else { src.advancePastNonNewline(); m_doctypeToken.m_name.append(c); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeAfterName: { if (c == '>') { // Valid doctype. Emit it. src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (!isWhitespace) { src.advancePastNonNewline(); if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) { m_doctypeSearchCount++; if (m_doctypeSearchCount == 6) // Found 'PUBLIC' sequence m_doctypeToken.setState(DoctypeBeforePublicID); } else if (m_doctypeSearchCount > 0) { m_doctypeSearchCount = 0; m_doctypeToken.setState(DoctypeBogus); } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) { m_doctypeSecondarySearchCount++; if (m_doctypeSecondarySearchCount == 6) // Found 'SYSTEM' sequence m_doctypeToken.setState(DoctypeBeforeSystemID); } else { m_doctypeSecondarySearchCount = 0; m_doctypeToken.setState(DoctypeBogus); } if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else { src.advance(m_lineNumber); // Whitespace keeps us in the after name state. if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeBeforePublicID: { if (c == '\"' || c == '\'') { tquote = c == '\"' ? DoubleQuote : SingleQuote; m_doctypeToken.setState(DoctypePublicID); src.advancePastNonNewline(); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Considered bogus. Don't process the doctype. src.advancePastNonNewline(); state.setInDoctype(false);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -