📄 htmltokenizer.cpp
字号:
{
// We got a '?>' sequence
processingInstruction = false;
++src;
discard=LFDiscard;
return; // Finished parsing comment!
}
++src;
oldchar = chbegin;
}
}
// Copies plain text from src into the output buffer (dest), one character
// per iteration, normalising line endings on the way: "\r", "\n" and "\r\n"
// each produce a single '\n' in the output.
void HTMLTokenizer::parseText(TokenizerString &src)
{
    for ( ; !src.isEmpty(); ++src) {
        // Grow the output buffer first if it is about to run out of room.
        RETURN_IF_OOM( checkBuffer() );

        // Latin-1 is sufficient here: the character is only compared
        // against ASCII line terminators.
        const unsigned char current = src->latin1();

        // A pending "skip the LF after a CR" request only ever applies to
        // the very next character, so it is consumed unconditionally.
        const bool lfSuppressed = skipLF;
        skipLF = false;
        if (lfSuppressed && current == '\n')
            continue; // second half of a "\r\n" pair: drop it

        if (current == '\n' || current == '\r') {
            // Remember a CR so that an immediately following LF is dropped.
            if (current == '\r')
                skipLF = true;
            *dest++ = '\n';
            continue;
        }

        // Ordinary character: copy it through, fixing it up in place.
        *dest = *src;
        fixUpChar(*dest);
        ++dest;
    }
}
// Incrementally parses a character entity reference; the caller has already
// consumed the leading '&'.
//
// The parse is a small state machine driven by the member `Entity`, so it can
// be suspended when src runs dry and resumed by a later call:
//   SearchEntity    -> '#' selects a numeric reference, anything else a name
//   NumericSearch   -> 'x'/'X' selects hexadecimal, a digit selects decimal
//   Hexadecimal / Decimal / EntityName
//                   -> accumulate characters into cBuffer (and the numeric
//                      value into EntityUnicodeValue)
//   SearchSemicolon -> emit the result and reset Entity to NoEntity
//
// Parameters:
//   src   - input stream. A successfully decoded character is pushed back
//           onto src (src.push) rather than written to dest.
//   dest  - output cursor; only written when the entity is not recognised,
//           in which case '&' plus the buffered characters are emitted as
//           plain text and dest is advanced past them.
//   start - true to begin a fresh entity (resets cBufferPos, Entity and
//           EntityUnicodeValue); false to resume a suspended parse.
void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        Entity = SearchEntity;
        EntityUnicodeValue = 0;
    }
    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            // Must never be entered without an entity in progress.
            assert(Entity != NoEntity);
            return;
        case SearchEntity:
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;
            break;
        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;
            break;
        case Hexadecimal:
        {
            // At most 8 hex digits are consumed.
            int ll = kMin(src.length(), 8);
            while(ll--) {
                QChar csrc(src->lower());
                cc = csrc.cell();
                // Stop at the first character that is not an ASCII hex digit.
                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            Entity = SearchSemicolon;
            break;
        }
        case Decimal:
        {
            // cBuffer is limited to 9 characters on this path.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();
                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            break;
        }
        case EntityName:
        {
            // Collect up to 9 ASCII alphanumeric characters of the name.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                QChar csrc = *src;
                cc = csrc.cell();
                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;
                    // be IE compatible
                    // src may be exhausted here (the 9-character limit can be
                    // hit on the last available character), so only look at
                    // the next character when there is one.
                    if(tag && EntityUnicodeValue > 255 && (src.isEmpty() || *src != ';'))
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
        }
        // Deliberate fall through: the name is complete, so finish the entity.
        case SearchSemicolon:
            //kdDebug( 6036 ) << "ENTITY " << EntityUnicodeValue << ", " << res << endl;
            // Don't allow surrogate code points, or values beyond U+10FFFF.
            // The surrogate-pair arithmetic below is only correct up to
            // U+10FFFF; larger values would yield a malformed pair.
            if ((EntityUnicodeValue > 0 && EntityUnicodeValue < 0xD800)
                || (EntityUnicodeValue >= 0xE000 && EntityUnicodeValue <= 0x10FFFF)) {
                // Swallow the optional terminating ';' (src can be empty when
                // this case is reached by falling through from EntityName).
                if (!src.isEmpty() && *src == ';')
                    ++src;
                if (EntityUnicodeValue <= 0xFFFF) {
                    QChar c(EntityUnicodeValue);
                    fixUpChar(c);
                    RETURN_IF_OOM( checkBuffer() );
                    src.push(c);
                } else {
                    // Convert to UTF-16, using surrogate code points.
                    QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                    QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                    RETURN_IF_OOM( checkBuffer(2) );
                    src.push(c1);
                    src.push(c2);
                }
            } else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                // NOTE(review): cBufferPos + 1 characters are written below,
                // which is up to 11 on the hexadecimal path ('#', 'x' and 8
                // digits). Confirm checkBuffer()'s default reservation covers
                // that.
                RETURN_IF_OOM( checkBuffer() );
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
                if (pre)
                    prePos += cBufferPos+1;
            }
            Entity = NoEntity;
            return;
        }
    }
}
void HTMLTokenizer::parseTag(TokenizerString &src)
{
assert(!Entity );
while ( !src.isEmpty() )
{
RETURN_IF_OOM( checkBuffer() );
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
uint l = 0;
while(l < src.length() && (*(src.current()+l)).latin1() != '>')
l++;
qDebug("src is now: *%s*, tquote: %d",
QConstString((QChar*)src.current(), l).string().latin1(), tquote);
#endif
switch(tag) {
case NoTag:
{
return;
}
case TagName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("TagName");
#endif
if (searchCount > 0)
{
if (*src == commentStart[searchCount])
{
searchCount++;
if (searchCount == 4)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Found comment" << endl;
#endif
// Found '<!--' sequence
++src;
dest = buffer; // ignore the previous part of this tag
comment = true;
tag = NoTag;
// Fix bug 34302 at kde.bugs.org. Go ahead and treat
// <!--> as a valid comment, since both mozilla and IE on windows
// can handle this case. Only do this in quirks mode. -dwh
if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
comment = false;
++src;
if (!src.isEmpty())
cBuffer[cBufferPos++] = src->cell();
}
else
parseComment(src);
return; // Finished parsing tag!
}
// cuts of high part, is okay
cBuffer[cBufferPos++] = src->cell();
++src;
break;
}
else
searchCount = 0; // Stop looking for '<!--' sequence
}
bool finish = false;
unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
ushort curchar = *src;
if(curchar <= ' ' || curchar == '>' ) {
finish = true;
break;
}
// Use tolower() instead of | 0x20 to lowercase the char because there is no
// performance gain in using | 0x20 since tolower() is optimized and
// | 0x20 turns characters such as '_' into junk.
cBuffer[cBufferPos++] = tolower(curchar);
++src;
}
// Disadvantage: we add the possible rest of the tag
// as attribute names. ### judge if this causes problems
if(finish || CBUFLEN == cBufferPos) {
bool beginTag;
char* ptr = cBuffer;
unsigned int len = cBufferPos;
cBuffer[cBufferPos] = '\0';
if ((cBufferPos > 0) && (*ptr == '/'))
{
// End Tag
beginTag = false;
ptr++;
len--;
}
else
// Start Tag
beginTag = true;
// Accept empty xml tags like <br/>. We trim off the "/" so that when we call
// getTagID, we'll look up "br" as the tag name and not "br/".
if(len > 1 && ptr[len-1] == '/' )
ptr[--len] = '\0';
// Look up the tagID for the specified tag name (now that we've shaved off any
// invalid / that might have followed the name).
unsigned short tagID = getTagID(ptr, len);
if (!tagID) {
DOMString tagName(ptr);
DocumentImpl *doc = parser->docPtr()->document();
if (doc->isValidName(tagName))
tagID = parser->docPtr()->document()->tagId(0, tagName.implementation(), false);
}
if (tagID) {
#ifdef TOKEN_DEBUG
QCString tmp(ptr, len+1);
kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
#endif
currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
}
dest = buffer;
tag = SearchAttribute;
cBufferPos = 0;
}
break;
}
case SearchAttribute:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchAttribute");
#endif
bool atespace = false;
ushort curchar;
while(!src.isEmpty()) {
curchar = *src;
// In this mode just ignore any quotes we encounter and treat them like spaces.
if (curchar > ' ' && curchar != '\'' && curchar != '"') {
if (curchar == '<' || curchar == '>')
tag = SearchEnd;
else
tag = AttributeName;
cBufferPos = 0;
break;
}
atespace = true;
++src;
}
break;
}
case AttributeName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("AttributeName");
#endif
ushort curchar;
int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
curchar = *src;
if(curchar <= '>') {
if(curchar <= ' ' || curchar == '=' || curchar == '>') {
unsigned int a;
cBuffer[cBufferPos] = '\0';
a = getAttrID(cBuffer, cBufferPos);
if (a)
attrNamePresent = true;
else {
attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
attrNamePresent = !attrName.isEmpty();
// This is a deliberate quirk to match Mozilla and Opera. We have to do this
// since sites that use the "standards-compliant" path sometimes send
// <script src="foo.js"/>. Both Moz and Opera will honor this, despite it
// being bogus HTML. They do not honor the "/" for other tags. This behavior
// also deviates from WinIE, but in this case we'll just copy Moz and Opera.
if (currToken.id == ID_SCRIPT && curchar == '>' &&
attrName == "/")
currToken.flat = true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -