📄 htmltoken.cpp
字号:
/* This file is part of the KDE libraries Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*///-----------------------------------------------------------------------------//// KDE HTML Widget - Tokenizers//#ifdef HAVE_CONFIG_H#include "config.h"#endif#include "htmltoken.h"#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <string.h>#include <strings.h>#include <kcharsets.h>#include <kapp.h>// Include Java Script#include <jsexec.h>// Token buffers are allocated in units of TOKEN_BUFFER_SIZE bytes.#define TOKEN_BUFFER_SIZE (32*1024) - 1static const char *commentStart = "<!--";static const char *scriptEnd = "</script>";static const char *styleEnd = "</style>";enum quoteEnum { NO_QUOTE=0, SINGLE_QUOTE, DOUBLE_QUOTE };//-----------------------------------------------------------------------------const char *BlockingToken::tokenName(){ switch ( ttype ) { case Table: return "</table"; break; case FrameSet: return "</frameset"; break; case Script: return "</script"; break; case Cell: return "</cell"; break; } return "";}//-----------------------------------------------------------------------------HTMLTokenizer::HTMLTokenizer( KHTMLWidget *_widget ){ blocking.setAutoDelete( true ); jsEnvironment = 0L; widget = _widget; last = next = curr = 0; buffer = 0; scriptCode = 0;}void HTMLTokenizer::reset(){ while (!tokenBufferList.isEmpty()) { char *oldBuffer = (char *) tokenBufferList.take(0); delete [] oldBuffer; } last = next = curr = 0; tokenBufferSizeRemaining = 0; // No space allocated at all if ( buffer ) delete [] buffer; buffer = 0; if ( scriptCode ) delete [] scriptCode; scriptCode = 0;}void HTMLTokenizer::begin(){ reset(); blocking.clear(); size = 1000; buffer = new char[ 1024 ]; dest = buffer; tag = false; pending = NonePending; discard = NoneDiscard; pre = false; prePos = 0; script = false; style = false; skipLF = false; select = false; comment = false; textarea = false; startTag = false; tquote = NO_QUOTE; searchCount = 0; title = false; charEntity = false;}void HTMLTokenizer::addPending(){ if ( tag || select) { *dest++ = ' '; } else if ( textarea ) { if (pending == LFPending) *dest++ = '\n'; else *dest++ = ' '; } else if ( pre ) { int p; switch (pending) { case SpacePending: // Insert a non-breaking space *(unsigned char *)dest++ = 0xa0; prePos++; break; case LFPending: if ( dest > buffer ) { *dest = 0; appendToken( buffer, dest-buffer ); } dest = buffer; *dest = TAG_ESCAPE; *(dest+1) = '\n'; *(dest+2) = 0; appendToken( buffer, 2 ); dest = buffer; prePos = 0; break; case TabPending: p = TAB_SIZE - ( prePos % TAB_SIZE ); for ( int x = 0; x < p; x++ ) { *dest = ' '; dest++; } prePos += p; break; default: printf("Assertion failed: pending = %d\n", (int) pending); break; } } else { *dest++ = ' '; } pending = NonePending;}void HTMLTokenizer::write( const char *str ){ // If this pointer is not 0L then we allocated some memory to store HTML // code in. This may happen while parsing the <script> tag, since the output // of the java code is treated as HTML code. This means we have to modify // the HTML code on the fly by inserting new HTML stuff. // If this pointer is not null, one has to free the memory before leaving // this function. char *srcPtr = 0L; KCharsets *charsets=KApplication::getKApplication()->getCharsets(); if ( str == 0L || buffer == 0L ) return; const char *src = str; while ( *src != 0 ) { // do we need to enlarge the buffer? if ( (dest - buffer) > size ) { char *newbuf = new char [ size + 1024 + 20 ]; memcpy( newbuf, buffer, dest - buffer + 1 ); dest = newbuf + ( dest - buffer ); delete [] buffer; buffer = newbuf; size += 1024; } if (skipLF && (*src != '\n')) { skipLF = false; } if (skipLF) { src++; } else if ( comment ) { // Look for '-->' if (*src == '-') { if (searchCount < 2) // Watch out for '--->' searchCount++; } else if ((searchCount == 2) && (*src == '>')) { // We got a '-->' sequence comment = false; } else { searchCount = 0; } src++; } // We are inside of the <script> or <style> tag. Look for the end tag // which is either </script> or </style>, // otherwise print out every received character else if ( script || style ) { // Allocate memory to store the script. We will write maximal // 10 characers. if ( scriptCodeSize + 11 > scriptCodeMaxSize ) { char *newbuf = new char [ scriptCodeSize + 1024 ]; memcpy( newbuf, scriptCode, scriptCodeSize ); delete [] scriptCode; scriptCode = newbuf; scriptCodeMaxSize += 1024; } if ( ( *src == '>' ) && ( searchFor[ searchCount ] == '>')) { src++; scriptCode[ scriptCodeSize ] = 0; scriptCode[ scriptCodeSize + 1 ] = 0; if (script) { script = false; /* Parse scriptCode containing <script> info */ /* Not implemented */ } else { style = false; /* Parse scriptCode containing <style> info */ /* Not implemented */ } delete [] scriptCode; scriptCode = 0; } // Find out wether we see a </script> tag without looking at // any other then the current character, since further characters // may still be on their way thru the web! else if ( searchCount > 0 ) { if ( tolower(*src) == searchFor[ searchCount ] ) { searchBuffer[ searchCount ] = *src; searchCount++; src++; } // We were wrong => print all buffered characters and the current one; else { searchBuffer[ searchCount ] = 0; char *p = searchBuffer; while ( *p ) scriptCode[ scriptCodeSize++ ] = *p++; scriptCode[ scriptCodeSize++ ] = *src++; searchCount = 0; } } // Is this perhaps the start of the </script> or </style> tag? else if ( *src == '<' ) { searchCount = 1; searchBuffer[ 0 ] = '<'; src++; } else scriptCode[ scriptCodeSize++ ] = *src++; } else if (charEntity) { unsigned long entityValue = 0; QString res = 0; searchBuffer[ searchCount+1] = *src; searchBuffer[ searchCount+2] = '\0'; // Check for '�' sequence if (searchBuffer[2] == '#') { if ((searchCount > 1) && (!isdigit(*src)) && (searchBuffer[3] != 'x')) { // { searchBuffer[ searchCount+1] = '\0'; entityValue = strtoul( &(searchBuffer[3]), NULL, 10 ); charEntity = false; } if ((searchCount > 1) && (!isalnum(*src)) && (searchBuffer[3] == 'x')) { // ካ searchBuffer[ searchCount+1] = '\0'; entityValue = strtoul( &(searchBuffer[4]), NULL, 16 ); charEntity = false; } } else { // Check for &abc12 sequence if (!isalnum(*src)) { int len; charEntity = false; // check trailing char to be ";", but only if in a tag (David) if ((searchBuffer[searchCount+1] == ';') || (!tag)) { searchBuffer[ searchCount+1] = '\0'; res = charsets->convertTag(searchBuffer+1, len).copy(); if (len <= 0) { res = 0; } } } } // Mapping for MS-Windows Latin-1 extension. // These mappings do not address all the extended // character sets as defined by MS-Windows Latin-1 // extension. Rather it only deals with those that // are heavily used on web pages by MS-Windows based // tools. Some of the entities do not have corresponding // enteries under ISO-8859-1, hence are not mapped. Also // note that some of the mappings are close but not prefect // matches! (Dawit A) switch (entityValue) { case 139: entityValue = 60; break; case 145: entityValue = 96; break; case 146: entityValue = 39; break; // for the next 4 values mapping to a name doesn't work... // perhaps it's just my computer not having these chars... case 147: //strcpy(searchBuffer+2, "ldquo"); //searchCount = 6; //break; case 148: //strcpy(searchBuffer+2, "rdquo"); //searchCount = 6; entityValue = 34; break; case 150: //strcpy(searchBuffer+2, "ndash"); //searchCount = 6; //break; case 151: //strcpy(searchBuffer+2, "mdash"); //searchCount = 6; entityValue = 45; break; case 152: entityValue = 126; break; case 155: entityValue = 62; break; case 133: strcpy(searchBuffer+2, "hellip"); searchCount = 7; break; case 149: strcpy(searchBuffer+2, "bull"); searchCount = 5; break; case 153: strcpy(searchBuffer+2, "trade"); searchCount = 6; break; default:; } if (searchCount > 8) { // This sequence is too long.. we ignore it charEntity = false; memcpy(dest,searchBuffer+1, searchCount); dest += searchCount; // *dest++ = *src++; if ( pre ) prePos += searchCount; } else if (charEntity) { // Keep searching for end of character entity searchCount++; src++; } else { // We have a complete sequence if (res && (res.length() == 1)) { entityValue = *((unsigned char *)res.data()); } if ( ( (entityValue < 128) && (entityValue > 0) ) || (entityValue == 160) ) { // Just insert plain ascii *dest++ = (char) entityValue; if (pre) prePos++; if (*src == ';') src++; } else if (!entityValue && !res) { // ignore the sequence, add it to the buffer as plaintext memcpy(dest,searchBuffer+1, searchCount); dest += searchCount; if (pre) prePos += searchCount; } else if (!tag && !textarea && !select && !title) { // add current token first if (dest > buffer) { *dest=0; appendToken(buffer,dest-buffer); dest = buffer; } // add token with the amp-sequence for further conversion appendToken(searchBuffer, searchCount+1); dest = buffer; // Assume a width of 1 if (pre) prePos++; if (*src == ';') src++; } else if (res) { // insert the characters, assuming iso-8859-1 memcpy(dest, res.data(), res.length()); dest += res.length(); if (pre) prePos += res.length(); if (*src == ';') src++; } else if (entityValue > 0) { // insert the character, assuming iso-8859-1 *dest++ = (char) entityValue; if (pre) prePos++; if (*src == ';') src++; } searchCount = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -