⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmltokenizer.java

📁 html过滤 html过滤 html过滤
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
                } else {
                    content();
                }
            } else if (_isStyleContext) {
                if ( startsWith("</style") && (isWhitespace(_pos + 7) || isChar(_pos + 7, '>')) ) {
                    tagEnd();
                } else {
                    content();
                }
            } else {
                if ( startsWith("<!doctype") ) {
                	if ( !_isLateForDoctype ) {
                		doctype();
                		_isLateForDoctype = true;
                	} else {
                		ignore();
                	}
                } else if ( startsWith("</") && isIdentifierStartChar(_pos + 2) ) {
                	_isLateForDoctype = true;
                    tagEnd();
                } else if ( startsWith("<!--") ) {
                    comment();
                } else if ( startsWith("<") && isIdentifierStartChar(_pos + 1) ) {
                	_isLateForDoctype = true;
                    tagStart();
                } else {
                    content();
                }
            }
        }

        _reader.close();
    }

    /**
     * Handles an opening tag.
     * On entry the current position must be the "<" that is directly
     * followed by the tag's name.
     * @throws IOException
     */
    private void tagStart() throws IOException {
        // keep the "<" and step over it
        saveCurrent();
        go();

        if ( isAllRead() ) {
            return;
        }

        final String name = identifier();
        _currentTagToken = new TagNode(name);

        // identifier() sets _asExpected to false when no identifier starts here;
        // in that case whatever was saved so far is emitted as plain content
        if (!_asExpected) {
            addSavedAsContent();
            return;
        }

        skipWhitespaces();
        tagAttributes();

        // take the raw text snapshot before the token is handed over
        final String rawSource = _saved.toString();
        addToken(_currentTagToken);

        String closing = "";
        if ( isChar('>') ) {
            go();
            // the dispatching code treats input inside script/style differently
            if ( "script".equalsIgnoreCase(name) ) {
                _isScriptContext = true;
            } else if ( "style".equalsIgnoreCase(name) ) {
                _isStyleContext = true;
            }
            closing = ">";
        } else if ( startsWith("/>") ) {
            go(2);
            // a self-closed tag is immediately followed by its end tag token
            addToken( new EndTagToken(name) );
            closing = "/>";
        }

        _currentTagToken.setOriginalSource(rawSource + closing);
        _currentTagToken = null;
    }


    /**
     * Handles a closing tag.
     * On entry the current position must be the "<" that is directly
     * followed by "/" and the tag's name.
     * @throws IOException
     */
    private void tagEnd() throws IOException {
        // both calls cover the two-character "</" prefix
        saveCurrent(2);
        go(2);

        if ( isAllRead() ) {
            return;
        }

        final String name = identifier();
        _currentTagToken = new EndTagToken(name);

        // identifier() sets _asExpected to false when no identifier starts here
        if (!_asExpected) {
            addSavedAsContent();
            return;
        }

        skipWhitespaces();
        tagAttributes();

        // take the raw text snapshot before the token is handed over
        final String rawSource = _saved.toString();
        addToken(_currentTagToken);

        final boolean closed = isChar('>');
        if (closed) {
            go();
        }

        // leaving a script/style element ends the corresponding context,
        // whether or not the closing ">" was present
        if ( "script".equalsIgnoreCase(name) ) {
            _isScriptContext = false;
        } else if ( "style".equalsIgnoreCase(name) ) {
            _isStyleContext = false;
        }

        _currentTagToken.setOriginalSource(closed ? rawSource + ">" : rawSource);
        _currentTagToken = null;
    }

    /**
     * Reads an identifier beginning at the current position.
     * Sets _asExpected to true when an identifier starts here and to false
     * otherwise, so callers can check the flag after the call.
     * @return the identifier text, or null if the current character
     *         cannot start an identifier
     * @throws IOException
     */
    private String identifier() throws IOException {
        _asExpected = true;

        if ( isIdentifierStartChar() ) {
            StringBuffer name = new StringBuffer(16);

            // each accepted character is saved and appended before moving on
            for ( ; !isAllRead() && isIdentifierChar(); go() ) {
                saveCurrent();
                name.append( _working[_pos] );
            }

            return name.toString();
        }

        _asExpected = false;
        return null;
    }

    /**
     * Parses the attribute list of the tag being built, starting at the
     * current position, and adds every parsed attribute to _currentTagToken.
     * Parsing stops at ">" or "/>", at the end of the input, or as soon as
     * _asExpected is left false.
     * @throws IOException
     */
    private void tagAttributes() throws IOException {
        while( !isAllRead() && _asExpected && !isChar('>') && !startsWith("/>") ) {
            skipWhitespaces();
            final String name = identifier();

            if (_asExpected) {
                // an attribute without "=" takes its own name as its value
                String value = name;

                skipWhitespaces();
                if ( isChar('=') ) {
                    saveCurrent();
                    go();
                    value = attributeValue();
                }

                if (_asExpected) {
                    _currentTagToken.addAttribute(name, value);
                }
            } else {
                // no identifier here: step over one stray character,
                // unless it is one of the tag delimiters
                final boolean atDelimiter = isChar('<') || isChar('>') || startsWith("/>");
                if (!atDelimiter) {
                    saveCurrent();
                    go();
                }

                // carry on with the next attribute, except when a "<" follows:
                // then the flag stays false and the loop condition ends parsing
                if ( !isChar('<') ) {
                    _asExpected = true;
                }
            }
        }
    }

    /**
     * Parses an attribute value starting at the current position (leading
     * whitespace is skipped first). The value is expected to be in one of
     * the forms:
     * 		value
     * 		"value"
     * 		'value'
     * A quoted value runs up to the matching quote character; if that quote
     * never appears, it runs to the end of the input. An unquoted value runs
     * up to the next whitespace, ">" or "/>".
     * @return the value without its surrounding quotes, or an empty string
     *         if the current position is at "<", ">" or "/>"
     * @throws IOException
     */
    private String attributeValue() throws IOException {
        skipWhitespaces();
        
        // nothing that could be a value: the tag ends or a new one begins
        if ( isChar('<') || isChar('>') || startsWith("/>") ) {
        	return "";
        }

        boolean isQuoteMode = false;
        boolean isAposMode = false;

        StringBuffer result = new StringBuffer();

        // the opening quote is stepped over but not added to the result
        if ( isChar('\'') ) {
            isAposMode = true;
            saveCurrent();
            go();
        } else if ( isChar('\"') ) {
            isQuoteMode = true;
            saveCurrent();
            go();
        }

        // collect characters until the terminator of the active mode:
        // the matching quote in either quoted mode, or whitespace, ">"
        // or "/>" for an unquoted value
        while ( !isAllRead() &&
                ( (isAposMode && !isChar('\'')) ||
                  (isQuoteMode && !isChar('\"')) ||
                  (!isAposMode && !isQuoteMode && !isWhitespace() && !isChar('>') && !startsWith("/>"))
                )
              ) {
            result.append( _working[_pos] );
            saveCurrent();
            go();
        }

        // the closing quote, if present, is stepped over but not added to the result
        if ( isChar('\'') && isAposMode ) {
            saveCurrent();
            go();
        } else if ( isChar('\"') && isQuoteMode ) {
            saveCurrent();
            go();
        }


        return result.toString();
    }

    /**
     * Consumes plain content: saves and steps over characters until the
     * current character is "<" or the input is exhausted, then passes what
     * was saved to addSavedAsContent(). When input remains, at least one
     * character is consumed, even if it is itself a "<".
     * @throws IOException
     */
    private void content() throws IOException {
        boolean atTagStart = false;

        while ( !atTagStart && !isAllRead() ) {
            saveCurrent();
            go();
            atTagStart = isChar('<');
        }

        addSavedAsContent();
    }

    /**
     * Steps over input without saving it, until the current character is
     * "<" or the input is exhausted. When input remains, at least one
     * character is stepped over.
     * @throws IOException
     */
    private void ignore() throws IOException {
        boolean atTagStart = false;

        while ( !atTagStart && !isAllRead() ) {
            go();
            atTagStart = isChar('<');
        }
    }

    /**
     * Parses a comment. The first four characters at the current position
     * (the "<!--" opener matched by the caller) are stepped over, the text
     * up to "-->" or the end of the input is saved, and the terminating
     * "-->" is stepped over if present.
     * Unless the cleaner is configured to omit comments, a non-empty comment
     * is added as a CommentToken after every "--" pair, and a single leading
     * or trailing "-", has been replaced with the cleaner's hyphen
     * replacement. An empty comment produces no token.
     * @throws IOException
     */
    private void comment() throws IOException {
        go(4);
        while ( !isAllRead() && !startsWith("-->") ) {
            saveCurrent();
            go();
        }

        if (startsWith("-->")) {
            go(3);
        }

        if (_saved.length() > 0) {
            if ( !cleaner.isOmitComments() ) {
                String hyphenRepl = cleaner.getHyphenReplacementInComment();
                // Literal replacement on purpose: replaceAll() would treat "$" and "\"
                // in the configured replacement as regex replacement syntax, which
                // either throws or silently alters the output.
                String comment = _saved.toString().replace("--", hyphenRepl + hyphenRepl);

                // a single hyphen left at the very start of the comment text
                if ( comment.length() > 0 && comment.charAt(0) == '-' ) {
                    comment = hyphenRepl + comment.substring(1);
                }
                // a single hyphen left at the very end of the comment text
                int len = comment.length();
                if ( len > 0 && comment.charAt(len - 1) == '-' ) {
                    comment = comment.substring(0, len - 1) + hyphenRepl;
                }

                addToken( new CommentToken(comment) );
            }
            // the saved text is dropped whether or not a token was produced
            _saved.delete(0, _saved.length());
        }
    }
    
    /**
     * Parses a doctype declaration. The first nine characters at the current
     * position (the "<!doctype" prefix matched by the caller) are stepped
     * over, then four whitespace-separated parts are read: two identifiers
     * followed by two attribute-style values. The remaining input up to the
     * next "<" is ignored. The resulting DoctypeToken is passed to the
     * cleaner only if it reports itself as valid.
     * @throws IOException
     */
    private void doctype() throws IOException {
        go(9);

        final String[] parts = new String[4];
        for (int i = 0; i < parts.length; i++) {
            skipWhitespaces();
            // parts 1 and 2 are identifiers, parts 3 and 4 may be quoted values
            if (i < 2) {
                parts[i] = identifier();
            } else {
                parts[i] = attributeValue();
            }
        }

        ignore();

        DoctypeToken token = new DoctypeToken(parts[0], parts[1], parts[2], parts[3]);
        if ( token.isValid() ) {
            cleaner.setDoctype(token);
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -