首页 › 资源下载 › 中间件编程 › 测试工具 › 源码查看

jtidyhtmlparser.java

来自「测试工具」· Java 代码 · 共 224 行

JAVA

224 行

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package org.apache.jmeter.protocol.http.parser;

import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import org.apache.jorphan.logging.LoggingManager;
import org.apache.log.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import org.xml.sax.SAXException;

/**
 * HtmlParser implementation using JTidy.
 * 
 */
class JTidyHTMLParser extends HTMLParser {
    private static final Logger log = LoggingManager.getLoggerForClass();

	protected JTidyHTMLParser() {
		super();
	}

	protected boolean isReusable() {
		return true;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see org.apache.jmeter.protocol.http.parser.HTMLParser#getEmbeddedResourceURLs(byte[],
	 *      java.net.URL)
	 */
	public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls) throws HTMLParseException {
		Document dom = null;
		try {
			dom = (Document) getDOM(html);
		} catch (SAXException se) {
			throw new HTMLParseException(se);
		}

		// Now parse the DOM tree

		scanNodes(dom, urls, baseUrl);

		return urls.iterator();
	}

	/**
	 * Scan nodes recursively, looking for embedded resources
	 * 
	 * @param node -
	 *            initial node
	 * @param urls -
	 *            container for URLs
	 * @param baseUrl -
	 *            used to create absolute URLs
	 * 
	 * @return new base URL
	 */
	private URL scanNodes(Node node, URLCollection urls, URL baseUrl) throws HTMLParseException {
		if (node == null) {
			return baseUrl;
		}

		String name = node.getNodeName();

		int type = node.getNodeType();

		switch (type) {

		case Node.DOCUMENT_NODE:
			scanNodes(((Document) node).getDocumentElement(), urls, baseUrl);
			break;

		case Node.ELEMENT_NODE:

			NamedNodeMap attrs = node.getAttributes();
			if (name.equalsIgnoreCase(TAG_BASE)) {
				String tmp = getValue(attrs, ATT_HREF);
				if (tmp != null)
					try {
						baseUrl = new URL(baseUrl, tmp);
					} catch (MalformedURLException e) {
						throw new HTMLParseException(e);
					}
				break;
			}

			if (name.equalsIgnoreCase(TAG_IMAGE) || name.equalsIgnoreCase(TAG_EMBED)) {
				urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
				break;
			}

			if (name.equalsIgnoreCase(TAG_APPLET)) {
				urls.addURL(getValue(attrs, "code"), baseUrl);
				break;
			}
			if (name.equalsIgnoreCase(TAG_INPUT)) {
				String src = getValue(attrs, ATT_SRC);
				String typ = getValue(attrs, ATT_TYPE);
				if ((src != null) && (typ.equalsIgnoreCase(ATT_IS_IMAGE))) {
					urls.addURL(src, baseUrl);
				}
				break;
			}
			if (name.equalsIgnoreCase(TAG_LINK) && getValue(attrs, ATT_REL).equalsIgnoreCase(STYLESHEET)) {
				urls.addURL(getValue(attrs, ATT_HREF), baseUrl);
				break;
			}
			if (name.equalsIgnoreCase(TAG_SCRIPT)) {
				urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
				break;
			}
			if (name.equalsIgnoreCase(TAG_FRAME)) {
				urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
				break;
			}
			String back = getValue(attrs, ATT_BACKGROUND);
			if (back != null) {
				urls.addURL(back, baseUrl);
			}
			if (name.equalsIgnoreCase(TAG_BGSOUND)) {
				urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
				break;
			}

			String style = getValue(attrs, ATT_STYLE);
			if (style != null) {
            	HtmlParsingUtils.extractStyleURLs(baseUrl, urls, style);
			}

			NodeList children = node.getChildNodes();
			if (children != null) {
				int len = children.getLength();
				for (int i = 0; i < len; i++) {
					baseUrl = scanNodes(children.item(i), urls, baseUrl);
				}
			}

			break;

		// case Node.TEXT_NODE:
		// break;

		}

		return baseUrl;

	}

	/*
	 * Helper method to get an attribute value, if it exists @param attrs list
	 * of attributs @param attname attribute name @return
	 */
	private String getValue(NamedNodeMap attrs, String attname) {
		String v = null;
		Node n = attrs.getNamedItem(attname);
		if (n != null)
			v = n.getNodeValue();
		return v;
	}

	/**
	 * Returns <code>tidy</code> as HTML parser.
	 * 
	 * @return a <code>tidy</code> HTML parser
	 */
	private static Tidy getTidyParser() {
		log.debug("Start : getParser");
		Tidy tidy = new Tidy();
		tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
		tidy.setQuiet(true);
		tidy.setShowWarnings(false);
		if (log.isDebugEnabled()) {
			log.debug("getParser : tidy parser created - " + tidy);
		}
		log.debug("End   : getParser");
		return tidy;
	}

	/**
	 * Returns a node representing a whole xml given an xml document.
	 * 
	 * @param text
	 *            an xml document (as a byte array)
	 * @return a node representing a whole xml
	 * 
	 * @throws SAXException
	 *             indicates an error parsing the xml document
	 */
	private static Node getDOM(byte[] text) throws SAXException {
		log.debug("Start : getDOM");
		Node node = getTidyParser().parseDOM(new ByteArrayInputStream(text), null);
		if (log.isDebugEnabled()) {
			log.debug("node : " + node);
		}
		log.debug("End   : getDOM");
		return node;
	}
}

jtidyhtmlparser.java - 源码说明

本页面展示了「测试工具」中的 jtidyhtmlparser.java 源码文件，采用 Java 编程语言编写，共 224 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与测试工具相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?