tablecontentbyhtmlparse.java

来自「ajax lucene 部分源代码 HTMLParser.java Muil」· Java 代码 · 共 64 行

JAVA
64
字号
package test;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

/**
 * Looks up individual table cells in an HTML document using the Jericho HTML
 * parser ({@code net.htmlparser.jericho}).
 * <ol>
 * <li>A cell is addressed by a zero-based row index and column index.</li>
 * <li>Every table found in a document can be queried.</li>
 * <li>Nested tables are not handled specially and may need further work.</li>
 * </ol>
 *
 * @author Administrator
 */
public class TableContentByHtmlParse {

	/** Zero-based row index that {@link #test(String)} looks up in every table. */
	private static final int DEFAULT_ROW = 2;

	/** Zero-based column index that {@link #test(String)} looks up in every table. */
	private static final int DEFAULT_COL = 2;

	/**
	 * Returns the {@code TD} element at the given position inside
	 * {@code element}.
	 * <p>
	 * Rows are the {@code TR} elements found within {@code element}; columns
	 * are the {@code TD} elements found within the selected row. {@code TH}
	 * cells are not counted. The method only performs the lookup; it does not
	 * print anything.
	 *
	 * @param element
	 *            the element to search, typically a {@code TABLE}; may be
	 *            {@code null}
	 * @param rows
	 *            zero-based index of the row to select
	 * @param cols
	 *            zero-based index of the cell within that row
	 * @return the matching cell, or {@code null} if {@code element} is
	 *         {@code null} or either index is negative or out of range
	 */
	public Element getTableContent(Element element, int rows, int cols) {
		// Negative indices are treated like any other out-of-range request
		// instead of surfacing as an IndexOutOfBoundsException from List.get.
		if (element == null || rows < 0 || cols < 0) {
			return null;
		}

		List<Element> trList = element.getAllElements(HTMLElementName.TR);
		if (rows >= trList.size()) {
			return null;
		}

		List<Element> tdList = trList.get(rows).getAllElements(
				HTMLElementName.TD);
		if (cols >= tdList.size()) {
			return null;
		}

		return tdList.get(cols);
	}

	/**
	 * Fetches the page at {@code url} and prints, for every table in it, the
	 * content of the cell at row 2, column 2 (both zero-based).
	 *
	 * @param url
	 *            address of the HTML page to load
	 */
	public static void test(String url) {
		test(url, DEFAULT_ROW, DEFAULT_COL);
	}

	/**
	 * Fetches the page at {@code url} and prints, for every table in it, the
	 * content of the cell at the given position. Tables that have no such
	 * cell are skipped. If the page cannot be loaded, a message is written to
	 * standard error and nothing else is printed.
	 *
	 * @param url
	 *            address of the HTML page to load
	 * @param row
	 *            zero-based row index to look up in each table
	 * @param col
	 *            zero-based column index to look up in each table
	 */
	public static void test(String url, int row, int col) {
		Source source;
		try {
			source = new Source(new URL(url));
		} catch (MalformedURLException e) {
			// Without a parsed document there is nothing to search; stop here
			// rather than dereferencing a null Source below.
			System.err.println("Invalid URL '" + url + "': " + e.getMessage());
			return;
		} catch (IOException e) {
			System.err.println("Could not read '" + url + "': "
					+ e.getMessage());
			return;
		}

		// The lookup keeps no state, so one instance serves every table.
		TableContentByHtmlParse parser = new TableContentByHtmlParse();
		for (Element table : source.getAllElements(HTMLElementName.TABLE)) {
			Element cell = parser.getTableContent(table, row, col);
			if (cell != null) {
				System.out.println(cell.getContent().toString());
			}
		}
	}

	/**
	 * Runs the demo against a fixed sample page.
	 *
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		test("http://www.akae.cn/study/resourceShareC.html");
	}

}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?