testhtmlparser.java
来自「cwbbs 云网论坛源码」· Java 代码 · 共 175 行
JAVA
175 行
package cn.js.fan.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.*;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Small command-line experiments with the HTML Parser library: each
 * {@code testN} method parses some HTML and prints what it extracted to
 * standard output.
 */
public class TestHtmlParser {

    /**
     * Runs {@link #test2(String)} against a fixed HTML snippet.
     *
     * @param args ignored
     * @throws Exception if parsing fails
     */
    public static void main(String[] args) throws Exception {
        String content = "大家好!<a href='ddd.jsp'>GOOD!</a><img src='http://www.cloudwe";
        test2(content);
    }

    /**
     * Parses the given resource with the encoding set to GBK and prints the
     * title reported by an {@link HtmlPage} visitor.
     *
     * @param resource value handed to the {@code Parser(String)} constructor
     *                 (presumably a URL or file path -- confirm against the
     *                 library documentation)
     * @throws Exception if the resource cannot be opened or parsed
     */
    public static void test5(String resource) throws Exception {
        Parser myParser = new Parser(resource);
        myParser.setEncoding("GBK");

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        System.out.println(visitor.getTitle());
    }

    /**
     * Parses an in-memory HTML string (charset argument "GBK") and prints the
     * title reported by an {@link HtmlPage} visitor.
     *
     * @param content HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test4(String content) throws Exception {
        Parser myParser = Parser.createParser(content, "GBK");

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        System.out.println(visitor.getTitle());
    }

    /**
     * Parses an in-memory HTML string (charset argument "GBK") and prints the
     * text collected by a {@link TextExtractingVisitor}.
     *
     * @param content HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test3(String content) throws Exception {
        Parser myParser = Parser.createParser(content, "GBK");

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        myParser.visitAllNodesWith(visitor);

        System.out.println(visitor.getExtractedText());
    }

    /**
     * Parses an in-memory HTML string (charset argument "utf-8"), keeps the
     * nodes accepted by a text/link/image class filter and prints one line per
     * node, skipping lines that are empty after trimming.
     *
     * <p>Text nodes are printed via {@code getText()}, link and image tags via
     * {@code toHtml()}.
     *
     * @param content HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test2(String content) throws ParserException {
        Parser myParser = Parser.createParser(content, "utf-8");

        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeFilter imgFilter = new NodeClassFilter(ImageTag.class);

        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] {textFilter, linkFilter, imgFilter});

        NodeList nodeList = myParser.parse(lastFilter);
        Node[] nodes = nodeList.toNodeArray();

        for (int i = 0; i < nodes.length; i++) {
            Node anode = nodes[i];
            String line = "";

            if (anode instanceof TextNode) {
                TextNode textnode = (TextNode) anode;
                line = textnode.getText();
            } else if (anode instanceof LinkTag) {
                LinkTag linknode = (LinkTag) anode;
                line = linknode.toHtml();
            } else if (anode instanceof AppletTag) {
                // NOTE(review): the filter above has no AppletTag predicate, so
                // this branch looks unreachable as written -- confirm before
                // relying on it or add a matching filter.
                AppletTag appletnode = (AppletTag) anode;
                line = appletnode.getAppletClass() + " " + appletnode.getArchive();
            } else if (anode instanceof ImageTag) {
                ImageTag imagenode = (ImageTag) anode;
                line = imagenode.toHtml();
            }

            if (isTrimEmpty(line)) {
                continue;
            }
            System.out.println(line);
        }
    }

    /**
     * Parses an in-memory HTML string (charset argument {@code null}) and
     * prints the trimmed plain text of every {@link TextNode}, skipping empty
     * results.
     *
     * @param content HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test1(String content) throws ParserException {
        Parser myParser = Parser.createParser(content, null);
        NodeList textNodes =
                myParser.extractAllNodesThatMatch(new NodeClassFilter(TextNode.class));
        Node[] nodes = textNodes.toNodeArray();

        for (int i = 0; i < nodes.length; i++) {
            TextNode textnode = (TextNode) nodes[i];
            String line = textnode.toPlainTextString().trim();
            if (line.equals("")) {
                continue;
            }
            System.out.println(line);
        }
    }

    /**
     * Reads a text file line by line, joining the lines with {@code "\r\n"}
     * (a terminator is appended after every line, including the last).
     *
     * <p>This method is best-effort and never throws: on any failure it logs a
     * message to standard error and returns whatever was read up to that point
     * (an empty string if nothing was read).
     *
     * @param sFileName path of the file to read
     * @param sEncode   name of the charset used to decode the file
     * @return the file content read so far, never {@code null}
     */
    public static String readTextFile(String sFileName, String sEncode) {
        StringBuffer sbStr = new StringBuffer();
        FileInputStream in = null;
        try {
            // Opened separately so the stream can still be closed if the
            // reader constructor rejects the charset name.
            in = new FileInputStream(new File(sFileName));
            BufferedReader ins = new BufferedReader(new InputStreamReader(in, sEncode));

            String dataLine;
            while (null != (dataLine = ins.readLine())) {
                sbStr.append(dataLine);
                sbStr.append("\r\n");
            }
        } catch (Exception e) {
            // Broad catch kept on purpose: callers rely on this method never
            // throwing (null arguments included). Report instead of hiding it.
            System.err.println("readTextFile: failed to read " + sFileName + ": " + e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
        return sbStr.toString();
    }

    /**
     * Tells whether a string is {@code null}, empty, or empty after
     * {@link String#trim()}.
     *
     * @param astr string to test, may be {@code null}
     * @return {@code true} if there is no content left after trimming
     */
    public static boolean isTrimEmpty(String astr) {
        return astr == null || isBlank(astr.trim());
    }

    /**
     * Tells whether a string is {@code null} or has length zero. Whitespace is
     * NOT trimmed here; use {@link #isTrimEmpty(String)} for that.
     *
     * @param astr string to test, may be {@code null}
     * @return {@code true} if {@code astr} is {@code null} or {@code ""}
     */
    public static boolean isBlank(String astr) {
        return astr == null || astr.length() == 0;
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?