testhtmlparser.java

来自「cwbbs 云网论坛源码」· Java 代码 · 共 175 行

JAVA
175
字号
package cn.js.fan.test;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.filters.OrFilter;import org.htmlparser.nodes.TextNode;import org.htmlparser.tags.*;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.visitors.HtmlPage;import org.htmlparser.visitors.TextExtractingVisitor;public class TestHtmlParser {    public static void main(String[] args) throws Exception {        String aFile = "c:/rjcs.html";                String content = "大家好!<a href='ddd.jsp'>GOOD!</a><img src='http://www.cloudwe";                        test2(content);                                                                                    }        public static void test5(String resource) throws Exception {        Parser myParser = new Parser(resource);                myParser.setEncoding("GBK");        HtmlPage visitor = new HtmlPage(myParser);        myParser.visitAllNodesWith(visitor);        String textInPage = visitor.getTitle();        System.out.println(textInPage);    }        public static void test4(String content) throws Exception {        Parser myParser;        myParser = Parser.createParser(content, "GBK");        HtmlPage visitor = new HtmlPage(myParser);        myParser.visitAllNodesWith(visitor);        String textInPage = visitor.getTitle();        System.out.println(textInPage);    }        public static void test3(String content) throws Exception {        Parser myParser;        myParser = Parser.createParser(content, "GBK");        TextExtractingVisitor visitor = new TextExtractingVisitor();        myParser.visitAllNodesWith(visitor);        String textInPage = visitor.getExtractedText();        System.out.println(textInPage);    }        public static void test2(String content) throws ParserException 
{        Parser myParser;        NodeList nodeList = null;        myParser = Parser.createParser(content, "utf-8");        NodeFilter textFilter = new NodeClassFilter(TextNode.class);        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);        NodeFilter imgFilter = new NodeClassFilter(ImageTag.class);                        OrFilter lastFilter = new OrFilter();        lastFilter.setPredicates(new NodeFilter[] {textFilter, linkFilter, imgFilter});        nodeList = myParser.parse(lastFilter);        Node[] nodes = nodeList.toNodeArray();        for (int i = 0; i < nodes.length; i++) {            Node anode = (Node) nodes[i];            String line = "";            if (anode instanceof TextNode) {                TextNode textnode = (TextNode) anode;                                line = textnode.getText();            } else if (anode instanceof LinkTag) {                LinkTag linknode = (LinkTag) anode;                line = linknode.toHtml();                                                            } else if (anode instanceof AppletTag) {                AppletTag appletnode = (AppletTag) anode;                line = appletnode.getAppletClass() + " " +                       appletnode.getArchive();            } else if (anode instanceof ImageTag) {                ImageTag imagenode = (ImageTag) anode;                                line = imagenode.toHtml();                            }            if (isTrimEmpty(line))                continue;            System.out.println(line);        }    }        public static void test1(String content) throws ParserException {        Parser myParser;        Node[] nodes = null;        myParser = Parser.createParser(content, null);                nodes = (myParser.extractAllNodesThatMatch(new NodeClassFilter(TextNode.class))).                
toNodeArray();        for (int i = 0; i < nodes.length; i++) {            TextNode textnode = (TextNode) nodes[i];            String line = textnode.toPlainTextString().trim();            if (line.equals(""))                continue;            System.out.println(line);        }    }        public static String readTextFile(String sFileName, String sEncode) {        StringBuffer sbStr = new StringBuffer();        try {            File ff = new File(sFileName);            InputStreamReader read = new InputStreamReader(new FileInputStream(                    ff),                    sEncode);            BufferedReader ins = new BufferedReader(read);            String dataLine = "";            while (null != (dataLine = ins.readLine())) {                sbStr.append(dataLine);                sbStr.append("\r\n");            }            ins.close();        } catch (Exception e) {                    }        return sbStr.toString();    }        public static boolean isTrimEmpty(String astr) {        if ((null == astr) || (astr.length() == 0)) {            return true;        }        if (isBlank(astr.trim())) {            return true;        }        return false;    }        public static boolean isBlank(String astr) {        if ((null == astr) || (astr.length() == 0)) {            return true;        } else {            return false;        }    }}

⌨️ 快捷键说明

复制代码:Ctrl + C
搜索代码:Ctrl + F
全屏模式:F11
增大字号:Ctrl + =
减小字号:Ctrl + -
显示快捷键:?