jtidyhtmlhandler.java

来自「一个很好的开源项目管理系统源代码」· Java 代码 · 共 130 行

JAVA

130 行

package net.java.workeffort.searchengine;import java.io.File;import java.io.FileInputStream;import java.io.InputStream;import org.apache.lucene.document.Field;import org.w3c.dom.Element;import org.w3c.dom.NamedNodeMap;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.w3c.dom.Text;import org.w3c.tidy.Tidy;/** * Handler for HTML. Specific to the application user guide. * @author Antony Joseph */public class JTidyHTMLHandler implements IDocumentHandler {    public org.apache.lucene.document.Document getDocument(InputStream is)            throws DocumentHandlerException {        Tidy tidy = new Tidy();        tidy.setQuiet(true);        tidy.setShowWarnings(false);        org.w3c.dom.Document root = tidy.parseDOM(is, null);        Element rawDoc = root.getDocumentElement();        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();        String title = getTitle(rawDoc);        String body = getBody(rawDoc);        if ((title != null) && (!title.equals(""))) {            doc.add(Field.Text("title", title));        }        if ((body != null) && (!body.equals(""))) {            doc.add(Field.Text("body", body));        }        return doc;    }    /**     * Gets the title text of the HTML document.     * @param rawDoc the DOM Element to extract title Node from     * @return the title text     */    protected String getTitle(Element rawDoc) {        if (rawDoc == null) {            return null;        }        String title = "";        NodeList children = rawDoc.getElementsByTagName("title");        if (children.getLength() > 0) {            Element titleElement = ((Element) children.item(0));            Text text = (Text) titleElement.getFirstChild();            if (text != null) {                title = text.getData();            }        }        return title;    }    /**     * Gets the body text of the HTML document.     
* @param rawDoc the DOM Element to extract body Node from     * @return the body text     */    protected String getBody(Element rawDoc) {        if (rawDoc == null) {            return null;        }        String body = "";        NodeList children = rawDoc.getElementsByTagName("body");        if (children.getLength() > 0) {            body = getText(children.item(0));        }        return body;    }    /**     * Extracts text from the DOM node. This is specific to the user guide Skip     * elements with attributes class equal to "navheader", "navfooter"     * @param node a DOM node     * @return the text value of the node     */    protected String getText(Node node) {        if ("div".equals(node.getNodeName())) {            NamedNodeMap attributes = node.getAttributes();            if (attributes != null && attributes.getNamedItem("class") != null) {                String attr = attributes.getNamedItem("class").getNodeValue();                if ("navheader".equals(attr) || "navfooter".equals(attr)                        || "toc".equals(attr))                    return "";                // else                //   System.out                //          .println("attributes.getNamedItem(class).getNodeValue()="                //                  + attributes.getNamedItem("class")                //                          .getNodeValue());            }        }        NodeList children = node.getChildNodes();        StringBuffer sb = new StringBuffer();        for (int i = 0; i < children.getLength(); i++) {            Node child = children.item(i);            switch (child.getNodeType()) {            case Node.ELEMENT_NODE:                sb.append(getText(child));                sb.append(" ");                break;            case Node.TEXT_NODE:                sb.append(((Text) child).getData());                break;            }        }        return sb.toString();    }    public static void main(String args[]) throws Exception {        JTidyHTMLHandler 
handler = new JTidyHTMLHandler();        org.apache.lucene.document.Document doc = handler                .getDocument(new FileInputStream(new File(args[0])));        System.out.println(doc);    }}

jtidyhtmlhandler.java - 源码说明

本页面展示了「一个很好的开源项目管理系统源代码」中的 jtidyhtmlhandler.java 源码文件，采用 Java 编程语言编写，共 130 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与开源相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码：Ctrl + C

搜索代码：Ctrl + F

全屏模式：F11

增大字号：Ctrl + =

减小字号：Ctrl + -

显示快捷键：?