jtidyhtmlhandler.java
来自「一个很好的开源项目管理系统源代码」· Java 代码 · 共 130 行
JAVA
130 行
package net.java.workeffort.searchengine;import java.io.File;import java.io.FileInputStream;import java.io.InputStream;import org.apache.lucene.document.Field;import org.w3c.dom.Element;import org.w3c.dom.NamedNodeMap;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.w3c.dom.Text;import org.w3c.tidy.Tidy;/** * Handler for HTML. Specific to the application user guide. * @author Antony Joseph */public class JTidyHTMLHandler implements IDocumentHandler { public org.apache.lucene.document.Document getDocument(InputStream is) throws DocumentHandlerException { Tidy tidy = new Tidy(); tidy.setQuiet(true); tidy.setShowWarnings(false); org.w3c.dom.Document root = tidy.parseDOM(is, null); Element rawDoc = root.getDocumentElement(); org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); String title = getTitle(rawDoc); String body = getBody(rawDoc); if ((title != null) && (!title.equals(""))) { doc.add(Field.Text("title", title)); } if ((body != null) && (!body.equals(""))) { doc.add(Field.Text("body", body)); } return doc; } /** * Gets the title text of the HTML document. * @param rawDoc the DOM Element to extract title Node from * @return the title text */ protected String getTitle(Element rawDoc) { if (rawDoc == null) { return null; } String title = ""; NodeList children = rawDoc.getElementsByTagName("title"); if (children.getLength() > 0) { Element titleElement = ((Element) children.item(0)); Text text = (Text) titleElement.getFirstChild(); if (text != null) { title = text.getData(); } } return title; } /** * Gets the body text of the HTML document. * @param rawDoc the DOM Element to extract body Node from * @return the body text */ protected String getBody(Element rawDoc) { if (rawDoc == null) { return null; } String body = ""; NodeList children = rawDoc.getElementsByTagName("body"); if (children.getLength() > 0) { body = getText(children.item(0)); } return body; } /** * Extracts text from the DOM node. This is specific to the user guide Skip * elements with attributes class equal to "navheader", "navfooter" * @param node a DOM node * @return the text value of the node */ protected String getText(Node node) { if ("div".equals(node.getNodeName())) { NamedNodeMap attributes = node.getAttributes(); if (attributes != null && attributes.getNamedItem("class") != null) { String attr = attributes.getNamedItem("class").getNodeValue(); if ("navheader".equals(attr) || "navfooter".equals(attr) || "toc".equals(attr)) return ""; // else // System.out // .println("attributes.getNamedItem(class).getNodeValue()=" // + attributes.getNamedItem("class") // .getNodeValue()); } } NodeList children = node.getChildNodes(); StringBuffer sb = new StringBuffer(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); switch (child.getNodeType()) { case Node.ELEMENT_NODE: sb.append(getText(child)); sb.append(" "); break; case Node.TEXT_NODE: sb.append(((Text) child).getData()); break; } } return sb.toString(); } public static void main(String args[]) throws Exception { JTidyHTMLHandler handler = new JTidyHTMLHandler(); org.apache.lucene.document.Document doc = handler .getDocument(new FileInputStream(new File(args[0]))); System.out.println(doc); }}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?