📄 htmlwriter.java
字号:
/* * Copyright 2001 (C) MetaStuff, Ltd. All Rights Reserved. * * This software is open source. * See the bottom of this file for the licence. * * $Id: HTMLWriter.java,v 1.3 2003/07/07 10:30:29 per_nyfelt Exp $ */package org.dom4j.io;import org.dom4j.*;import org.xml.sax.SAXException;import java.io.*;import java.util.HashSet;import java.util.Iterator;import java.util.Set;/** <p><code>HTMLWriter</code> takes a DOM4J tree and formats it to a * stream as HTML. * This formatter is similar to XMLWriter but it outputs the text of CDATA * and Entity sections rather than the serialised format as in XML, * it has an XHTML mode, it retains whitespace in certain elements such as <PRE>, * and it supports certain elements which have no corresponding close tag such * as for <BR> and <P>. * * <p> The OutputFormat passed in to the constructor is checked for isXHTML() and isExpandEmptyElements(). * See {@link OutputFormat OutputFormat} for details. Here are the rules for * <b>this class</b> based on an OutputFormat, "format", passed in to the constructor:<br/><br/> * <ul> * <li>If an element is in {@link #getOmitElementCloseSet() getOmitElementCloseSet}, then it is treated specially:</li> * <ul> * <li>It never expands, since some browsers treat this as two separate Horizontal Rules: <HR></HR></li> * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, then it has a space before the closing single-tag slash, since Netscape 4.x- treats this: <HR /> as * an element named "HR" with an attribute named "/", but that's better than when it refuses to recognize this: <hr/> * which it thinks is an element named "HR/". </li> * </ul> * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, all elements must have * either a close element, or be a closed single tag.</li> * <li>If {@link org.dom4j.io.OutputFormat#isExpandEmptyElements() format.isExpandEmptyElements()}() is true, * all elements are expanded except as above.</li> * </ul> * <b>Examples</b> * * <table border="1" cellpadding="0" cellspacing="0"> * <tr> * <th colspan="3" align="left">isXHTML == true</th> * </tr> * <tr> * <td width="25"> </td> * <th align="left">isExpandEmptyElements == true</th> * <td><code> * <td></td><br /> * <br /><br /> * <foo></foo></code> * </td> * </tr> * <tr> * <td width="25"> </td> * <th align="left">isExpandEmptyElements == false</th> * <td><code> * <td/><br /> * <br /><br /> * <foo/></code> * </td> * </tr> * <tr> * <th colspan="3" align="left">isXHTML == false</th> * </tr> * <tr> * <td width="25"> </td> * <th align="left">isExpandEmptyElements == true</th> * <td><code> * <td></td><br /> * <br><br /> * <foo></foo></code> * </td> * </tr> * <tr> * <td width="25"> </td> * <th align="left">isExpandEmptyElements == false</th> * <td><code> * <td/><br /> * <br><br /> * <foo/></code> * </td> * </tr> * </table> * <p> * <p> * If isXHTML == true, CDATA sections look like this: * <PRE> * <b><myelement><![CDATA[My data]]></myelement></b> * </PRE> * Otherwise, they look like this: * <PRE> * <b><myelement>My data</myelement></b> * </PRE> * </p> * * Basically, {@link org.dom4j.io.OutputFormat#isXHTML() OutputFormat.isXHTML()} == true will produce valid XML, * while {@link org.dom4j.io.OutputFormat#isExpandEmptyElements() format.isExpandEmptyElements()} * determines whether empty elements are expanded * if isXHTML is true, excepting the special HTML single tags. * </p> * * * <p>Also, HTMLWriter handles tags whose contents should be preformatted, that is, whitespace-preserved. * By default, this set includes the tags <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>, case insensitively. * It does not include <IFRAME>. * Other tags, such as <CODE>, <KBD>, <TT>, <VAR>, are usually rendered in a different font in most browsers, * but don't preserve whitespace, so they also don't appear in the default list. HTML Comments * are always whitespace-preserved. However, the parser you use may store comments with linefeed-only * text nodes (\n) even if your platform uses another line.separator character, and HTMLWriter outputs * Comment nodes exactly as the DOM is set up by the parser. * See examples and discussion here: {@link #setPreformattedTags(java.util.Set) setPreformattedTags}</p> * * <p><b>Examples</b></p> * <blockquote> * <p><b>Pretty Printing</b></p> * <p>This example shows how to pretty print a string containing a valid HTML document to a string. * You can also just call the static methods of this class:<br/> * {@link #prettyPrintHTML(String) prettyPrintHTML(String)} * or<br/> * {@link #prettyPrintHTML(String,boolean,boolean,boolean,boolean) prettyPrintHTML(String,boolean,boolean,boolean,boolean)} * or, <br/> * {@link #prettyPrintXHTML(String) prettyPrintXHTML(String)} for XHTML (note the X) * </p> * <pre> * String testPrettyPrint(String html){ * StringWriter sw = new StringWriter(); * org.dom4j.io.OutputFormat format = org.dom4j.io.OutputFormat.createPrettyPrint(); * <font color='green'>//These are the default formats from createPrettyPrint, so you needn't set them:</font> * <font color='green'>// format.setNewlines(true);</font> * <font color='green'>// format.setTrimText(true);</font> * format.setXHTML(true); <font color='green'>//Default is false, this produces XHTML</font> * org.dom4j.io.HTMLWriter writer = new org.dom4j.io.HTMLWriter(sw, format); * org.dom4j.Document document = org.dom4j.O3DocumentHelper.parseText(html); * writer.write(document); * writer.flush(); * return sw.toString(); * } * </pre> * * <p>This example shows how to create a "squeezed" document, but one that will work in browsers * even if the browser line length is limited. No newlines are included, no extra whitespace * at all, except where it it required by {@link #setPreformattedTags(java.util.Set) setPreformattedTags}. * </p> * <pre> * String testCrunch(String html){ * StringWriter sw = new StringWriter(); * org.dom4j.io.OutputFormat format = org.dom4j.io.OutputFormat.createPrettyPrint(); * format.setNewlines(false); * format.setTrimText(true); * format.setIndent(""); * format.setXHTML(true); * format.setExpandEmptyElements(false); * format.setNewLineAfterNTags(20); <font color='green'>//print a line every so often.</font> * org.dom4j.io.HTMLWriter writer = new org.dom4j.io.HTMLWriter(sw, format); * org.dom4j.Document document = org.dom4j.O3DocumentHelper.parseText(html); * writer.write(document); * writer.flush(); * return sw.toString(); * } * </pre> * * </blockquote> * * </p> * * @author <a href="mailto:james.strachan@metastuff.com">James Strachan</a> (james.strachan@metastuff.com) * @author Laramie Crocker * @version $Revision: 1.3 $ */public class HTMLWriter extends XMLWriter { public HTMLWriter(Writer writer) { super( writer, defaultHtmlFormat ); } public HTMLWriter(Writer writer, OutputFormat format) { super( writer, format ); } public HTMLWriter() throws UnsupportedEncodingException { super( defaultHtmlFormat ); } public HTMLWriter(OutputFormat format) throws UnsupportedEncodingException { super( format ); } public HTMLWriter(OutputStream out) throws UnsupportedEncodingException { super( out, defaultHtmlFormat ); } public HTMLWriter(OutputStream out, OutputFormat format) throws UnsupportedEncodingException { super( out, format ); } //Allows us to the current state of the format in this struct on the m_formatStack. private class FormatState { public FormatState(boolean newLines, boolean trimText, String indent){ this.m_Newlines = newLines; this.m_TrimText = trimText; this.m_indent = indent; } private boolean m_Newlines = false; public boolean isNewlines(){return m_Newlines;} private boolean m_TrimText = false; public boolean isTrimText(){return m_TrimText;} private String m_indent = ""; public String getIndent(){return m_indent;} } private java.util.Stack m_formatStack = new java.util.Stack(); private static String m_lineSeparator = System.getProperty("line.separator"); private String m_lastText = ""; private int m_tagsOuput = 0; private int m_newLineAfterNTags = -1; //legal values are 0+, but -1 signifies lazy initialization. protected static final HashSet defaultPreformattedTags; static { //If you change this list, update the javadoc examples, above in the class javadoc, // in writeElement, and in setPreformattedTags(). defaultPreformattedTags = new HashSet(); defaultPreformattedTags.add("PRE"); defaultPreformattedTags.add("SCRIPT"); defaultPreformattedTags.add("STYLE"); defaultPreformattedTags.add("TEXTAREA"); } private HashSet preformattedTags = defaultPreformattedTags; protected static final OutputFormat defaultHtmlFormat; static { defaultHtmlFormat = new OutputFormat( " ", true ); defaultHtmlFormat.setTrimText( true ); defaultHtmlFormat.setSuppressDeclaration( true ); } /** Used to store the qualified element names which * should have no close element tag */ private HashSet omitElementCloseSet; //keep as a HashSet, but only show as a Set when asked for by getOmitElementCloseSet(). public void startCDATA() throws SAXException { } public void endCDATA() throws SAXException { } // Overloaded methods // laramiec 3/21/2002 added isXHTML() stuff so you get the CDATA brackets if you desire. protected void writeCDATA(String text) throws IOException { // XXX: Should we escape entities? // writer.write( escapeElementEntities( text ) ); if ( getOutputFormat().isXHTML() ) { super.writeCDATA(text); } else { writer.write( text ); } lastOutputNodeType = Node.CDATA_SECTION_NODE; } protected void writeEntity(Entity entity) throws IOException { writer.write(entity.getText()); lastOutputNodeType = Node.ENTITY_REFERENCE_NODE; } protected void writeDeclaration() throws IOException { } protected void writeString(String text) throws IOException { //DOM stores \n at the end of text nodes that are newlines. This is significant if // we are in a PRE section. However, we only want to output the system line.separator, not \n. // This is a little brittle, but this function appears to be called with these lineseparators // as a separate TEXT_NODE. If we are in a preformatted section, output the right line.separator, // otherwise ditch. If the single \n character is not the text, then do the super thing // to output the text. // Also, we store the last text that was not a \n since it may be used by writeElement in this class to // line up preformatted tags. if ( text.equals("\n")){ if ( ! m_formatStack.empty() ) { super.writeString(m_lineSeparator); } return; } m_lastText = text; if ( m_formatStack.empty() ) { super.writeString(text.trim()); } else { super.writeString(text); } } /** Overriden method to not close certain element names to avoid * wierd behaviour from browsers for versions up to 5.x */ protected void writeClose(String qualifiedName) throws IOException { if ( ! omitElementClose( qualifiedName ) ) { super.writeClose(qualifiedName); } } protected void writeEmptyElementClose(String qualifiedName) throws IOException { if (getOutputFormat().isXHTML()){ //xhtml, always check with format object whether to expand or not. if ( omitElementClose(qualifiedName) ) { // it was a special omit tag, do it the XHTML way: "<br/>", ignoring the expansion option, // since <br></br> is OK XML, but produces twice the linefeeds desired in the browser. // for netscape 4.7, though all are fine with it, write a space before the close slash. writer.write(" />"); } else { super.writeEmptyElementClose(qualifiedName); } } else { //html, not xhtml if ( omitElementClose(qualifiedName) ) { // it was a special omit tag, do it the old html way: "<br>". writer.write(">"); } else { // it was NOT a special omit tag, check with format object whether to expand or not. super.writeEmptyElementClose(qualifiedName); } } } protected boolean omitElementClose( String qualifiedName ) { return internalGetOmitElementCloseSet().contains( qualifiedName.toUpperCase() ); } private HashSet internalGetOmitElementCloseSet() { if (omitElementCloseSet == null) { omitElementCloseSet = new HashSet(); loadOmitElementCloseSet(omitElementCloseSet); } return omitElementCloseSet; } //If you change this, change the javadoc for getOmitElementCloseSet. protected void loadOmitElementCloseSet(Set set) { set.add( "AREA" ); set.add( "BASE" ); set.add( "BR" ); set.add( "COL" ); set.add( "HR" ); set.add( "IMG" ); set.add( "INPUT" ); set.add( "LINK" ); set.add( "META" ); set.add( "P" ); set.add( "PARAM" ); } //let the people see the set, but not modify it. /** A clone of the Set of elements that can have their close-tags omitted. By default it * should be * "AREA", * "BASE", * "BR", * "COL", * "HR", * "IMG", * "INPUT", * "LINK", * "META",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -