📄 xmlhelper.java

📁 本程序用JAVA编制
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
      File f = new File(fileName);
      FileOutputStream fos = new FileOutputStream(f);
      XMLSerializer serializer = new XMLSerializer(fos, of);
      serializer.serialize(doc);
      fos.close();
    } catch (IOException ioe) {
      throw new XMLHelperException("Unable to write to the given file", ioe);
    }
  }

  /**
   * A utility method for converting an XML document to a <CODE>String</CODE> object.
   * This method is included in case the user would like to do their own I/O in a way
   * not specified in this class.
   *
   * @param doc The XML document to be encoded as a <CODE>String</CODE>.
   * @return The XML document as text in a <CODE>String</CODE>.
   */

  public static String convertXMLToString(Document doc) throws XMLHelperException {
    // The serializer writes into this in-memory buffer instead of a stream.
    StringWriter buffer = new StringWriter();
    try {
      OutputFormat format = new OutputFormat(doc);
      format.setIndenting(true);
      new XMLSerializer(buffer, format).serialize(doc);
    } catch (IOException ioe) {
      throw new XMLHelperException("Unable to write to the string", ioe);
    }
    return buffer.toString();
  }

  /**
   * <P>Copies the content of one XML <CODE>Element</CODE> to another. By setting the
   * <CODE>childrenOnly</CODE> parameter to <CODE>false</CODE>, the element is simply
   * imported as a child to the original element. If the parameter is instead
   * <CODE>true</CODE>, then the children of the element are imported as children
   * to the original element.</P>
   *
   * <P>For instance, let the <CODE>mergeToXML</CODE> parameter be the XML:</P>
   * <BLOCKQUOTE>
   * <PRE>
   * &lt;Original&gt;
   * &lt;/Original&gt;
   * </PRE>
   * </BLOCKQUOTE>
   * <P>And let the <CODE>mergeFromXML</CODE> parameter be the XML:</P>
   * <BLOCKQUOTE>
   * <PRE>
   * &lt;Target&gt;
   *   &lt;Child&gt;Child 1&lt;/Child&gt;
   *   &lt;Child&gt;Child 2&lt;/Child&gt;
   * &lt;/Target&gt;
   * </PRE>
   * </BLOCKQUOTE>
   * <P>If <CODE>childrenOnly</CODE> is set to <CODE>false</CODE>, the result is:</P>
   * <BLOCKQUOTE>
   * <PRE>
   * &lt;Original&gt;
   *   &lt;Target&gt;
   *     &lt;Child&gt;Child 1&lt;/Child&gt;
   *     &lt;Child&gt;Child 2&lt;/Child&gt;
   *   &lt;/Target&gt;
   * &lt;/Original&gt;
   * </PRE>
   * </BLOCKQUOTE>
   * <P>Otherwise, if <CODE>childrenOnly</CODE> is set to <CODE>true</CODE>, the result is:</P>
   * <BLOCKQUOTE>
   * <PRE>
   * &lt;Original&gt;
   *   &lt;Child&gt;Child 1&lt;/Child&gt;
   *   &lt;Child&gt;Child 2&lt;/Child&gt;
   * &lt;/Original&gt;
   * </PRE>
   * </BLOCKQUOTE>
   *
   * @param mergeToXML The element into which the XML will be inserted as children.
   * @param mergeFromXML The element from which the XML will be copied and imported.
   * @param childrenOnly If <CODE>true</CODE> grab the children and ignore the parent. If <CODE>false</CODE> grab everything.
   */

  public static void mergeXML(Element mergeToXML, Element mergeFromXML, boolean childrenOnly) {
    // Work on a deep copy imported into the destination's document, so the
    // source element and its subtree are never modified.
    Document toDoc = mergeToXML.getOwnerDocument();
    Element copyElem = (Element) toDoc.importNode(mergeFromXML, true);

    if (!childrenOnly) {
      // Insert the copied element itself, subtree included.
      mergeToXML.appendChild(copyElem);
      return;
    }

    // appendChild() detaches the node from copyElem, and the list returned by
    // getChildNodes() is live, so walking it by index skips every second
    // child. Draining from the front moves every child, in document order.
    org.w3c.dom.Node child;
    while ((child = copyElem.getFirstChild()) != null) {
      mergeToXML.appendChild(child);
    }
  }

  /**
   * Retrieves an HTML page from a URL encoded as a <CODE>String</CODE> and
   * attempts to clean up the source of that HTML to remove author errors. If
   * successful, the resulting document is converted to XHTML and returned as
   * an XML document.
   *
   * @param url A <CODE>String</CODE> encoding of a URL (e.g. "http://www.ibm.com/index.html").
   * @return an XML document representing the XHTML of the source of the HTML file.
   * @exception XMLHelperException Thrown if the URL is malformed, the HTML source can not be obtained, or the tool is unable to convert the source to XML.
   */

  public static Document tidyHTML(String url) throws XMLHelperException {
    // Convert the text form first; a malformed address is rejected by the
    // conversion before any connection is attempted.
    URL parsedUrl = convertStringToURL(url);
    return tidyHTML(parsedUrl);
  }

  /**
   * Retrieves an HTML page from a java <CODE>URL</CODE> object and
   * attempts to clean up the source of that HTML to remove author errors. If
   * successful, the resulting document is converted to XHTML and returned as
   * an XML document.
   *
   * @param url A <CODE>URL</CODE> object hopefully pointing to an HTML file.
   * @return an XML document representing the XHTML of the source of the HTML file.
   * @exception XMLHelperException Thrown if the HTML source can not be obtained or the tool is unable to convert the source to XML.
   */

  public static Document tidyHTML(URL url) throws XMLHelperException {
    try {
      URLConnection inConnection = url.openConnection();

      // getContentType() returns null when the type is not known; treat that
      // as an unsupported type instead of failing with a NullPointerException.
      String contentType = inConnection.getContentType();

      if (contentType != null
          && (contentType.startsWith("text/xml") || contentType.startsWith("text/xhtml"))) {
        // Already an XML source
        return parseXMLFromURL(url);
      }
      if (contentType == null || !contentType.startsWith("text/html")) {
        throw new XMLHelperException("Unable to tidy content type: " + contentType);
      }

      // An HTML source: read it fully, blanking out stray control bytes.
      InputStream in =
          new ByteArrayInputStream(readSanitizedBytes(inConnection.getInputStream()));

      // NOTE(review): this tag table is never handed to the Tidy instance
      // below, so it may have no effect; kept as-is pending confirmation.
      org.w3c.tidy.TagTable tags = new org.w3c.tidy.TagTable();
      tags.defineBlockTag("script");

      Tidy tidy = new Tidy();

      tidy.setShowWarnings(false);
      tidy.setXmlOut(true);
      tidy.setXmlPi(false);
      tidy.setDocType("omit");
      tidy.setXHTML(false);
      tidy.setRawOut(true);
      tidy.setNumEntities(true);
      tidy.setQuiet(true);
      tidy.setFixComments(true);
      tidy.setIndentContent(true);
      // NOTE(review): a DOCTYPE_* constant is passed as the character
      // encoding here; confirm this is the intended value.
      tidy.setCharEncoding(org.w3c.tidy.Configuration.DOCTYPE_AUTO);

      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      tidy.parse(in, baos);

      // NOTE(review): baos.toString() decodes with the platform default
      // charset while the prolog declares gb2312; confirm the two agree.
      String result = "<?xml version=\"1.0\" encoding=\"gb2312\" ?>\n" + baos.toString();

      // Strip the DOCTYPE and script elements - This is an optional step
      return parseXMLFromString(stripDoctypeAndScripts(result));

    } catch (IOException ioe) {
      throw new XMLHelperException("Unable to perform input/output", ioe);
    }
  }

  /**
   * Reads a stream to its end and replaces control bytes (0-31, other than
   * tab, line feed and carriage return) with spaces. The stream is always
   * closed, even when reading fails.
   *
   * @param is The stream to drain and close.
   * @return The stream content with control bytes replaced by spaces.
   * @exception IOException Thrown if the stream can not be read or closed.
   */
  private static byte[] readSanitizedBytes(InputStream is) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try {
      byte[] buffer = new byte[20000];
      int bytesRead;
      while ((bytesRead = is.read(buffer, 0, buffer.length)) >= 0) {
        for (int i = 0; i < bytesRead; i++) {
          byte b = buffer[i];
          // Java bytes are signed: values 0x80-0xFF are negative, so the
          // lower bound is required to leave non-ASCII text untouched.
          if (b >= 0 && b < 32 && b != 10 && b != 13 && b != 9) {
            buffer[i] = 32;
          }
        }
        out.write(buffer, 0, bytesRead);
      }
    } finally {
      is.close();
    }
    return out.toByteArray();
  }

  /**
   * Removes the first DOCTYPE declaration and every script element from the
   * given markup. A DOCTYPE or script tag that is never terminated is left
   * in place, so the text is never duplicated and the loop always ends.
   *
   * @param xml The markup to filter.
   * @return The markup without its DOCTYPE declaration and script elements.
   */
  private static String stripDoctypeAndScripts(String xml) {
    String result = xml;

    int startIndex = result.indexOf("<!DOCTYPE");
    if (startIndex >= 0) {
      int endIndex = result.indexOf(">", startIndex);
      if (endIndex >= 0) {
        result = result.substring(0, startIndex) + result.substring(endIndex + 1);
      }
    }

    int searchFrom = 0;
    while ((startIndex = result.indexOf("<script", searchFrom)) >= 0) {
      int tagEnd = result.indexOf(">", startIndex);
      if (tagEnd < 0) {
        break; // unterminated start tag: nothing safe to cut
      }
      int cutEnd;
      if (result.charAt(tagEnd - 1) == '/') {
        // Self-closing element has no separate end tag.
        cutEnd = tagEnd + 1;
      } else {
        int closeIndex = result.indexOf("</script>", tagEnd);
        if (closeIndex < 0) {
          break; // no matching end tag: nothing safe to cut
        }
        cutEnd = closeIndex + "</script>".length();
      }
      result = result.substring(0, startIndex) + result.substring(cutEnd);
      searchFrom = startIndex;
    }
    return result;
  }

  // A utility method for converting a String encoding of a URL to a URL

  private static URL convertStringToURL(String url) throws XMLHelperException {
    // Translate the JDK's parse failure into the helper's own exception type,
    // keeping the original exception as the cause.
    final URL parsed;
    try {
      parsed = new URL(url);
    } catch (MalformedURLException badUrl) {
      throw new XMLHelperException(url + " is not a well formed URL", badUrl);
    }
    return parsed;
  }
}
上一页 12
💿 文件大小 950 K
👤 上传用户 qingmei_changle
📂 所属分类 Java编程
🏷️ 相关标签

#JAVA #程序 #编制
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -