📄 cmshtmlimportconverter.java
字号:
/*
* File :
* Date :
* Version:
*
* This library is part of OpenCms -
* the Open Source Content Management System
*
* Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* For further information about Alkacon Software GmbH, please see the
* company website: http://www.alkacon.com
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.opencms.workplace.tools.database;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;
import org.opencms.util.CmsStringUtil;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
/**
* This class implements Html-converting routines based on tidy to modify the
* Html code of the imported Html pages.<p>
*
* @author Michael Emmerich
*
* @version $Revision: 1.9 $
*
* @since 6.0.0
*/
public class CmsHtmlImportConverter {
/** Name of the "alt" attribute. */
private static final String ATTRIB_ALT = "alt";
/** Name of the "content" attribute. */
private static final String ATTRIB_CONTENT = "content";
/** Name of the "href" attribute. */
private static final String ATTRIB_HREF = "href";
/** Name of the "name" attribute. */
private static final String ATTRIB_NAME = "name";
/** Name of the "src" attribute. */
private static final String ATTRIB_SRC = "src";
/** Name of the body node. */
private static final String NODE_BODY = "body";
/** Name of the head node. */
private static final String NODE_HEAD = "head";
/** Name of the anchor ("a") node. */
private static final String NODE_HREF = "a";
/** Name of the html node. */
private static final String NODE_HTML = "html";
/** Name of the img node. */
private static final String NODE_IMG = "img";
/** Name of the meta node. */
private static final String NODE_META = "meta";
/** Name of the title node. */
private static final String NODE_TITLE = "title";
/**
* Set of tag names; after the end-tag of each of these tags, a "\n" is added to the output.<p>
*/
private HashSet m_enterTags = new HashSet();
/**
* The absolute path in the real filesystem of the file to convert,
* stored with every '\' replaced by '/'.
*/
private String m_filename;
/**
* Reference to the HtmlImport object, required to access the link translation.
*/
private CmsHtmlImport m_htmlImport;
/**
* Temporary buffer used in the transformation method; its content is what
* the conversion finally writes to the output.
*/
private StringBuffer m_tempString;
/** Instance of JTidy. */
private Tidy m_tidy = new Tidy();
/** Flag to write the output. */
private boolean m_write;
/**
 * Creates a new converter and configures the embedded JTidy instance.<p>
 *
 * Tidy is set up to stay silent (no tidy mark, no warnings, quiet mode) and to
 * produce output even for faulty input. In XML mode, XML tag handling and
 * XML space handling are switched on additionally.<p>
 *
 * @param htmlImport reference to the htmlimport
 * @param xmlMode switch for setting the import to HTML or XML mode
 */
public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) {

    // basic tidy setup, identical for HTML and XML mode
    this.m_tidy.setTidyMark(false);
    this.m_tidy.setShowWarnings(false);
    this.m_tidy.setQuiet(true);
    this.m_tidy.setForceOutput(true);

    // additional settings that only apply to XML mode
    if (xmlMode) {
        this.m_tidy.setXmlTags(true);
        this.m_tidy.setXmlSpace(true);
    }

    // fill the tag set before the import reference is stored
    this.initialiseTags();
    this.m_htmlImport = htmlImport;
}
/**
 * Cuts a part out of a HTML page.<p>
 *
 * The returned text begins directly behind the first match of the start expression
 * and stops directly in front of the first match of the end expression that is found
 * at or after that position. Both expressions are regular expressions and are matched
 * case insensitively. If the start expression does not match, extraction begins at the
 * first character; if the end expression does not match, it runs to the last character.<p>
 *
 * @param content the content to extract the body from
 * @param startpoint the point where matching starts
 * @param endpoint the point where matching ends
 * @return the extracted body tag content
 */
public static String extractHtml(String content, String startpoint, String endpoint) {

    // compile both expressions up front, ignoring case
    Pattern openingRegex = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE);
    Pattern closingRegex = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE);
    Matcher opening = openingRegex.matcher(content);
    Matcher closing = closingRegex.matcher(content);

    // begin behind the start match, or at the very beginning without one
    int from = opening.find() ? opening.end() : 0;
    // stop in front of the end match located at or after "from", or at the very end without one
    int to = closing.find(from) ? closing.start() : content.length();

    return content.substring(from, to);
}
/**
 * Transforms HTML code into user defined output.<p>
 *
 * The complete input is read into memory. If both patterns are given and they actually
 * cut something out of the input, the document is rebuilt from the original head part
 * and the extracted part. The result is parsed with JTidy and handed to
 * <code>printDocument</code>; afterwards the content of <code>m_tempString</code> is
 * post-processed and written to the output, which is then closed (also if writing fails).<p>
 *
 * If reading the input fails, a warning is logged and the method returns without
 * writing to or closing the output.<p>
 *
 * NOTE(review): this method reads <code>m_tempString</code> but does not create it. In the
 * code visible here only the String based <code>convertHTML</code> variant initialises it
 * before delegating to this method - confirm before calling this method directly.<p>
 *
 * @param input Reader with HTML code
 * @param output Writer with transformed code
 * @param startPattern the start pattern definition for content extracting
 * @param endPattern the end pattern definition for content extracting
 * @param properties the file properties
 */
public void convertHTML(Reader input, Writer output, String startPattern, String endPattern, Hashtable properties) {

    // read the complete input in chunks instead of one character at a time
    StringBuffer htmlString = new StringBuffer();
    try {
        char[] chunk = new char[4096];
        int count;
        while ((count = input.read(chunk)) != -1) {
            htmlString.append(chunk, 0, count);
        }
    } catch (IOException e) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, e.getLocalizedMessage()));
        }
        return;
    }
    String outString = htmlString.toString();

    // extract from html only if both patterns are defined
    if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) {
        String extractMain = extractHtml(outString, startPattern, endPattern);
        // an unchanged length means nothing was cut out, so the input is kept as it is
        if (extractMain.length() != outString.length()) {
            String extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX);
            StringBuffer buffer = new StringBuffer(extractHead.length() + extractMain.length() + 255);
            buffer.append("<html>");
            buffer.append(extractHead);
            buffer.append("<body>");
            buffer.append(extractMain);
            buffer.append("</body></html>");
            outString = buffer.toString();
        }
    }

    // convert the String into an InputStream for parseDOM
    InputStream in;
    try {
        in = new ByteArrayInputStream(outString.getBytes(CmsEncoder.ENCODING_UTF_8));
    } catch (UnsupportedEncodingException e) {
        // this should never happen since UTF-8 is always supported
        in = new ByteArrayInputStream(outString.getBytes());
    }
    m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
    m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);

    // tidy error output goes to an in-memory stream that is never read
    PrintWriter errorLog = new PrintWriter(new ByteArrayOutputStream(), true);
    m_tidy.setErrout(errorLog);
    Node node = m_tidy.parseDOM(in, null);

    // parse errors are only logged, the conversion continues
    if (m_tidy.getParseErrors() != 0) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
        }
    }

    // second step: create transformed output with printDocument from DOM
    this.printDocument(node, properties);

    String content = m_tempString.toString();
    content = CmsStringUtil.substitute(content, "<br></br>", "<br>");
    // insert a blank between a closing anchor tag and word characters directly following it
    content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g");

    try {
        try {
            output.write(content);
        } finally {
            // close the writer even if writing failed, so it is not leaked
            output.close();
        }
    } catch (IOException e) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, e.getLocalizedMessage()));
        }
    }
}
/**
 * Transforms HTML code given as a String and returns the result as a String.<p>
 *
 * Resets the temporary buffer, switches the write flag on and stores the filename
 * (with every '\' replaced by '/') before the Reader / Writer based
 * <code>convertHTML</code> variant is called on in-memory streams.<p>
 *
 * @param filename the absolute path in the real filesystem of the file to convert
 * @param inString String with HTML code
 * @param startPattern the start pattern definition for content extracting
 * @param endPattern the end pattern definition for content extracting
 * @param properties the file properties
 * @return String with transformed code
 */
public String convertHTML(
    String filename,
    String inString,
    String startPattern,
    String endPattern,
    Hashtable properties) {

    // reset the state used during a single conversion
    m_tempString = new StringBuffer();
    m_write = true;
    m_filename = filename.replace('\\', '/');

    // run the stream based conversion completely in memory
    StringWriter result = new StringWriter();
    convertHTML(new StringReader(inString), result, startPattern, endPattern, properties);
    return result.toString();
}
/**
* Initialises the HashSet m_enterTags with tag names.<p>
*/
private void initialiseTags() {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -