📄 htmlencoder.java
字号:
/**
 * Names (lower case) of the HTML elements that never carry content.
 * When the encoder implicitly closes tags that are still open, no closing
 * tag is written for the elements contained in this set.
 */
static final HashSet emptyTags = new HashSet();
static {
    final String[] emptyTagNames = {
        "area", "base", "basefont", "br", "col", "frame", "hr",
        "img", "input", "isindex", "link", "meta", "param"
    };
    for (int idx = 0; idx < emptyTagNames.length; idx++) {
        emptyTags.add(emptyTagNames[idx]);
    }
}
// Parse states used while inside an HTML tag; the current state is kept in
// the htmlTagMode local of encode(). A tag starts out in TAG_NAME, and
// TAG_SPACE is entered after the closing quote of a quoted value.
static final byte TAG_NAME = 0;
static final byte TAG_SPACE = 1;
// NOTE(review): presumably "inside an attribute name" / "inside an attribute
// value" - not used in the portion of encode() visible here; confirm.
static final byte TAG_ATT_NAME = 2;
static final byte TAG_ATT_VAL = 3;
// Kinds of content the encoder is "entering" or "exiting" at a tag boundary.
// The numeric order matters: encode() compares these values with <, >= and
// against the line break counter when deciding where to emit paragraph and
// line break tags, so do not renumber them.
static final byte TEXT = 0;
static final byte SEMIBLOCK = 1;
static final byte BLOCK = 2;
static final byte INTERNAL = 3;
// Platform line separator; written to the output once for every line break
// met in the input outside of tags.
static final String newLine = System.getProperty("line.separator");
/**
 * Do "smart" encoding on a string and return the result. This means that valid
 * HTML entities and tags, Helma macros and HTML comments are passed through
 * unescaped, while other occurrences of '<', '>' and '&' are encoded to HTML
 * entities. Paragraph generation is switched off and no restriction on the
 * allowed tags is applied.
 *
 * @param str the string to encode, may be null
 * @return the encoded string; null if str is null, the empty string if str is empty
 */
public final static String encode(String str) {
    if (str == null) {
        return null;
    }
    if (str.length() == 0) {
        return "";
    }
    // presize the buffer with 40% headroom so that it rarely has to grow
    // while entities and break tags are appended
    int capacity = Math.round(str.length() * 1.4f);
    StringBuffer buffer = new StringBuffer(capacity);
    encode(str, buffer, false, null);
    return buffer.toString();
}
/**
 * Do "smart" encoding on a string and append the result to a string buffer.
 * This means that valid HTML entities and tags, Helma macros and HTML comments
 * are passed through unescaped, while other occurrences of '<', '>' and '&'
 * are encoded to HTML entities.
 *
 * @param str the string to encode; if null, nothing is appended
 * @param ret the string buffer to encode to
 */
public final static void encode(String str, StringBuffer ret) {
// delegate with paragraph generation disabled and no restriction on allowed tags
encode(str, ret, false, null);
}
/**
* Do "smart" encoding on a string. This means that valid HTML entities and tags,
* Helma macros and HTML comments are passed through unescaped, while
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
*
* @param str the string to encode
* @param ret the string buffer to encode to
* @param paragraphs if true use p tags for paragraphs, otherwise just use br's
* @param allowedTags a set containing the names of allowed tags as strings. All other
* tags will be escaped
*/
public final static void encode(String str, StringBuffer ret,
boolean paragraphs, Set allowedTags) {
if (str == null) {
return;
}
int l = str.length();
// where to insert the <p> tag in case we want to create a paragraph later on
int paragraphStart = ret.length();
// what kind of element/text are we leaving and entering?
// this is one of TEXT|SEMIBLOCK|BLOCK|INTERNAL
// depending on this information, we decide whether and how to insert
// paragraphs and line breaks. "entering" a tag means we're at the '<'
// and exiting means we're at the '>', not that it's a start or close tag.
byte entering = TEXT;
byte exiting = TEXT;
Stack openTags = new Stack();
// are we currently within a < and a > that consitute some kind of tag?
// we use tag balancing to know whether we are inside a tag (and should
// pass things through unchanged) or outside (and should encode stuff).
boolean insideTag = false;
// are we inside an HTML tag?
boolean insideHtmlTag = false;
boolean insideCloseTag = false;
byte htmlTagMode = TAG_NAME;
// if we are inside a <code> tag, we encode everything to make
// documentation work easier
boolean insideCodeTag = false;
boolean insidePreTag = false;
// are we within a Helma <% macro %> tag? We treat macro tags and
// comments specially, since we can't rely on tag balancing
// to know when we leave a macro tag or comment.
boolean insideMacroTag = false;
// are we inside an HTML comment?
boolean insideComment = false;
// the quotation mark we are in within an HTML or Macro tag, if any
char htmlQuoteChar = '\u0000';
char macroQuoteChar = '\u0000';
// number of newlines met since the last non-whitespace character
int linebreaks = 0;
// did we meet a backslash escape?
boolean escape = false;
boolean triggerBreak = false;
for (int i = 0; i < l; i++) {
char c = str.charAt(i);
// step one: check if this is the beginning of an HTML tag, comment or
// Helma macro.
if (c == '<') {
if (i < (l - 2)) {
if (!insideMacroTag && ('%' == str.charAt(i + 1))) {
// this is the beginning of a Helma macro tag
if (!insideCodeTag) {
insideMacroTag = insideTag = true;
macroQuoteChar = '\u0000';
}
} else if (('!' == str.charAt(i + 1)) && ('-' == str.charAt(i + 2))) {
// the beginning of an HTML comment?
if (!insideCodeTag) {
insideComment = insideTag = ((i < (l - 3)) &&
('-' == str.charAt(i + 3)));
}
} else if (!insideTag) {
// check if this is a HTML tag.
insideCloseTag = ('/' == str.charAt(i + 1));
int tagStart = insideCloseTag ? (i + 2) : (i + 1);
int j = tagStart;
while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
j++;
if ((j > tagStart) && (j < l)) {
String tagName = str.substring(tagStart, j).toLowerCase();
if ("code".equals(tagName) && insideCloseTag &&
insideCodeTag) {
insideCodeTag = false;
}
if (((allowedTags == null) || allowedTags.contains(tagName)) &&
allTags.contains(tagName) && !insideCodeTag) {
insideHtmlTag = insideTag = true;
htmlQuoteChar = '\u0000';
htmlTagMode = TAG_NAME;
exiting = entering;
entering = TEXT;
if (internalTags.contains(tagName)) {
entering = INTERNAL;
} else if (blockTags.contains(tagName)) {
entering = BLOCK;
} else if (semiBlockTags.contains(tagName)) {
entering = paragraphs ? BLOCK : SEMIBLOCK;
}
if (entering > 0) {
triggerBreak = !insidePreTag;
}
if (insideCloseTag) {
int t = openTags.search(tagName);
if (t == -1) {
i = j;
insideHtmlTag = insideTag = false;
continue;
} else if (t > 1) {
for (int k = 1; k < t; k++) {
Object tag = openTags.pop();
if (!emptyTags.contains(tag)) {
ret.append("</");
ret.append(tag);
ret.append(">");
}
}
}
openTags.pop();
} else {
openTags.push(tagName);
}
if ("code".equals(tagName) && !insideCloseTag) {
insideCodeTag = true;
}
if ("pre".equals(tagName)) {
insidePreTag = !insideCloseTag;
}
}
}
}
} // if (i < l-2)
}
if ((triggerBreak || linebreaks > 0) && !Character.isWhitespace(c)) {
if (!insideTag) {
exiting = entering;
entering = TEXT;
if (exiting >= SEMIBLOCK) {
paragraphStart = ret.length();
}
}
if (entering != INTERNAL && exiting != INTERNAL) {
int swallowBreaks = 0;
if (paragraphs &&
(entering != BLOCK || exiting != BLOCK) &&
(exiting < BLOCK) &&
(linebreaks > 1) &&
paragraphStart < ret.length()) {
ret.insert(paragraphStart, "<p>");
ret.append("</p>");
swallowBreaks = 2;
}
// treat entering a SEMIBLOCK as entering a TEXT
int _entering = entering == SEMIBLOCK ? TEXT : entering;
for (int k = linebreaks-1; k>=0; k--) {
if (k >= swallowBreaks && k >= _entering && k >= exiting) {
ret.append("<br />");
}
ret.append(newLine);
}
if (exiting >= SEMIBLOCK || linebreaks > 1) {
paragraphStart = ret.length();
}
}
linebreaks = 0;
triggerBreak = false;
}
switch (c) {
case '<':
if (insideTag) {
ret.append('<');
} else {
ret.append("<");
}
break;
case '&':
// check if this is an HTML entity already,
// in which case we pass it though unchanged
if ((i < (l - 3)) && !insideCodeTag) {
// is this a numeric entity?
if (str.charAt(i + 1) == '#') {
int j = i + 2;
while ((j < l) && Character.isDigit(str.charAt(j)))
j++;
if ((j < l) && (str.charAt(j) == ';')) {
ret.append("&");
break;
}
} else {
int j = i + 1;
while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
j++;
if ((j < l) && (str.charAt(j) == ';')) {
ret.append("&");
break;
}
}
}
// we didn't reach a break, so encode the ampersand as HTML entity
ret.append("&");
break;
case '\\':
ret.append(c);
if (insideTag && !insideComment) {
escape = !escape;
}
break;
case '"':
case '\'':
ret.append(c);
if (!insideComment) {
// check if the quote is escaped
if (insideMacroTag) {
if (escape) {
escape = false;
} else if (macroQuoteChar == c) {
macroQuoteChar = '\u0000';
} else if (macroQuoteChar == '\u0000') {
macroQuoteChar = c;
}
} else if (insideHtmlTag) {
if (escape) {
escape = false;
} else if (htmlQuoteChar == c) {
htmlQuoteChar = '\u0000';
htmlTagMode = TAG_SPACE;
} else if (htmlQuoteChar == '\u0000') {
htmlQuoteChar = c;
}
}
}
break;
case '\n':
if (insideTag || insidePreTag) {
ret.append('\n');
} else {
linebreaks++;
}
break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -