📄 htmlencoder.java

📁 使用工具jublider开发的一个聊天室实现基本功能,
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    static final HashSet emptyTags = new HashSet();

    static {
        emptyTags.add("area");
        emptyTags.add("base");
        emptyTags.add("basefont");
        emptyTags.add("br");
        emptyTags.add("col");
        emptyTags.add("frame");
        emptyTags.add("hr");
        emptyTags.add("img");
        emptyTags.add("input");
        emptyTags.add("isindex");
        emptyTags.add("link");
        emptyTags.add("meta");
        emptyTags.add("param");
    }

    static final byte TAG_NAME = 0;
    static final byte TAG_SPACE = 1;
    static final byte TAG_ATT_NAME = 2;
    static final byte TAG_ATT_VAL = 3;

    static final byte TEXT = 0;
    static final byte SEMIBLOCK = 1;
    static final byte BLOCK = 2;
    static final byte INTERNAL = 3;

    static final String newLine = System.getProperty("line.separator");

    /**
     *  Do "smart" encodging on a string. This means that valid HTML entities and tags,
     *  Helma macros and HTML comments are passed through unescaped, while
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     */
    public final static String encode(String str) {
        if (str == null) {
            return null;
        }

        int l = str.length();

        if (l == 0) {
            return "";
        }

        // try to make stringbuffer large enough from the start
        StringBuffer ret = new StringBuffer(Math.round(l * 1.4f));

        encode(str, ret, false, null);

        return ret.toString();
    }

    /**
     *  Do "smart" encodging on a string. This means that valid HTML entities and tags,
     *  Helma macros and HTML comments are passed through unescaped, while
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     */
    public final static void encode(String str, StringBuffer ret) {
        encode(str, ret, false, null);
    }

    /**
     *  Do "smart" encodging on a string. This means that valid HTML entities and tags,
     *  Helma macros and HTML comments are passed through unescaped, while
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     *
     *  @param str the string to encode
     *  @param ret the string buffer to encode to
     *  @param paragraphs if true use p tags for paragraphs, otherwise just use br's
     *  @param allowedTags a set containing the names of allowed tags as strings. All other
     *                     tags will be escaped
     */
    public final static void encode(String str, StringBuffer ret,
                                    boolean paragraphs, Set allowedTags) {
        if (str == null) {
            return;
        }

        int l = str.length();

        // where to insert the <p> tag in case we want to create a paragraph later on
        int paragraphStart = ret.length();

        // what kind of element/text are we leaving and entering?
        // this is one of TEXT|SEMIBLOCK|BLOCK|INTERNAL
        // depending on this information, we decide whether and how to insert
        // paragraphs and line breaks. "entering" a tag means we're at the '<'
        // and exiting means we're at the '>', not that it's a start or close tag.
        byte entering = TEXT;
        byte exiting = TEXT;

        Stack openTags = new Stack();

        // are we currently within a < and a > that consitute some kind of tag?
        // we use tag balancing to know whether we are inside a tag (and should
        // pass things through unchanged) or outside (and should encode stuff).
        boolean insideTag = false;

        // are we inside an HTML tag?
        boolean insideHtmlTag = false;
        boolean insideCloseTag = false;
        byte htmlTagMode = TAG_NAME;

        // if we are inside a <code> tag, we encode everything to make
        // documentation work easier
        boolean insideCodeTag = false;
        boolean insidePreTag = false;

        // are we within a Helma <% macro %> tag? We treat macro tags and
        // comments specially, since we can't rely on tag balancing
        // to know when we leave a macro tag or comment.
        boolean insideMacroTag = false;

        // are we inside an HTML comment?
        boolean insideComment = false;

        // the quotation mark we are in within an HTML or Macro tag, if any
        char htmlQuoteChar = '\u0000';
        char macroQuoteChar = '\u0000';

        // number of newlines met since the last non-whitespace character
        int linebreaks = 0;

        // did we meet a backslash escape?
        boolean escape = false;

        boolean triggerBreak = false;

        for (int i = 0; i < l; i++) {
            char c = str.charAt(i);

            // step one: check if this is the beginning of an HTML tag, comment or
            // Helma macro.
            if (c == '<') {
                if (i < (l - 2)) {
                    if (!insideMacroTag && ('%' == str.charAt(i + 1))) {
                        // this is the beginning of a Helma macro tag
                        if (!insideCodeTag) {
                            insideMacroTag = insideTag = true;
                            macroQuoteChar = '\u0000';
                        }
                    } else if (('!' == str.charAt(i + 1)) && ('-' == str.charAt(i + 2))) {
                        // the beginning of an HTML comment?
                        if (!insideCodeTag) {
                            insideComment = insideTag = ((i < (l - 3)) &&
                                                        ('-' == str.charAt(i + 3)));
                        }
                    } else if (!insideTag) {
                        // check if this is a HTML tag.
                        insideCloseTag = ('/' == str.charAt(i + 1));
                        int tagStart = insideCloseTag ? (i + 2) : (i + 1);
                        int j = tagStart;

                        while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
                            j++;

                        if ((j > tagStart) && (j < l)) {
                            String tagName = str.substring(tagStart, j).toLowerCase();

                            if ("code".equals(tagName) && insideCloseTag &&
                                    insideCodeTag) {
                                insideCodeTag = false;
                            }

                            if (((allowedTags == null) || allowedTags.contains(tagName)) &&
                                    allTags.contains(tagName) && !insideCodeTag) {
                                insideHtmlTag = insideTag = true;
                                htmlQuoteChar = '\u0000';
                                htmlTagMode = TAG_NAME;

                                exiting = entering;
                                entering = TEXT;

                                if (internalTags.contains(tagName)) {
                                    entering = INTERNAL;
                                } else if (blockTags.contains(tagName)) {
                                    entering = BLOCK;
                                } else if (semiBlockTags.contains(tagName)) {
                                    entering = paragraphs ? BLOCK : SEMIBLOCK;
                                }

                                if (entering > 0) {
                                    triggerBreak = !insidePreTag;
                                }

                                if (insideCloseTag) {
                                    int t = openTags.search(tagName);

                                    if (t == -1) {
                                        i = j;
                                        insideHtmlTag = insideTag = false;

                                        continue;
                                    } else if (t > 1) {
                                        for (int k = 1; k < t; k++) {
                                            Object tag = openTags.pop();
                                            if (!emptyTags.contains(tag)) {
                                                ret.append("</");
                                                ret.append(tag);
                                                ret.append(">");
                                            }
                                        }
                                    }

                                    openTags.pop();
                                } else {
                                    openTags.push(tagName);
                                }

                                if ("code".equals(tagName) && !insideCloseTag) {
                                    insideCodeTag = true;
                                }

                                if ("pre".equals(tagName)) {
                                    insidePreTag = !insideCloseTag;
                                }
                            }
                        }
                    }
                } // if (i < l-2)
            }

            if ((triggerBreak || linebreaks > 0) && !Character.isWhitespace(c)) {

                if (!insideTag) {
                    exiting = entering;
                    entering = TEXT;
                    if (exiting >= SEMIBLOCK) {
                        paragraphStart = ret.length();
                    }
                }

                if (entering != INTERNAL && exiting != INTERNAL) {
                    int swallowBreaks = 0;
                    if (paragraphs && 
                          (entering != BLOCK || exiting != BLOCK) &&
                          (exiting < BLOCK) &&
                          (linebreaks > 1) &&
                          paragraphStart < ret.length()) {
                        ret.insert(paragraphStart, "<p>");
                        ret.append("</p>");
                        swallowBreaks = 2;
                    }

                    // treat entering a SEMIBLOCK as entering a TEXT 
                    int _entering = entering == SEMIBLOCK ? TEXT : entering;
                    for (int k = linebreaks-1; k>=0; k--) {
                        if (k >= swallowBreaks && k >= _entering && k >= exiting) {
                            ret.append("<br />");
                        }
                        ret.append(newLine);
                    }
                    if (exiting >= SEMIBLOCK || linebreaks > 1) {
                        paragraphStart = ret.length();
                    }

                }

                linebreaks = 0;
                triggerBreak = false;
            }

            switch (c) {
                case '<':

                    if (insideTag) {
                        ret.append('<');
                    } else {
                        ret.append("&lt;");
                    }

                    break;

                case '&':

                    // check if this is an HTML entity already,
                    // in which case we pass it though unchanged
                    if ((i < (l - 3)) && !insideCodeTag) {
                        // is this a numeric entity?
                        if (str.charAt(i + 1) == '#') {
                            int j = i + 2;

                            while ((j < l) && Character.isDigit(str.charAt(j)))
                                j++;

                            if ((j < l) && (str.charAt(j) == ';')) {
                                ret.append("&");

                                break;
                            }
                        } else {
                            int j = i + 1;

                            while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
                                j++;

                            if ((j < l) && (str.charAt(j) == ';')) {
                                ret.append("&");

                                break;
                            }
                        }
                    }

                    // we didn't reach a break, so encode the ampersand as HTML entity
                    ret.append("&amp;");

                    break;

                case '\\':
                    ret.append(c);

                    if (insideTag && !insideComment) {
                        escape = !escape;
                    }

                    break;

                case '"':
                case '\'':
                    ret.append(c);

                    if (!insideComment) {
                        // check if the quote is escaped
                        if (insideMacroTag) {
                            if (escape) {
                                escape = false;
                            } else if (macroQuoteChar == c) {
                                macroQuoteChar = '\u0000';
                            } else if (macroQuoteChar == '\u0000') {
                                macroQuoteChar = c;
                            }
                        } else if (insideHtmlTag) {
                            if (escape) {
                                escape = false;
                            } else if (htmlQuoteChar == c) {
                                htmlQuoteChar = '\u0000';
                                htmlTagMode = TAG_SPACE;
                            } else if (htmlQuoteChar == '\u0000') {
                                htmlQuoteChar = c;
                            }
                        }
                    }

                    break;

                case '\n':
                    if (insideTag || insidePreTag) {
                        ret.append('\n');
                    } else {
                        linebreaks++;
                    }

                    break;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -