📄 clean.java

📁 windows 代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
        Node child;

        if (node.content != null)
        {
            for (child = node.content;
                    child != null; child = child.next)
            {
                defineStyleRules(lexer, child);
            }
        }

        style2Rule(lexer, node);
    }

    /**
     * Runs the style clean-up pass over a parsed document.
     *
     * The tree is first passed through createStyleProperties; the node it
     * returns is the one used for the remaining steps. Style rules and the
     * style element are then generated only when the MakeClean
     * configuration flag is false.
     *
     * NOTE(review): generating rules only when MakeClean is *off* reads as
     * inverted relative to the option name - confirm against the callers
     * before changing it.
     *
     * @param lexer lexer whose configuration is consulted
     * @param doc   root of the tree to clean
     */
    public void cleanTree(Lexer lexer, Node doc)
    {
        Node root = createStyleProperties(lexer, doc);

        if (lexer.configuration.MakeClean)
        {
            return;
        }

        defineStyleRules(lexer, root);
        createStyleElement(lexer, root);
    }

    /**
     * Simplifies directly nested emphasis such as
     * {@code <b><b> ... </b> ...</b>}.
     *
     * Walks the sibling list starting at {@code node}. A {@code b} or
     * {@code i} element whose parent carries the same tag is handed to
     * discardContainer; the walk then resumes from whatever node that
     * call leaves in the holder object. Every other node with content is
     * descended into recursively.
     *
     * @param node first sibling to examine; may be null
     */
    public static void nestedEmphasis(Node node)
    {
        /* in/out holder through which discardContainer reports the resume point */
        MutableObject resume = new MutableObject();
        Node current = node;

        while (current != null)
        {
            Node following = current.next;
            boolean isEmphasis =
                current.tag == TagTable.tagB || current.tag == TagTable.tagI;

            if (isEmphasis
                && current.parent != null
                && current.parent.tag == current.tag)
            {
                /* redundant inner element: drop it and resume where told */
                resume.setObject(following);
                discardContainer(current, resume);
                current = (Node)resume.getObject();
            }
            else
            {
                if (current.content != null)
                {
                    nestedEmphasis(current.content);
                }

                current = following;
            }
        }
    }

    /**
     * Replaces {@code i} by {@code em} and {@code b} by {@code strong}.
     *
     * Both the element name and the tag reference of each matching node
     * are rewritten. The whole subtree is covered: siblings are walked
     * iteratively and children recursively.
     *
     * @param node first sibling to process; may be null
     */
    public static void emFromI(Node node)
    {
        for (Node current = node; current != null; current = current.next)
        {
            if (current.tag == TagTable.tagI)
            {
                current.element = new String(TagTable.tagEm.name);
                current.tag = TagTable.tagEm;
            }
            else if (current.tag == TagTable.tagB)
            {
                current.element = new String(TagTable.tagStrong.name);
                current.tag = TagTable.tagStrong;
            }

            if (current.content != null)
            {
                emFromI(current.content);
            }
        }
    }

    /**
     * Turns lists that are used purely for indentation into implicit
     * blockquotes.
     *
     * Some people use dir or ul without an li to indent the content. The
     * pattern looked for is an element whose tag uses the list parser and
     * which has exactly one child that is marked implicit. Such an element
     * has its only child stripped and is itself renamed to an implicit
     * blockquote. Children are processed before their parent, so nested
     * occurrences are converted from the inside out.
     *
     * @param node first sibling to process; may be null
     */
    public static void list2BQ(Node node)
    {
        for (Node current = node; current != null; current = current.next)
        {
            if (current.content != null)
            {
                list2BQ(current.content);
            }

            boolean indentOnlyList =
                current.tag != null
                && current.tag.parser == ParserImpl.getParseList()
                && current.hasOneChild()
                && current.content.implicit;

            if (!indentOnlyList)
            {
                continue;
            }

            stripOnlyChild(current);
            current.element = new String(TagTable.tagBlockquote.name);
            current.tag = TagTable.tagBlockquote;
            current.implicit = true;
        }
    }

    /**
     * Replaces implicit blockquotes by divs carrying a left margin.
     *
     * Nested blockquotes are reduced to a single div whose indent matches
     * the nesting depth: each collapsed level adds one to the indent and
     * the resulting style value is {@code margin-left: <2 * indent>em}.
     * Nodes that are not implicit blockquotes are only descended into.
     *
     * NOTE(review): the collapsing loop tests {@code node.implicit} (already
     * known to be true at that point) rather than the child's implicit
     * flag, so an explicit blockquote that is the only child is collapsed
     * as well - confirm whether that is intended.
     *
     * @param node first sibling to process; may be null
     */
    public static void bQ2Div(Node node)
    {
        int indent;
        String indent_buf;

        while (node != null)
        {
            if (node.tag == TagTable.tagBlockquote && node.implicit)
            {
                indent = 1;

                /* fold directly nested blockquotes into this one */
                while(node.hasOneChild() &&
                      node.content.tag == TagTable.tagBlockquote &&
                      node.implicit)
                {
                    ++indent;
                    stripOnlyChild(node);
                }

                if (node.content != null)
                    bQ2Div(node.content);

                /* Integer.toString replaces the deprecated boxing constructor */
                indent_buf = "margin-left: " +
                             Integer.toString(2 * indent) + "em";

                node.element = new String(TagTable.tagDiv.name);
                node.tag = TagTable.tagDiv;
                node.addAttribute("style", indent_buf);
            }
            else if (node.content != null)
                bQ2Div(node.content);

            node = node.next;
        }
    }

    /**
     * Prunes a conditional section: {@code node} is {@code <![if ...]>} and
     * everything up to and including the matching {@code <![endif]>} is
     * discarded.
     *
     * Nested {@code <![if ...]>} sections are pruned recursively. The node
     * that follows a nested section is inspected like any other node, so
     * an {@code <![endif]>} that comes right after a nested section is
     * correctly recognised as the end of this section (previously it was
     * discarded unseen and pruning continued over the remaining siblings).
     *
     * @param lexer lexer passed through to nested calls
     * @param node  the opening section tag
     * @return the node following the matching endif, or null when the
     *         sibling list ends before an endif is found
     */
    public static Node pruneSection(Lexer lexer, Node node)
    {
        /* discard the opening tag itself; discardElement returns the next node */
        node = Node.discardElement(node);

        while (node != null)
        {
            if (node.type == Node.SectionTag)
            {
                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
                {
                    /* nested section: re-examine whatever follows it */
                    node = pruneSection(lexer, node);
                    continue;
                }

                if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
                {
                    /* matching end marker: drop it and hand back its successor */
                    return Node.discardElement(node);
                }
            }

            /* ordinary content inside the section */
            node = Node.discardElement(node);
        }

        return null;
    }

    /**
     * Removes all section tags from a subtree.
     *
     * A section tag whose text starts with "if" is pruned together with
     * everything up to its matching endif (see pruneSection); any other
     * section tag is simply discarded. Non-section nodes are kept and
     * their children are processed recursively.
     *
     * @param lexer lexer passed through to pruneSection
     * @param node  first sibling to process; may be null
     */
    public static void dropSections(Lexer lexer, Node node)
    {
        Node current = node;

        while (current != null)
        {
            if (current.type != Node.SectionTag)
            {
                if (current.content != null)
                {
                    dropSections(lexer, current.content);
                }

                current = current.next;
                continue;
            }

            boolean opensConditional =
                (Lexer.getString(current.textarray, current.start, 2)).equals("if");

            if (opensConditional)
            {
                /* prune up to matching endif */
                current = pruneSection(lexer, current);
            }
            else
            {
                /* discard others as well */
                current = Node.discardElement(current);
            }
        }
    }

    /**
     * Unlinks presentation-related attributes from a node's attribute list.
     *
     * Removed are: class, style, lang, anything whose name starts with
     * "x:", and height/width when the node is a td, tr or th. The single
     * exception is class="Code", which is kept because it denotes pre
     * text. Attributes with a null name are always kept.
     *
     * @param node node whose attribute list is edited in place
     */
    public static void purgeAttributes(Node node)
    {
        AttVal lastKept = null;
        AttVal current = node.attributes;

        while (current != null)
        {
            AttVal following = current.next;

            /* special check for class="Code" denoting pre text */
            boolean isCodeClass =
                current.attribute != null
                && current.value != null
                && current.attribute.equals("class")
                && current.value.equals("Code");

            boolean purge = false;

            if (!isCodeClass && current.attribute != null)
            {
                boolean isSizing =
                    current.attribute.equals("height")
                    || current.attribute.equals("width");
                boolean isTablePart =
                    node.tag == TagTable.tagTd
                    || node.tag == TagTable.tagTr
                    || node.tag == TagTable.tagTh;

                purge = current.attribute.equals("class")
                    || current.attribute.equals("style")
                    || current.attribute.equals("lang")
                    || current.attribute.startsWith("x:")
                    || (isSizing && isTablePart);
            }

            if (!purge)
            {
                lastKept = current;
            }
            else if (lastKept == null)
            {
                /* removing the head of the list */
                node.attributes = following;
            }
            else
            {
                lastKept.next = following;
            }

            current = following;
        }
    }

    /*
     Word2000 uses span excessively, so we strip span out.

     The children of span are first cleaned with cleanWord2000, then moved
     out of span one by one so that they take span's place among its
     siblings, and finally span itself is discarded. Returns the node that
     followed span (span.next as read just before span is discarded), or
     null if there was none.

     cleanWord2000 also calls this on <p class="Code"> nodes, so "span" is
     not necessarily a span element.
    */
    public static Node stripSpan(Lexer lexer, Node span)
    {
        Node node;
        Node prev = null;   /* node after which the next child is inserted */
        Node content;       /* next child of span still to be moved */

        /*
         deal with span elements that have content
         by splicing the content in place of the span
         after having processed it
        */

        cleanWord2000(lexer, span.content);
        /* re-read after cleaning: the first child may have changed */
        content = span.content;

        if (span.prev != null)
            prev = span.prev;
        else if (content != null)
        {
            /*
             span has no previous sibling to insert after, so the first
             child is placed before span and becomes the insertion anchor
            */
            node = content;
            content = content.next;
            Node.removeNode(node);
            Node.insertNodeBeforeElement(span, node);
            prev = node;
        }

        /* move the remaining children, each after the previously placed node */
        while (content != null)
        {
            node = content;
            content = content.next;
            Node.removeNode(node);
            Node.insertNodeAfterElement(prev, node);
            prev = node;
        }

        /*
         span was the last child of its parent: point the parent's last
         reference at the final anchor instead.
         NOTE(review): prev is still null here when span has neither a
         previous sibling nor content - confirm that discardElement below
         leaves parent.last consistent in that case.
        */
        if (span.next == null)
            span.parent.last = prev;

        node = span.next;
        /* detach the child list before discarding span; presumably so the
           moved children are not affected - confirm against discardElement */
        span.content = null;
        Node.discardElement(span);
        return node;
    }

    /**
     * Maps non-breaking spaces (code point 160) to regular spaces in every
     * text node of a subtree.
     *
     * The text of each text node is rewritten in place inside
     * {@code textarray}: characters are read from position {@code i},
     * decoded with PPrint.getUTF8 when they start a multibyte sequence,
     * and written back at position {@code p} with PPrint.putUTF8. Because
     * a replaced character can need fewer bytes than the original, the
     * node's {@code end} is moved to the final write position afterwards.
     *
     * Two defects of the previous version are fixed here:
     * the lead byte is now read as an unsigned value (a signed byte can
     * never be greater than 0x7F, so multibyte characters were never
     * decoded and nothing was ever replaced), and {@code node.end} is now
     * updated so no stale bytes remain after the shortened text.
     *
     * @param lexer lexer passed through to recursive calls
     * @param node  first sibling to process; may be null
     */
    private static void normalizeSpaces(Lexer lexer, Node node)
    {
        while (node != null)
        {
            if (node.content != null)
                normalizeSpaces(lexer, node.content);

            if (node.type == Node.TextNode)
            {
                int i;
                MutableInteger c = new MutableInteger();
                int p = node.start;

                for (i = node.start; i < node.end; ++i)
                {
                    /* mask so the UTF-8 code unit is treated as unsigned */
                    c.value = node.textarray[i] & 0xFF;

                    /* look for UTF-8 multibyte character */
                    if (c.value > 0x7F)
                        i += PPrint.getUTF8(node.textarray, i, c);

                    if (c.value == 160)
                        c.value = ' ';

                    /* p never overtakes i, so unread input is not clobbered */
                    p = PPrint.putUTF8(node.textarray, p, c.value);
                }

                /* the text may have shrunk; drop the leftover tail */
                node.end = p;
            }

            node = node.next;
        }
    }

    /**
     * Major clean up to strip out all the extra stuff you get when you
     * save as web page from Word 2000.
     *
     * It doesn't yet know what to do with VML tags, but these will appear
     * as errors unless you declare them as new tags, such as o:p which
     * needs to be declared as inline.
     *
     * For each sibling starting at {@code node} this:
     * discards style and meta elements and comments; strips span
     * elements; stops entirely at an html element lacking an xmlns:o
     * attribute; discards link elements with rel="File-List"; discards
     * empty paragraphs; gathers runs of {@code <p class="MsoListBullet">}
     * into a ul and runs of {@code <p class="Code">} into a pre; and for
     * everything else purges presentation attributes and recurses into the
     * children.
     *
     * After a paragraph has been moved into a list or pre container the
     * loop now restarts with the node following that container. The
     * previous version fell through instead, which dereferenced null when
     * the paragraph was the last sibling and otherwise skipped the
     * following sibling, so every second bullet/code paragraph was left
     * unconverted.
     *
     * @param lexer lexer used to create inferred nodes
     * @param node  first sibling to process; may be null
     */
    public static void cleanWord2000(Lexer lexer, Node node)
    {
        /* used to build a list from a sequence of bulleted p's */
        Node list = null;

        while (node != null)
        {
            /* discard Word's style verbiage */
            if (node.tag == TagTable.tagStyle ||
                node.tag == TagTable.tagMeta ||
                node.type == Node.CommentTag)
            {
                node = Node.discardElement(node);
                continue;
            }

            /* strip out all span tags Word scatters so liberally! */
            if (node.tag == TagTable.tagSpan)
            {
                node = stripSpan(lexer, node);
                continue;
            }

            if (node.tag == TagTable.tagHtml)
            {
                /* check that it's a Word 2000 document */
                if (node.getAttrByName("xmlns:o") == null)
                    return;
            }

            if (node.tag == TagTable.tagLink)
            {
                AttVal attr = node.getAttrByName("rel");

                if (attr != null && attr.value != null &&
                    attr.value.equals("File-List"))
                {
                    node = Node.discardElement(node);
                    continue;
                }
            }

            /* discard empty paragraphs */
            if (node.content == null && node.tag == TagTable.tagP)
            {
                node = Node.discardElement(node);
                continue;
            }

            if (node.tag == TagTable.tagP)
            {
                AttVal attr = node.getAttrByName("class");

                /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
                if (attr != null && attr.value != null &&
                    attr.value.equals("MsoListBullet"))
                {
                    Node.coerceNode(lexer, node, TagTable.tagLi);

                    /* start a new ul unless the previous paragraph opened one */
                    if (list == null || list.tag != TagTable.tagUl)
                    {
                        list = lexer.inferredTag("ul");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    purgeAttributes(node);

                    if (node.content != null)
                        cleanWord2000(lexer, node.content);

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);

                    /*
                     this paragraph is fully handled; examine the node after
                     the list from the top of the loop (it may be null, or
                     the next bullet of the same run)
                    */
                    node = list.next;
                    continue;
                }
                /* map sequence of <p class="Code"> to <pre>...</pre> */
                else if (attr != null && attr.value != null &&
                         attr.value.equals("Code"))
                {
                    Node br = lexer.newLineNode();
                    normalizeSpaces(lexer, node);

                    /* start a new pre unless the previous paragraph opened one */
                    if (list == null || list.tag != TagTable.tagPre)
                    {
                        list = lexer.inferredTag("pre");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);
                    /* replace the paragraph by its (cleaned) content */
                    stripSpan(lexer, node);
                    Node.insertNodeAtEnd(list, br);

                    /* as above: continue with the node after the pre */
                    node = list.next;
                    continue;
                }
                else
                    list = null;
            }
            else
                list = null;

            /* strip out style and class attributes */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag)
                purgeAttributes(node);

            if (node.content != null)
                cleanWord2000(lexer, node.content);

            node = node.next;
        }
    }

    /**
     * Tells whether a document looks like Word 2000 output.
     *
     * @param root node on which findHTML is invoked to locate the html
     *             element
     * @return true when an html element is found and it carries an
     *         xmlns:o attribute
     */
    public static boolean isWord2000(Node root)
    {
        Node html = root.findHTML();

        if (html == null)
        {
            return false;
        }

        return html.getAttrByName("xmlns:o") != null;
    }
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -