📄 kittest.java

📁 html解析包可以很方便的解析html 纯java 实现
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
// HTMLParser Library $Name: v1_6_20051112 $ - A java-based parser for HTML// Copyright (C) August 26, 2003 Derrick Oswald//// Revision Control Information////    $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/KitTest.java,v $//    $Author: derrickoswald $//    $Date: 2005/05/15 11:49:05 $//    $Revision: 1.10 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA//package org.htmlparser.tests.lexerTests;import java.io.IOException;import java.net.URL;import java.util.Vector;import javax.swing.text.BadLocationException;import javax.swing.text.MutableAttributeSet;import javax.swing.text.html.HTML;import javax.swing.text.html.HTMLEditorKit;import javax.swing.text.html.HTMLEditorKit.Parser;import javax.swing.text.html.HTMLEditorKit.ParserCallback;import org.htmlparser.Attribute;import org.htmlparser.Node;import org.htmlparser.Tag;import org.htmlparser.nodes.AbstractNode;import org.htmlparser.lexer.Cursor;import org.htmlparser.lexer.Lexer;import org.htmlparser.util.ParserException;import org.htmlparser.util.Translate;/** * Compare output from javax.swing.text.html.HTMLEditorKit with Lexer. * This test provides a means of comparing the lexemes from * javax.swing.text.html.HTMLEditorKit.Parser class with the lexemes * produced by the org.htmlparser.lexer.Lexer class. * <blockquote> * The differences have eluded automation since the HTMLEditorKit parser * adds spurious nodes where it thinks elements need closing or it gets * confused.  The intent is to eventually incorporate this into the * 'fit test' and run it against lots of HTML pages, but so far you must * analyse the differences by hand. * </blockquote> */public class KitTest extends ParserCallback{    Vector mNodes;    int mIndex;    /**     * Creates a new instance of KitTest     * @param nodes The list of lexemes from Lexer to compare with the kit lexemes.     */    public KitTest (Vector nodes)    {        mNodes = nodes;        mIndex = 0;    }    /**     * Remove whitespace from a string.     * @param s The string to crunch.     * @return The string with whitespace characters removed.     */    String snowhite (String s)    {        int length;        char ch;        StringBuffer ret;        length = s.length ();        ret = new StringBuffer (length);        for (int i = 0; i < length; i++)        {            ch = s.charAt (i);            if (!Character.isWhitespace (ch) && !(160 == ch))                ret.append (ch);        }        return (ret.toString ());    }    /**     * Check if two strings match.     * @param s1 One string.     * @param s2 The other string.     * @return <code>true</code> if the strings are equivalent ignoring whitespace.     */    boolean match (String s1, String s2)    {        s1 = snowhite (Translate.decode (s1));        s2 = snowhite (Translate.decode (s2));        return (s1.equalsIgnoreCase (s2));    }    /**     * Callback for a text lexeme.     * @param data The text extracted from the page.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleText (char[] data, int pos)    {        StringBuffer sb;        String theirs;        Node node;        int match;        String ours;        sb = new StringBuffer (data.length);        for (int i = 0; i < data.length; i++)        {            if (160 == data[i])                sb.append ("&nbsp;");            else                sb.append (data[i]);        }        theirs = sb.toString ();        match = -1;        for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)        {            node = (Node)mNodes.elementAt (i);            ours = node.getText ();            if (match (theirs, ours))            {                match = i;                break;            }        }        if (-1 == match)        {            node = (Node)mNodes.elementAt (mIndex);            ours = node.getText ();            System.out.println ("theirs: " + theirs);            Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());            System.out.println ("ours " + cursor + ": " + ours);        }        else        {            boolean skipped = false;            for (int i = mIndex; i < match; i++)            {                ours = ((Node)mNodes.elementAt (i)).toHtml ();                if (0 != ours.trim ().length ())                {                    if (!skipped)                        System.out.println ("skipping:");                    System.out.println (ours);                    skipped = true;                }            }            if (skipped)            {                System.out.println ("to match:");                node = (Node)mNodes.elementAt (match);                Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());                System.out.println ("@" + cursor + ": " + node.toHtml ());            }//            System.out.println (" match: " + theirs);            mIndex = match + 1;        }    }    /**     * Callback for a remark lexeme.     * @param data The text extracted from the page.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleComment (char[] data, int pos)    {        StringBuffer sb;        String theirs;        Node node;        int match;        String ours;        sb = new StringBuffer (data.length);        sb.append (data);        theirs = sb.toString ();        match = -1;        for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)        {            node = (Node)mNodes.elementAt (i);            ours = node.getText ();            if (match (theirs, ours))            {                match = i;                break;            }        }        if (-1 == match)        {            node = (Node)mNodes.elementAt (mIndex);            ours = node.getText ();            System.out.println ("theirs: " + theirs);            Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());            System.out.println ("ours " + cursor + ": " + ours);        }        else        {            boolean skipped = false;            for (int i = mIndex; i < match; i++)            {                ours = ((Node)mNodes.elementAt (i)).toHtml ();                if (0 != ours.trim ().length ())                {                    if (!skipped)                        System.out.println ("skipping:");                    System.out.println (ours);                    skipped = true;                }            }            if (skipped)            {                System.out.println ("to match:");                node = (Node)mNodes.elementAt (match);                Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());                System.out.println ("@" + cursor + ": " + node.toHtml ());            }//            System.out.println (" match: " + theirs);            mIndex = match + 1;        }    }    /**     * Callback for a start tag lexeme.     * @param t The tag extracted from the page.     * @param a The attributes parsed out of the tag.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleStartTag (HTML.Tag t, MutableAttributeSet a, int pos)    {        String theirs;        Node node;        int match;        String ours;        theirs = t.toString ();        match = -1;        for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)        {            node = (Node)mNodes.elementAt (i);            if (node instanceof Tag)            {                ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();                if (match (theirs, ours))                {                    match = i;                    break;                }            }        }        if (-1 == match)        {            node = (Node)mNodes.elementAt (mIndex);            ours = node.getText ();            System.out.println ("theirs: " + theirs);            Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());            System.out.println ("ours " + cursor + ": " + ours);        }        else        {            boolean skipped = false;            for (int i = mIndex; i < match; i++)            {                ours = ((Node)mNodes.elementAt (i)).toHtml ();                if (0 != ours.trim ().length ())                {                    if (!skipped)                        System.out.println ("skipping:");                    System.out.println (ours);                    skipped = true;                }            }            if (skipped)            {                System.out.println ("to match:");                node = (Node)mNodes.elementAt (match);                Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());                System.out.println ("@" + cursor + ": " + node.toHtml ());            }//            System.out.println (" match: " + theirs);            mIndex = match + 1;        }    }    /**     * Callback for an end tag lexeme.     * @param t The tag extracted from the page.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleEndTag (HTML.Tag t, int pos)    {        String theirs;        Node node;        int match;        String ours;        theirs = t.toString ();        match = -1;        for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)        {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -