📄 kittest.java

📁 html解析包可以很方便的解析html 纯java 实现
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
            node = (Node)mNodes.elementAt (i);            if (node instanceof Tag)            {                ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ().substring (1);                if (match (theirs, ours))                {                    match = i;                    break;                }            }        }        if (-1 == match)        {            node = (Node)mNodes.elementAt (mIndex);            ours = node.getText ();            System.out.println ("theirs: " + theirs);            Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());            System.out.println ("ours " + cursor + ": " + ours);        }        else        {            boolean skipped = false;            for (int i = mIndex; i < match; i++)            {                ours = ((Node)mNodes.elementAt (i)).toHtml ();                if (0 != ours.trim ().length ())                {                    if (!skipped)                        System.out.println ("skipping:");                    System.out.println (ours);                    skipped = true;                }            }            if (skipped)            {                System.out.println ("to match:");                node = (Node)mNodes.elementAt (match);                Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());                System.out.println ("@" + cursor + ": " + node.toHtml ());            }//            System.out.println (" match: " + theirs);            mIndex = match + 1;        }    }    /**     * Callback for a non-composite tag.     * @param t The tag extracted from the page.     * @param a The attributes parsed out of the tag.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleSimpleTag (HTML.Tag t, MutableAttributeSet a, int pos)    {        String theirs;        Node node;        int match;        String ours;        theirs = t.toString ();        match = -1;        for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)        {            node = (Node)mNodes.elementAt (i);            if (node instanceof Tag)            {                ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();                if (match (theirs, ours))                {                    match = i;                    break;                }                if (match (theirs, ours))                {                    match = i;                    break;                }            }        }        if (-1 == match)        {            node = (Node)mNodes.elementAt (mIndex);            ours = node.getText ();            System.out.println ("theirs: " + theirs);            Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());            System.out.println ("ours " + cursor + ": " + ours);        }        else        {            boolean skipped = false;            for (int i = mIndex; i < match; i++)            {                ours = ((Node)mNodes.elementAt (i)).toHtml ();                if (0 != ours.trim ().length ())                {                    if (!skipped)                        System.out.println ("skipping:");                    System.out.println (ours);                    skipped = true;                }            }            if (skipped)            {                System.out.println ("to match:");                node = (Node)mNodes.elementAt (match);                Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.getStartPosition ());                System.out.println ("@" + cursor + ": " + node.toHtml ());            }//            System.out.println (" match: " + theirs);            mIndex = match + 1;        }    }    /**     * Callback for an error condition.     * @param errorMsg The error condition as a text message.     * @param pos The position in the page.     * <em>Note: This differs from the Lexer concept of position which is an     * absolute location in the HTML input stream. This position is the character     * position if the text from the page were displayed in a browser.</em>     */    public void handleError (String errorMsg, int pos)    {        System.out.println ("******* error @" + pos + " ******** " + errorMsg);    }    /**     * Callback for flushing the state, just prior to shutting down the parser.     */    public void flush () throws BadLocationException    {    }    /**     * This is invoked after the stream has been parsed, but before     * <code>flush</code>. <code>eol</code> will be one of \n, \r     * or \r\n, which ever is encountered the most in parsing the     * stream.     *     * @since 1.3     */    public void handleEndOfLineString (String eol)    {    }//    /**//     * Get the document data from the URL.//     * @param rd The reader to read bytes from.//     * @return The parsed HTML document.//     *///    protected static Element[] getData (Reader rd) throws IOException//    {//        EditorKit kit;//        Document doc;//        Element[] ret;////        ret = null;////        // need this because HTMLEditorKit is not thread safe apparently//        synchronized (Boolean.TRUE)//        {//            kit = new HTMLEditorKit ();//            doc = kit.createDefaultDocument ();//            // the Document class does not yet handle charset's properly//            doc.putProperty ("IgnoreCharsetDirective", Boolean.TRUE);////            try//            {//                // parse the HTML//                kit.read (rd, doc, 0);//            }//            catch (BadLocationException ble)//            {//                throw new IOException ("parse error " + ble.getMessage ());//            }////            ret = doc.getRootElements ();//        }////        return (ret);//    }//    public static void scanElements (Element element) throws BadLocationException//    {//        int start;//        int end;//        String string;//        ElementIterator it;//        Element child;////        if (element.isLeaf ())//        {//            start = element.getStartOffset ();//            end = element.getEndOffset ();//            string = element.getDocument ().getText (start, end - start);//            System.out.println (string);//        }//        else//            // iterate through the elements of the element//            for (int i = 0; i < element.getElementCount (); i++)//            {//                child = element.getElement (i);//                scanElements (child);//            }//    }    /**     * Subclass of HTMLEditorKit to expose getParser().     */    class MyKit extends HTMLEditorKit    {        public MyKit ()        {        }        public HTMLEditorKit.Parser getParser ()        {            return (super.getParser ());        }    }    /**     * Return a editor kit.     */    public MyKit getKit ()    {        return (new MyKit ());    }    /**     * Manline for the test.     * @param args the command line arguments.     * If present the first array element is used as a URL to parse.     */    public static void main (String[] args) throws ParserException, IOException    {        String link;        Lexer lexer;        Node node;        Vector nodes;        KitTest test;        MyKit kit;        Parser parser;        if (0 == args.length)            link = "http://sourceforge.net/projects/htmlparser";        else            link = args[0];        // pass through it once to read the entire page        URL url = new URL (link);        lexer = new Lexer (url.openConnection ());        nodes = new Vector ();        while (null != (node = lexer.nextNode ()))            nodes.addElement (node);        // reset the reader        lexer.getPage ().getSource ().reset ();        test = new KitTest (nodes);        kit = test.getKit ();        parser = kit.getParser ();        parser.parse (lexer.getPage ().getSource (), test, true);    }}/* * Revision Control Modification History * * $Log: KitTest.java,v $ * Revision 1.10  2005/05/15 11:49:05  derrickoswald * Documentation revamp part four. * Remove some checkstyle warnings. * * Revision 1.9  2005/04/10 23:20:46  derrickoswald * Documentation revamp part one. * Deprecated node decorators. * Added doSemanticAction for Text and Comment nodes. * Added missing sitecapturer scripts. * Fixed DOS batch files to work when called from any location. * * Revision 1.8  2004/07/31 16:42:31  derrickoswald * Remove unused variables and other fixes exposed by turning on compiler warnings. * * Revision 1.7  2004/05/24 16:18:31  derrickoswald * Part three of a multiphase refactoring. * The three node types are now fronted by interfaces (program to the interface paradigm) * with concrete implementations in the new htmlparser.nodes package. Classes from the * lexer.nodes package are moved to this package, and obvious references to the concrete * classes that got broken by this have been changed to use the interfaces where possible. * * Revision 1.6  2004/01/14 02:53:47  derrickoswald * *** empty log message *** * * Revision 1.5  2003/10/20 01:28:03  derrickoswald * Removed lexer level AbstractNode. * Removed data package from parser level tags. * Separated tag creation from recursion in NodeFactory interface. * * Revision 1.4  2003/09/10 03:38:24  derrickoswald * Add style checking target to ant build script: *     ant checkstyle * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory. * The rules are in the file htmlparser_checks.xml in the src directory. * * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000. * * There are a few issues with the style checker that need to be resolved before it should be taken too seriously. * For example: * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on). * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment * that's more than 80 characters long. * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box. * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } } * * But it points out some really interesting things, even if you don't agree with the style guidelines, * so it's worth a look. * * Revision 1.3  2003/08/27 02:40:24  derrickoswald * Testing cvs keyword substitution. * * */
上一页 12
💿 文件大小 2128 K
👤 上传用户 hcwlxhyq
📂 所属分类其他
🏷️ 相关标签

#html #java
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -