📄 parser.java
    {
        return (getLexer ().getPage ().getConnection ());
    }

    /**
     * Set the URL for this parser.
     * This method creates a new Lexer reading from the given URL.
     * Trying to set the url to null or an empty string is a no-op.
     * @param url The new URL for the parser.
     * @throws ParserException If the url is invalid, a problem occurs in
     * connecting, or creation of the underlying Lexer cannot be performed.
     * @see #getURL
     */
    public void setURL (String url)
        throws ParserException
    {
        if ((null != url) && !"".equals (url))
            setConnection (getConnectionManager ().openConnection (url));
    }

    /**
     * Return the current URL being parsed.
     * @return The current url. This is the URL for the current page.
     * A string passed into the constructor or set via setURL may be altered,
     * for example, a file name may be modified to be a URL.
     * @see Page#getUrl
     * @see #setURL
     */
    public String getURL ()
    {
        return (getLexer ().getPage ().getUrl ());
    }

    /**
     * Set the encoding for the page this parser is reading from.
     * @param encoding The new character set to use.
     * @throws ParserException If the encoding change causes characters that
     * have already been consumed to differ from the characters that would
     * have been seen had the new encoding been in force.
     * @see org.htmlparser.util.EncodingChangeException
     * @see #getEncoding
     */
    public void setEncoding (String encoding)
        throws ParserException
    {
        getLexer ().getPage ().setEncoding (encoding);
    }

    /**
     * Get the encoding for the page this parser is reading from.
     * This item is set from the HTTP header but may be overridden by meta
     * tags in the head, so this may change after the head has been parsed.
     * @return The encoding currently in force.
     * @see #setEncoding
     */
    public String getEncoding ()
    {
        return (getLexer ().getPage ().getEncoding ());
    }

    /**
     * Set the lexer for this parser.
     * The current NodeFactory is transferred to (set on) the given lexer,
     * since the lexer owns the node factory object.
     * It does not adjust the <code>feedback</code> object.
     * @param lexer The lexer object to use.
     * @see #setNodeFactory
     * @see #getLexer
     * @exception IllegalArgumentException if <code>lexer</code> is
     * <code>null</code>.
     */
    public void setLexer (Lexer lexer)
    {
        NodeFactory factory;
        String type;

        if (null == lexer)
            throw new IllegalArgumentException ("lexer cannot be null");
        // move a node factory that's been set to the new lexer
        factory = null;
        if (null != getLexer ())
            factory = getLexer ().getNodeFactory ();
        if (null != factory)
            lexer.setNodeFactory (factory);
        mLexer = lexer;
        // warn about content that's not likely text
        type = mLexer.getPage ().getContentType ();
        if (type != null && !type.startsWith ("text"))
            getFeedback ().warning (
                "URL " + mLexer.getPage ().getUrl ()
                + " does not contain text");
    }

    /**
     * Returns the lexer associated with the parser.
     * @return The current lexer.
     * @see #setLexer
     */
    public Lexer getLexer ()
    {
        return (mLexer);
    }

    /**
     * Get the current node factory.
     * @return The current lexer's node factory.
     * @see #setNodeFactory
     */
    public NodeFactory getNodeFactory ()
    {
        return (getLexer ().getNodeFactory ());
    }

    /**
     * Set the current node factory.
     * @param factory The new node factory for the current lexer.
     * @see #getNodeFactory
     * @exception IllegalArgumentException if <code>factory</code> is
     * <code>null</code>.
     */
    public void setNodeFactory (NodeFactory factory)
    {
        if (null == factory)
            throw new IllegalArgumentException ("node factory cannot be null");
        getLexer ().setNodeFactory (factory);
    }

    /**
     * Sets the feedback object used in scanning.
     * @param fb The new feedback object to use. If this is null a
     * {@link #DEVNULL silent feedback object} is used.
     * @see #getFeedback
     */
    public void setFeedback (ParserFeedback fb)
    {
        if (null == fb)
            mFeedback = DEVNULL;
        else
            mFeedback = fb;
    }

    /**
     * Returns the current feedback object.
     * @return The feedback object currently being used.
     * @see #setFeedback
     */
    public ParserFeedback getFeedback ()
    {
        return (mFeedback);
    }

    //
    // Public methods
    //

    /**
     * Reset the parser to start from the beginning again.
     * This assumes support for a reset from the underlying
     * {@link org.htmlparser.lexer.Source} object.
     * <p>This is cheaper (in terms of time) than resetting the URL, i.e.
     * <pre>
     * parser.setURL (parser.getURL ());
     * </pre>
     * because the page is not refetched from the internet.
     * <em>Note: the nodes returned on the second parse are new
     * nodes and not the same nodes returned on the first parse. If you
     * want the same nodes for re-use, collect them in a NodeList with
     * {@link #parse(NodeFilter) parse(null)} and operate on the NodeList.</em>
     */
    public void reset ()
    {
        getLexer ().reset ();
    }

    /**
     * Returns an iterator (enumeration) over the html nodes.
     * {@link org.htmlparser.nodes Nodes} can be of three main types:
     * <ul>
     * <li>{@link org.htmlparser.nodes.TagNode TagNode}</li>
     * <li>{@link org.htmlparser.nodes.TextNode TextNode}</li>
     * <li>{@link org.htmlparser.nodes.RemarkNode RemarkNode}</li>
     * </ul>
     * In general, when parsing with an iterator or processing a NodeList,
     * you will need to use recursion. For example:
     * <code>
     * <pre>
     * void processMyNodes (Node node)
     * {
     *     if (node instanceof TextNode)
     *     {
     *         // downcast to TextNode
     *         TextNode text = (TextNode)node;
     *         // do whatever processing you want with the text
     *         System.out.println (text.getText ());
     *     }
     *     else if (node instanceof RemarkNode)
     *     {
     *         // downcast to RemarkNode
     *         RemarkNode remark = (RemarkNode)node;
     *         // do whatever processing you want with the comment
     *     }
     *     else if (node instanceof TagNode)
     *     {
     *         // downcast to TagNode
     *         TagNode tag = (TagNode)node;
     *         // do whatever processing you want with the tag itself
     *         // ...
     *         // process recursively (nodes within nodes) via getChildren()
     *         NodeList nl = tag.getChildren ();
     *         if (null != nl)
     *             for (NodeIterator i = nl.elements (); i.hasMoreNodes (); )
     *                 processMyNodes (i.nextNode ());
     *     }
     * }
     *
     * Parser parser = new Parser ("http://www.yahoo.com");
     * for (NodeIterator i = parser.elements (); i.hasMoreNodes (); )
     *     processMyNodes (i.nextNode ());
     * </pre>
     * </code>
     * @throws ParserException If a parsing error occurs.
     * @return An iterator over the top level nodes (usually {@.html <html>}).
     */
    public NodeIterator elements ()
        throws ParserException
    {
        return (new IteratorImpl (getLexer (), getFeedback ()));
    }

    /**
     * Parse the given resource, using the filter provided.
     * This can be used to extract information from specific nodes.
     * When used with a <code>null</code> filter it returns an
     * entire page which can then be modified and converted back to HTML
     * (Note: the synthesis use-case is not handled very well; the parser
     * is more often used to extract information from a web page).
     * <p>For example, to replace the entire contents of the HEAD with a
     * single TITLE tag you could do this:
     * <pre>
     * NodeList nl = parser.parse (null); // here is your two node list
     * NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"));
     * if (heads.size () > 0) // there may not be a HEAD tag
     * {
     *     Head head = (Head)heads.elementAt (0); // there should be only one
     *     head.removeAll (); // clean out the contents
     *     Tag title = new TitleTag ();
     *     title.setTagName ("title");
     *     title.setChildren (new NodeList (new TextNode ("The New Title")));
     *     Tag title_end = new TitleTag ();
     *     title_end.setTagName ("/title");
     *     title.setEndTag (title_end);
     *     head.add (title);
     * }
     * System.out.println (nl.toHtml ()); // output the modified HTML
     * </pre>
     * @return The list of matching nodes (for a <code>null</code>
     * filter this is all the top level nodes).
     * @param filter The filter to apply to the parsed nodes,
     * or <code>null</code> to retrieve all the top level nodes.
     * @throws ParserException If a parsing error occurs.
     */
    public NodeList parse (NodeFilter filter)
        throws ParserException
    {
        NodeIterator e;
        Node node;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            if (null != filter)
                node.collectInto (ret, filter);
            else
                ret.add (node);
        }

        return (ret);
    }

    /**
     * Apply the given visitor to the current page.
     * The visitor is passed to the <code>accept()</code> method of each node
     * in the page in a depth first traversal. The visitor
     * <code>beginParsing()</code> method is called prior to processing the
     * page and <code>finishedParsing()</code> is called after the processing.
     * @param visitor The visitor to visit all nodes with.
     * @throws ParserException If a parse error occurs while traversing
     * the page with the visitor.
     */
    public void visitAllNodesWith (NodeVisitor visitor)
        throws ParserException
    {
        Node node;

        visitor.beginParsing ();
        for (NodeIterator e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            node.accept (visitor);
        }
        visitor.finishedParsing ();
    }

    /**
     * Initializes the parser with the given input HTML String.
     * @param inputHTML the input HTML that is to be parsed.
     * @throws ParserException If an error occurs in setting up the
     * underlying Lexer.
     * @exception IllegalArgumentException if <code>inputHTML</code> is
     * <code>null</code>.
     */
    public void setInputHTML (String inputHTML)
        throws ParserException
    {
        if (null == inputHTML)
            throw new IllegalArgumentException ("html cannot be null");
        if (!"".equals (inputHTML))
            setLexer (new Lexer (new Page (inputHTML)));
    }

    /**
     * Extract all nodes matching the given filter.
     * @see Node#collectInto(NodeList, NodeFilter)
     * @param filter The filter to be applied to the nodes.
     * @throws ParserException If a parse error occurs.
     * @return A list of nodes matching the filter criteria,
     * i.e. for which the filter's accept method returned <code>true</code>.
     */
    public NodeList extractAllNodesThatMatch (NodeFilter filter)
        throws ParserException
    {
        NodeIterator e;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
            e.nextNode ().collectInto (ret, filter);

        return (ret);
    }

    //
    // ConnectionMonitor interface
    //

    /**
     * Called just prior to calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the request header to the feedback object if any.
     * @param connection The connection which is about to be connected.
     * @throws ParserException <em>Not used.</em>
     * @see ConnectionMonitor#preConnect
     */
    public void preConnect (HttpURLConnection connection)
        throws ParserException
    {
        getFeedback ().info (HttpHeader.getRequestHeader (connection));
    }

    /**
     * Called just after calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the response header to the feedback object if any.
     * @param connection The connection that was just connected.
     * @throws ParserException <em>Not used.</em>
     * @see ConnectionMonitor#postConnect
     */
    public void postConnect (HttpURLConnection connection)
        throws ParserException
    {
        getFeedback ().info (HttpHeader.getResponseHeader (connection));
    }

    /**
     * The main program, which can be executed from the command line.
     * @param args A URL or file name to parse, and an optional tag name to be
     * used as a filter.
     */
    public static void main (String [] args)
    {
        Parser parser;
        NodeFilter filter;

        if (args.length < 1 || args[0].equals ("-help"))
        {
            System.out.println ("HTML Parser v" + getVersion () + "\n");
            System.out.println ();
            System.out.println ("Syntax : java -jar htmlparser.jar"
                + " <file/page> [type]");
            System.out.println ("   <file/page> the URL or file to be parsed");
            System.out.println ("   type the node type, for example:");
            System.out.println ("     A - Show only the link tags");
            System.out.println ("     IMG - Show only the image tags");
            System.out.println ("     TITLE - Show only the title tag");
            System.out.println ();
            System.out.println ("Example : java -jar htmlparser.jar"
                + " http://www.yahoo.com");
            System.out.println ();
        }
        else
            try
            {
                parser = new Parser ();
                if (1 < args.length)
                    filter = new TagNameFilter (args[1]);
                else
                {
                    filter = null;
                    // for a simple dump, use more verbose settings
                    parser.setFeedback (Parser.STDOUT);
                    getConnectionManager ().setMonitor (parser);
                }
                getConnectionManager ().setRedirectionProcessingEnabled (true);
                getConnectionManager ().setCookieProcessingEnabled (true);
                parser.setResource (args[0]);
                System.out.println (parser.parse (filter));
            }
            catch (ParserException e)
            {
                e.printStackTrace ();
            }
    }
}
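For reference, a minimal usage sketch of the API documented above (not part of parser.java): it opens a page, applies a TagNameFilter through parse(), and prints the target of each link. The class name LinkLister and the command-line argument handling are illustrative assumptions, and it presumes the htmlparser jar (org.htmlparser packages) is on the classpath.

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

// Hypothetical example class, not part of the library source above.
public class LinkLister
{
    public static void main (String[] args) throws ParserException
    {
        // args[0] is assumed to be a URL or file name, as in Parser.main() above
        Parser parser = new Parser (args[0]);
        // collect only the <A> tags; parse(null) would return all top level nodes
        NodeList links = parser.parse (new TagNameFilter ("A"));
        for (int i = 0; i < links.size (); i++)
        {
            Node node = links.elementAt (i);
            if (node instanceof LinkTag)
                System.out.println (((LinkTag)node).extractLink ());
        }
    }
}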