📄 parser.java
    {
        return (getLexer ().getPage ().getConnection ());
    }

    /**
     * Set the URL for this parser.
     * This method creates a new Lexer reading from the given URL.
     * Trying to set the url to null or an empty string is a no-op.
     * @param url The new URL for the parser.
     * @throws ParserException If the url is invalid, a problem occurs in
     * connecting, or creation of the underlying Lexer cannot be performed.
     * @see #getURL
     */
    public void setURL (String url)
        throws ParserException
    {
        if ((null != url) && !"".equals (url))
            setConnection (getConnectionManager ().openConnection (url));
    }

    /**
     * Return the current URL being parsed.
     * @return The current url. This is the URL for the current page.
     * A string passed into the constructor or set via setURL may be altered,
     * for example, a file name may be modified to be a URL.
     * @see Page#getUrl
     * @see #setURL
     */
    public String getURL ()
    {
        return (getLexer ().getPage ().getUrl ());
    }

    /**
     * Set the encoding for the page this parser is reading from.
     * @param encoding The new character set to use.
     * @throws ParserException If the encoding change causes characters that
     * have already been consumed to differ from the characters that would
     * have been seen had the new encoding been in force.
     * @see org.htmlparser.util.EncodingChangeException
     * @see #getEncoding
     */
    public void setEncoding (String encoding)
        throws ParserException
    {
        getLexer ().getPage ().setEncoding (encoding);
    }

    /**
     * Get the encoding for the page this parser is reading from.
     * This item is set from the HTTP header but may be overridden by meta
     * tags in the head, so this may change after the head has been parsed.
     * @return The encoding currently in force.
     * @see #setEncoding
     */
    public String getEncoding ()
    {
        return (getLexer ().getPage ().getEncoding ());
    }

    /**
     * Set the lexer for this parser.
     * The current NodeFactory is transferred to (set on) the given lexer,
     * since the lexer owns the node factory object.
     * It does not adjust the <code>feedback</code> object.
     * @param lexer The lexer object to use.
     * @see #setNodeFactory
     * @see #getLexer
     * @exception IllegalArgumentException if <code>lexer</code> is
     * <code>null</code>.
     */
    public void setLexer (Lexer lexer)
    {
        NodeFactory factory;
        String type;

        if (null == lexer)
            throw new IllegalArgumentException ("lexer cannot be null");
        // move a node factory that's been set to the new lexer
        factory = null;
        if (null != getLexer ())
            factory = getLexer ().getNodeFactory ();
        if (null != factory)
            lexer.setNodeFactory (factory);
        mLexer = lexer;
        // warn about content that's not likely text
        type = mLexer.getPage ().getContentType ();
        if (type != null && !type.startsWith ("text"))
            getFeedback ().warning (
                "URL " + mLexer.getPage ().getUrl ()
                + " does not contain text");
    }

    /**
     * Returns the lexer associated with the parser.
     * @return The current lexer.
     * @see #setLexer
     */
    public Lexer getLexer ()
    {
        return (mLexer);
    }

    /**
     * Get the current node factory.
     * @return The current lexer's node factory.
     * @see #setNodeFactory
     */
    public NodeFactory getNodeFactory ()
    {
        return (getLexer ().getNodeFactory ());
    }

    /**
     * Set the current node factory.
     * @param factory The new node factory for the current lexer.
     * @see #getNodeFactory
     * @exception IllegalArgumentException if <code>factory</code> is
     * <code>null</code>.
     */
    public void setNodeFactory (NodeFactory factory)
    {
        if (null == factory)
            throw new IllegalArgumentException ("node factory cannot be null");
        getLexer ().setNodeFactory (factory);
    }

    /**
     * Sets the feedback object used in scanning.
     * @param fb The new feedback object to use. If this is null a
     * {@link #DEVNULL silent feedback object} is used.
     * @see #getFeedback
     */
    public void setFeedback (ParserFeedback fb)
    {
        if (null == fb)
            mFeedback = DEVNULL;
        else
            mFeedback = fb;
    }

    /**
     * Returns the current feedback object.
     * @return The feedback object currently being used.
     * @see #setFeedback
     */
    public ParserFeedback getFeedback ()
    {
        return (mFeedback);
    }

    //
    // Public methods
    //

    /**
     * Reset the parser to start from the beginning again.
     * This assumes support for a reset from the underlying
     * {@link org.htmlparser.lexer.Source} object.
     * <p>This is cheaper (in terms of time) than resetting the URL, i.e.
     * <pre>
     * parser.setURL (parser.getURL ());
     * </pre>
     * because the page is not refetched from the internet.
     * <em>Note: the nodes returned on the second parse are new
     * nodes and not the same nodes returned on the first parse. If you
     * want the same nodes for re-use, collect them in a NodeList with
     * {@link #parse(NodeFilter) parse(null)} and operate on the NodeList.</em>
     */
    public void reset ()
    {
        getLexer ().reset ();
    }

    /**
     * Returns an iterator (enumeration) over the html nodes.
     * {@link org.htmlparser.nodes Nodes} can be of three main types:
     * <ul>
     * <li>{@link org.htmlparser.nodes.TagNode TagNode}</li>
     * <li>{@link org.htmlparser.nodes.TextNode TextNode}</li>
     * <li>{@link org.htmlparser.nodes.RemarkNode RemarkNode}</li>
     * </ul>
     * In general, when parsing with an iterator or processing a NodeList,
     * you will need to use recursion. For example:
     * <code>
     * <pre>
     * void processMyNodes (Node node)
     * {
     *     if (node instanceof TextNode)
     *     {
     *         // downcast to TextNode
     *         TextNode text = (TextNode)node;
     *         // do whatever processing you want with the text
     *         System.out.println (text.getText ());
     *     }
     *     else if (node instanceof RemarkNode)
     *     {
     *         // downcast to RemarkNode
     *         RemarkNode remark = (RemarkNode)node;
     *         // do whatever processing you want with the comment
     *     }
     *     else if (node instanceof TagNode)
     *     {
     *         // downcast to TagNode
     *         TagNode tag = (TagNode)node;
     *         // do whatever processing you want with the tag itself
     *         // ...
     *         // process recursively (nodes within nodes) via getChildren()
     *         NodeList nl = tag.getChildren ();
     *         if (null != nl)
     *             for (NodeIterator i = nl.elements (); i.hasMoreNodes (); )
     *                 processMyNodes (i.nextNode ());
     *     }
     * }
     *
     * Parser parser = new Parser ("http://www.yahoo.com");
     * for (NodeIterator i = parser.elements (); i.hasMoreNodes (); )
     *     processMyNodes (i.nextNode ());
     * </pre>
     * </code>
     * @throws ParserException If a parsing error occurs.
     * @return An iterator over the top level nodes (usually {@.html <html>}).
     */
    public NodeIterator elements ()
        throws ParserException
    {
        return (new IteratorImpl (getLexer (), getFeedback ()));
    }

    /**
     * Parse the given resource, using the filter provided.
     * This can be used to extract information from specific nodes.
     * When used with a <code>null</code> filter it returns an
     * entire page which can then be modified and converted back to HTML
     * (Note: the synthesis use-case is not handled very well; the parser
     * is more often used to extract information from a web page).
     * <p>For example, to replace the entire contents of the HEAD with a
     * single TITLE tag you could do this:
     * <pre>
     * NodeList nl = parser.parse (null); // here is your two node list
     * NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"));
     * if (heads.size () > 0) // there may not be a HEAD tag
     * {
     *     Head head = (Head)heads.elementAt (0); // there should be only one
     *     head.removeAll (); // clean out the contents
     *     Tag title = new TitleTag ();
     *     title.setTagName ("title");
     *     title.setChildren (new NodeList (new TextNode ("The New Title")));
     *     Tag title_end = new TitleTag ();
     *     title_end.setTagName ("/title");
     *     title.setEndTag (title_end);
     *     head.add (title);
     * }
     * System.out.println (nl.toHtml ()); // output the modified HTML
     * </pre>
     * @return The list of matching nodes (for a <code>null</code>
     * filter this is all the top level nodes).
     * @param filter The filter to apply to the parsed nodes,
     * or <code>null</code> to retrieve all the top level nodes.
     * @throws ParserException If a parsing error occurs.
     */
    public NodeList parse (NodeFilter filter)
        throws ParserException
    {
        NodeIterator e;
        Node node;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            if (null != filter)
                node.collectInto (ret, filter);
            else
                ret.add (node);
        }

        return (ret);
    }

    /**
     * Apply the given visitor to the current page.
     * The visitor is passed to the <code>accept()</code> method of each node
     * in the page in a depth first traversal. The visitor
     * <code>beginParsing()</code> method is called prior to processing the
     * page and <code>finishedParsing()</code> is called after the processing.
     * @param visitor The visitor to visit all nodes with.
     * @throws ParserException If a parse error occurs while traversing
     * the page with the visitor.
     */
    public void visitAllNodesWith (NodeVisitor visitor)
        throws ParserException
    {
        Node node;

        visitor.beginParsing ();
        for (NodeIterator e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            node.accept (visitor);
        }
        visitor.finishedParsing ();
    }

    /**
     * Initializes the parser with the given input HTML String.
     * @param inputHTML the input HTML that is to be parsed.
     * @throws ParserException If an error occurs in setting up the
     * underlying Lexer.
     * @exception IllegalArgumentException if <code>inputHTML</code> is
     * <code>null</code>.
     */
    public void setInputHTML (String inputHTML)
        throws ParserException
    {
        if (null == inputHTML)
            throw new IllegalArgumentException ("html cannot be null");
        if (!"".equals (inputHTML))
            setLexer (new Lexer (new Page (inputHTML)));
    }

    /**
     * Extract all nodes matching the given filter.
     * @see Node#collectInto(NodeList, NodeFilter)
     * @param filter The filter to be applied to the nodes.
     * @throws ParserException If a parse error occurs.
     * @return A list of nodes matching the filter criteria,
     * i.e. for which the filter's accept method returned <code>true</code>.
     */
    public NodeList extractAllNodesThatMatch (NodeFilter filter)
        throws ParserException
    {
        NodeIterator e;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
            e.nextNode ().collectInto (ret, filter);

        return (ret);
    }

    //
    // ConnectionMonitor interface
    //

    /**
     * Called just prior to calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the request header to the feedback object if any.
     * @param connection The connection which is about to be connected.
     * @throws ParserException <em>Not used.</em>
     * @see ConnectionMonitor#preConnect
     */
    public void preConnect (HttpURLConnection connection)
        throws ParserException
    {
        getFeedback ().info (HttpHeader.getRequestHeader (connection));
    }

    /**
     * Called just after calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the response header to the feedback object if any.
     * @param connection The connection that was just connected.
     * @throws ParserException <em>Not used.</em>
     * @see ConnectionMonitor#postConnect
     */
    public void postConnect (HttpURLConnection connection)
        throws ParserException
    {
        getFeedback ().info (HttpHeader.getResponseHeader (connection));
    }

    /**
     * The main program, which can be executed from the command line.
     * @param args A URL or file name to parse, and an optional tag name to be
     * used as a filter.
     */
    public static void main (String [] args)
    {
        Parser parser;
        NodeFilter filter;

        if (args.length < 1 || args[0].equals ("-help"))
        {
            System.out.println ("HTML Parser v" + getVersion () + "\n");
            System.out.println ();
            System.out.println ("Syntax : java -jar htmlparser.jar"
                + " <file/page> [type]");
            System.out.println ("   <file/page> the URL or file to be parsed");
            System.out.println ("   type the node type, for example:");
            System.out.println ("     A - Show only the link tags");
            System.out.println ("     IMG - Show only the image tags");
            System.out.println ("     TITLE - Show only the title tag");
            System.out.println ();
            System.out.println ("Example : java -jar htmlparser.jar"
                + " http://www.yahoo.com");
            System.out.println ();
        }
        else
            try
            {
                parser = new Parser ();
                if (1 < args.length)
                    filter = new TagNameFilter (args[1]);
                else
                {
                    filter = null;
                    // for a simple dump, use more verbose settings
                    parser.setFeedback (Parser.STDOUT);
                    getConnectionManager ().setMonitor (parser);
                }
                getConnectionManager ().setRedirectionProcessingEnabled (true);
                getConnectionManager ().setCookieProcessingEnabled (true);
                parser.setResource (args[0]);
                System.out.println (parser.parse (filter));
            }
            catch (ParserException e)
            {
                e.printStackTrace ();
            }
    }
}
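For reference, a minimal usage sketch of the API documented above (not part of parser.java): it opens a page, applies a TagNameFilter through parse(), and prints the target of each link. The class name LinkLister and the command-line argument handling are illustrative assumptions, and it presumes the htmlparser jar (org.htmlparser packages) is on the classpath.

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

// Hypothetical example class, not part of the library source above.
public class LinkLister
{
    public static void main (String[] args) throws ParserException
    {
        // args[0] is assumed to be a URL or file name, as in Parser.main() above
        Parser parser = new Parser (args[0]);
        // collect only the <A> tags; parse(null) would return all top level nodes
        NodeList links = parser.parse (new TagNameFilter ("A"));
        for (int i = 0; i < links.size (); i++)
        {
            Node node = links.elementAt (i);
            if (node instanceof LinkTag)
                System.out.println (((LinkTag)node).extractLink ());
        }
    }
}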