⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webpagextractor.java

📁 网络实验参考资料,希望对大家有点用,也是希望资源共享啊
💻 JAVA
字号:
package bplatt.spider;/** * WebPageXtractor - extracts information from a WebPage * passed as an input stream.  Makes use of SimpleHTMLParser * object.  Used to use HTMLEditorKit and HTMLEditorKit.Parser.  * This turned out to be too buggy for this application. * Cannot use XML parser as HTML does not follow stricter XML * syntax rules.  In fact many Web pages are a "tag salad" that * don't even follow proper HTML syntax.  WebPageXtractor parses * a page and extracts links, images, and title(s). *  * Copyright 2002, Robert L. Platt, All rights reserved * @author Robert L. Platt  *  */  import java.io.*; import java.util.*;  public class WebPageXtractor extends SimpleHTMLParser { 	private ArrayList links;	private ArrayList images;	private ArrayList title;	private boolean inTitle;		/** Constructor */	public WebPageXtractor() {		super();		links = new ArrayList();		images = new ArrayList();		title = new ArrayList();	}		/**	 * If we're within TITLE tags - save the title	 * @see SimpleHTMLParser#processContent(SimpleHTMLToken)	 */	public void processContent(SimpleHTMLToken token) {		String s = token.getContent().trim();		if (s != null && s.length() != 0) {		 	if (inTitle) title.add(s);		}	}	/**	 * Look for </title> tags	 * @see SimpleHTMLParser#processEndTag(SimpleHTMLToken)	 */	public void processEndTag(SimpleHTMLToken token) throws IOException	{		String tag = SimpleHTMLParser.getTagType(token,true);		if (tag == null) throw new IOException("HTML parsing error");		else if (tag.equals("title")) inTitle = false;	}	/**	 * Handle Anchor, Image, Frame, and Title tags	 * @see SimpleHTMLParser#processTag(SimpleHTMLToken)	 */	public void processTag(SimpleHTMLToken token) throws IOException	{		String tag = SimpleHTMLParser.getTagType(token,true);		if (tag == null) throw new IOException("HTML parsing error");		else if (tag.equals("a")) {			String link = extractHref(token.getContent());			if (link != null) links.add(link);		}		else if (tag.equals("img")) {			String image = extractSrc(token.getContent());			if (image != null) images.add(image);		}		else if (tag.equals("frame")) {			String link = extractSrc(token.getContent());			if (link != null) links.add(link);		}		else if (tag.equals("title")) inTitle = true;	}			// Utility method for extracting href attribute	private String extractHref(String tag)	{		String delims="\t\r\f\n \'\"=";		StringTokenizer tt = new StringTokenizer(tag,delims);		while(tt.hasMoreElements()) {			String s = tt.nextToken();			if (s.equalsIgnoreCase("href")) {				if (!tt.hasMoreElements()) return(null);				else return(tt.nextToken());			}		}		return(null);	}		// Utility method for extracting src attribute	private String extractSrc(String tag)	{		String delims="\t\r\f\n \'\"=";		StringTokenizer tt = new StringTokenizer(tag,delims);		while(tt.hasMoreElements()) {			String s = tt.nextToken();			if (s.equalsIgnoreCase("src")) {				if (!tt.hasMoreElements()) return(null);				else return(tt.nextToken());			}		}		return(null);	}	/**	 * Returns the images.	 * @return ArrayList	 */	public ArrayList getImages() {		return images;	}	/**	 * Returns the links.	 * @return ArrayList	 */	public ArrayList getLinks() {		return links;	}	/**	 * Returns the title.	 * @return ArrayList	 */	public ArrayList getTitle() {		return title;	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -