⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parser.java

📁 一个主题相关的网络爬虫,实现与某一主题相关的网页的爬取
💻 JAVA
字号:
package com.parser;

import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.util.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.io.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.util.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.io.*;

//继承ParserCallback的类,用来解析网页文档
/**
 * {@link ParserCallback} that collects the title, the outgoing links, the
 * link names and the paragraph text of one HTML document while it is parsed.
 *
 * <p>Links are appended to {@link #links} when an {@code A} or {@code AREA}
 * tag with an {@code href} is seen; link names are appended to
 * {@link #linkname} once per text chunk reported inside an anchor (or once
 * per {@code AREA alt} value). The two vectors are therefore not guaranteed
 * to be index-aligned.
 *
 * <p>The class keeps plain mutable state and does no synchronization of its
 * own beyond what {@link Vector} provides.
 */
public class Parser extends ParserCallback {
	/** URL that relative hrefs are resolved against; replaced by the href of a BASE tag. */
	protected String base;

	/** True while text events belong to an open anchor whose href was resolved. */
	protected boolean isLink = false;

	/** True while text events belong to an open P element. */
	protected boolean isParagraph = false;

	/** True while text events belong to an open TITLE element. */
	protected boolean isTitle = false;

	/** Not written by this class; kept for subclasses. */
	protected String htmlbody = "";

	/** Most recent text chunk reported inside TITLE. */
	protected String urlTitle = "";

	/** Absolute URLs of all links found so far, in document order. */
	protected Vector<String> links = new Vector<String>();

	/** Anchor text chunks and AREA alt texts found so far, in document order. */
	protected Vector<String> linkname = new Vector<String>();

	/** Concatenation of all text chunks reported inside P elements. */
	protected String paragraphText = "";

	/** Concatenation of link names and paragraph text in the order they were reported. */
	protected String linkandparagraph = "";

	/** Not written by this class; returned as-is by {@link #getEncode()}. */
	protected String encode = "";

	/**
	 * Creates a parser callback.
	 *
	 * @param baseurl URL used to resolve relative hrefs until a BASE tag overrides it
	 */
	public Parser(String baseurl) {
		base = baseurl;
	}

	/** @return the value of the {@code encode} field (empty unless a subclass sets it) */
	public String getEncode() {
		return encode;
	}

	/** @return the last text chunk seen inside the TITLE element, or "" if none */
	public String getURLtitle() {
		return urlTitle;
	}

	/** @return the live vector of absolute link URLs collected so far */
	public Vector<String> getLinks() {
		return links;
	}

	/** @return the live vector of link names (anchor text chunks and AREA alt texts) */
	public Vector<String> getLinkName() {
		return linkname;
	}

	/** @return all text collected inside P elements, concatenated */
	public String getParagraphText() {
		return paragraphText;
	}

	/**
	 * Handles a closing tag by leaving the corresponding capture state.
	 *
	 * @param t   the tag being closed
	 * @param pos position in the document (unused)
	 */
	@Override
	public void handleEndTag(HTML.Tag t, int pos) {
		if (t == HTML.Tag.A) {
			isLink = false;
		} else if (t == HTML.Tag.P) {
			isParagraph = false;
		} else if (t == HTML.Tag.TITLE) {
			isTitle = false;
		}
		// AREA needs no handling here: it never opens a link-text capture
		// (see handleStartTag), so there is nothing to close.
	}

	/**
	 * Handles a tag without a separate end tag (for example BASE or AREA) by
	 * treating it like a start tag.
	 *
	 * @param t   the tag
	 * @param a   the tag's attributes
	 * @param pos position in the document (unused)
	 */
	@Override
	public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
		handleStartTag(t, a, pos);
	}

	/**
	 * Handles an opening tag: records links for A and AREA, enters title or
	 * paragraph capture for TITLE and P, and updates the base URL for BASE.
	 *
	 * @param t   the tag being opened
	 * @param a   the tag's attributes
	 * @param pos position in the document (unused)
	 */
	@Override
	public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
		if (t == HTML.Tag.A) {
			String href = stringAttribute(a, HTML.Attribute.HREF);
			if (href != null && addLink(href)) {
				// Only capture anchor text for links that were actually recorded.
				isLink = true;
			}
		} else if (t == HTML.Tag.AREA) {
			String href = stringAttribute(a, HTML.Attribute.HREF);
			if (href != null && addLink(href)) {
				String alt = stringAttribute(a, HTML.Attribute.ALT);
				if (alt != null) {
					linkname.addElement(alt);
					linkandparagraph += alt;
				}
				// Deliberately do NOT set isLink here: AREA is an empty element,
				// so no end tag follows to reset the flag, and every later text
				// chunk would otherwise be recorded as a link name.
			}
		} else if (t == HTML.Tag.TITLE) {
			isTitle = true;
		} else if (t == HTML.Tag.P) {
			isParagraph = true;
		} else if (t == HTML.Tag.BASE) {
			String href = stringAttribute(a, HTML.Attribute.HREF);
			if (href != null) {
				base = href;
			}
		}
	}

	/**
	 * Handles a text chunk by routing it to whichever captures are active
	 * (link name, title, paragraph). More than one may be active at once.
	 *
	 * @param data the characters of the text chunk
	 * @param pos  position in the document (unused)
	 */
	@Override
	public void handleText(char[] data, int pos) {
		String text = new String(data);
		if (isLink) {
			linkname.addElement(text);
			linkandparagraph += text;
		}
		if (isTitle) {
			urlTitle = text;
		}
		// paragraphText is protected, so a subclass may have set it to null;
		// in that case paragraph capture is skipped rather than producing "null...".
		if (isParagraph && paragraphText != null) {
			paragraphText += text;
			linkandparagraph += text;
		}
	}

	/**
	 * Resolves {@code href} against the current base URL and appends the
	 * result to {@link #links}.
	 *
	 * @return true if the link was recorded, false if it could not be resolved
	 */
	private boolean addLink(String href) {
		try {
			URL resolved = new URL(new URL(base), href);
			links.addElement(resolved.toString());
			return true;
		} catch (MalformedURLException ignored) {
			// Best effort: a link (or base) that is not a valid URL is skipped
			// so that one bad href does not abort parsing of the whole page.
			return false;
		}
	}

	/** Returns the attribute value if present and a String, otherwise null. */
	private static String stringAttribute(MutableAttributeSet attrs, HTML.Attribute key) {
		if (attrs == null) {
			return null;
		}
		Object value = attrs.getAttribute(key);
		return (value instanceof String) ? (String) value : null;
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -