// File: parser.java
// (Code-viewer residue: a "Font size:" label followed here; it is not part of the program.)
package com.parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.util.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.io.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.util.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import java.io.*;
/**
 * Parser callback that extracts information from an HTML document.
 *
 * <p>While the HTML parser drives the callbacks, this class collects the
 * document title, the absolute URLs of all {@code <a>} and {@code <area>}
 * links, the link names (anchor text or {@code alt} text), and the text found
 * inside {@code <p>} elements. A {@code <base href>} tag replaces the base URL
 * used for resolving the links that follow it.
 */
public class Parser extends ParserCallback {
    /** URL against which relative links are resolved; replaced by a {@code <base href>} tag. */
    protected String base;
    /** True while inside an {@code <a>} element whose href resolved to a valid URL. */
    protected boolean isLink = false;
    /** True while inside a {@code <p>} element. */
    protected boolean isParagraph = false;
    /** True while inside the {@code <title>} element. */
    protected boolean isTitle = false;
    /** Not assigned anywhere in this class; kept for subclasses. */
    protected String htmlbody = "";
    /** Title text of the page. */
    protected String urlTitle = "";
    /** Absolute URLs of all links found on the page. */
    protected Vector<String> links = new Vector<String>();
    /** Names of the links: anchor text chunks and {@code alt} texts of areas. */
    protected Vector<String> linkname = new Vector<String>();
    /** Concatenated text of all paragraphs on the page. */
    protected String paragraphText = "";
    /** Link names and paragraph text, concatenated in document order. */
    protected String linkandparagraph = "";
    /** Not assigned anywhere in this class; returned as-is by {@link #getEncode()}. */
    protected String encode = "";

    /**
     * Creates a parser callback.
     *
     * @param baseurl URL of the page being parsed, used to resolve relative links
     */
    public Parser(String baseurl) {
        base = baseurl;
    }

    /**
     * Returns the value of the {@code encode} field.
     *
     * @return the encoding string (empty unless a subclass assigns it)
     */
    public String getEncode() {
        return encode;
    }

    /**
     * Returns the title of the page.
     *
     * @return the most recent text chunk seen inside {@code <title>}
     */
    public String getURLtitle() {
        return urlTitle;
    }

    /**
     * Returns all links of the page.
     *
     * @return the live vector of absolute link URLs, in document order
     */
    public Vector<String> getLinks() {
        return links;
    }

    /**
     * Returns the names of the links of the page.
     *
     * @return the live vector of link names, in document order
     */
    public Vector<String> getLinkName() {
        return linkname;
    }

    /**
     * Returns the body text of the page.
     *
     * @return the concatenated text of all {@code <p>} elements
     */
    public String getParagraphText() {
        return paragraphText;
    }

    /**
     * Handles an end tag by leaving the corresponding text-collecting state.
     *
     * @param t   the tag that was closed
     * @param pos position of the tag in the document
     */
    @Override
    public void handleEndTag(HTML.Tag t, int pos) {
        if (t == HTML.Tag.A) {
            isLink = false;
        } else if (t == HTML.Tag.P) {
            isParagraph = false;
        } else if (t == HTML.Tag.TITLE) {
            isTitle = false;
        }
        // AREA is deliberately not handled here: it never opens link-text
        // state, so closing it must not cancel the state of an enclosing <a>.
    }

    /**
     * Handles a simple (empty) tag such as {@code <area>} or {@code <base>}
     * with the same logic as a start tag.
     *
     * @param t   the tag
     * @param a   attributes of the tag
     * @param pos position of the tag in the document
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        handleStartTag(t, a, pos);
    }

    /**
     * Handles a start tag: records links, tracks title/paragraph state and
     * picks up a new base URL from {@code <base href>}.
     *
     * @param t   the tag that was opened
     * @param a   attributes of the tag
     * @param pos position of the tag in the document
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        if (t == HTML.Tag.A) {
            // Only a successfully resolved link turns on link-text collection.
            if (addLink(stringAttribute(a, HTML.Attribute.HREF))) {
                isLink = true;
            }
        } else if (t == HTML.Tag.AREA) {
            if (addLink(stringAttribute(a, HTML.Attribute.HREF))) {
                String alt = stringAttribute(a, HTML.Attribute.ALT);
                if (alt != null) {
                    linkname.addElement(alt);
                    linkandparagraph += alt;
                }
                // isLink is NOT set here: <area> is an empty element, so no
                // end tag would ever reset the flag and all following text
                // would wrongly be recorded as link names.
            }
        } else if (t == HTML.Tag.TITLE) {
            isTitle = true;
        } else if (t == HTML.Tag.P) {
            isParagraph = true;
        } else if (t == HTML.Tag.BASE) {
            String href = stringAttribute(a, HTML.Attribute.HREF);
            if (href != null) {
                base = href;
            }
        }
    }

    /**
     * Handles a chunk of text, routing it to the link names, the title and/or
     * the paragraph text depending on the elements currently open.
     *
     * @param data the text
     * @param pos  position of the text in the document
     */
    @Override
    public void handleText(char[] data, int pos) {
        String text = new String(data);
        if (isLink) {
            linkname.addElement(text);
            linkandparagraph += text;
        }
        if (isTitle) {
            urlTitle = text;
        }
        // The null guard protects against a subclass clearing the field.
        if (isParagraph && paragraphText != null) {
            paragraphText += text;
            linkandparagraph += text;
        }
    }

    /** Returns the attribute value as a string, or null if absent or not a string. */
    private static String stringAttribute(MutableAttributeSet attributes, Object key) {
        Object value = attributes.getAttribute(key);
        return (value instanceof String) ? (String) value : null;
    }

    /** Resolves href against the base URL and records it; returns false if it is null or malformed. */
    private boolean addLink(String href) {
        if (href == null) {
            return false;
        }
        try {
            URL url = new URL(new URL(base), href);
            links.addElement(url.toString());
            return true;
        } catch (MalformedURLException ignored) {
            // Best effort: links that cannot be resolved are skipped.
            return false;
        }
    }
}
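// The lines below are residue from the web code viewer this file was copied
// from (its keyboard-shortcut help panel); they are not part of the program.
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -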