// File: procurl.java
// (code-viewer header residue: "Font size:" control label)
package procURL;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.Collection;
import java.net.MalformedURLException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Collects board-page and topic-page URLs of a forum by scanning the links of
 * downloaded HTML pages.
 *
 * <p>Links found by {@link #processURL(URL)} are filtered in one of two modes,
 * selected by an internal flag. Until {@link #getBlockPageURL()} has been
 * called, links are inspected for board page numbers; afterwards they are
 * inspected for topic ids. {@link #getTopicPageURL()} turns the collected
 * topic ids into concrete topic page URLs.
 *
 * <p>The class keeps plain mutable state and does no synchronization.
 */
public class ProcURL {

    /** Prefix of every generated topic URL; topic id and page number are appended. */
    private static final String TOPIC_URL_PREFIX = "http://bbs.breezecn.com/read.php?tid=";

    /**
     * Board-page URL up to and including its last '=', i.e. everything except
     * the page number. Set while board page links are being filtered.
     */
    private String urlhead;

    /**
     * Filtering mode flag. {@code true} (the initial value) means links are
     * still filtered as board page links; {@link #getBlockPageURL()} sets it to
     * {@code false}, after which links are filtered as topic links.
     */
    private boolean blockListFlag = true;

    /**
     * Topic id whose page numbers are currently accumulated in
     * {@link #topicPageNumList}, or {@code null} when nothing is pending.
     */
    private Integer pendingTid;

    /** Board page URLs waiting to be processed. */
    protected Collection<URL> blockList = new ArrayList<URL>(3);

    /** Topic page URLs waiting to be processed. */
    protected Collection<URL> topicList = new ArrayList<URL>(3);

    /** Page numbers found in board page links. */
    protected Collection<Integer> blockPageNumList = new ArrayList<Integer>(3);

    /** Page numbers seen so far for the multi-page topic currently being scanned. */
    protected Collection<Integer> topicPageNumList = new ArrayList<Integer>(3);

    /** Topic ids (tid) of multi-page topics, in the order they were first seen. */
    protected Collection<Integer> tidList = new ArrayList<Integer>(3);

    /** Topic ids (tid) of links that carry no page parameter. */
    protected Collection<Integer> onePageList = new ArrayList<Integer>(3);

    /** Highest page number per multi-page topic, in the same order as {@link #tidList}. */
    protected Collection<Object> maxPageList = new ArrayList<Object>(100);

    /**
     * Returns the board page URLs waiting to be processed.
     *
     * @return the live, mutable board URL collection
     */
    public Collection<URL> getBlockList() {
        return blockList;
    }

    /**
     * Returns the topic page URLs waiting to be processed.
     *
     * @return the live, mutable topic URL collection
     */
    public Collection<URL> getTopicList() {
        return topicList;
    }

    /**
     * Returns the ids of the multi-page topics found so far.
     *
     * @return the live, mutable tid collection
     */
    public Collection<Integer> getTidList() {
        return tidList;
    }

    /**
     * Returns the ids of topics whose links carried no page parameter.
     *
     * @return the live, mutable single-page tid collection
     */
    public Collection<Integer> getOnePageList() {
        return onePageList;
    }

    /**
     * Returns the highest page number recorded for each multi-page topic.
     *
     * @return the live, mutable collection, ordered like {@link #getTidList()}
     */
    public Collection<Object> getMaxPageList() {
        return maxPageList;
    }

    /**
     * Returns the page numbers found in board page links.
     *
     * @return the live, mutable board page number collection
     */
    public Collection<Integer> getBlockPageNumList() {
        return blockPageNumList;
    }

    /**
     * Returns the page numbers accumulated for the topic currently being scanned.
     *
     * @return the live, mutable topic page number collection
     */
    public Collection<Integer> getTopicPageNumList() {
        return topicPageNumList;
    }

    /**
     * Downloads {@code url} and feeds every link of the page to the link filters.
     *
     * <p>Content whose reported type is not {@code text/...} is skipped. I/O
     * failures are reported on standard output and otherwise ignored, so one
     * unreachable page does not stop a crawl.
     *
     * @param url the page to download and scan
     */
    public void processURL(URL url) {
        InputStream in = null;
        try {
            URLConnection connection = url.openConnection();
            System.out.println("Processing: " + url);

            String contentType = connection.getContentType();
            if (contentType != null && !contentType.toLowerCase().startsWith("text/")) {
                System.out.println("Not processing because content type is: " + contentType);
                return;
            }

            in = connection.getInputStream();
            Reader reader = new InputStreamReader(in);
            // HTMLParse is not defined in this file; presumably it exposes
            // HTMLEditorKit's parser - confirm against its source.
            HTMLEditorKit.Parser parser = new HTMLParse().getParser();
            parser.parse(reader, new Parser(url), true);
        } catch (IOException e) {
            System.out.println("Error: " + url);
        } finally {
            // The stream was previously left open on every call.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Parser callback that extracts link targets from tags and routes them to
     * the board or topic filter, depending on the current mode.
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {

        /** URL of the page being parsed; relative links are resolved against it. */
        protected URL base;

        /**
         * Creates a callback for one page.
         *
         * @param base URL of the page being parsed
         */
        public Parser(URL base) {
            this.base = base;
        }

        /**
         * Extracts the {@code href} of a tag (or the {@code src} of a frame),
         * strips any fragment and hands the link to {@link #handleLink}.
         *
         * @param t   the tag
         * @param a   the tag's attributes
         * @param pos position in the document (unused)
         */
        @Override
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);
            if (href == null && t == HTML.Tag.FRAME) {
                href = (String) a.getAttribute(HTML.Attribute.SRC);
            }
            if (href == null) {
                return;
            }
            int fragment = href.indexOf('#');
            if (fragment != -1) {
                href = href.substring(0, fragment);
            }
            handleLink(base, href);
        }

        /**
         * Start tags are handled exactly like simple tags.
         *
         * @param t   the tag
         * @param a   the tag's attributes
         * @param pos position in the document (unused)
         */
        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos);
        }

        /**
         * Resolves {@code str} against {@code base} and passes the result to the
         * filter of the current mode. Malformed links are reported and skipped.
         *
         * @param base URL of the containing page
         * @param str  link text as found in the page
         */
        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                System.out.println("URL" + url);
                // Topic links are only collected once the board page list has
                // been completed (see getBlockPageURL).
                if (blockListFlag) {
                    filterBlockPageURL(url);
                } else {
                    filterTopicPageURL(url);
                }
            } catch (MalformedURLException e) {
                System.out.println("Found malformed URL: " + str);
            }
        }

        /**
         * Records the page number of a board page link such as
         * {@code http://bbs.breezecn.com/thread.php?fid=5&search=&page=1}.
         *
         * <p>The text after the last '=' is taken as the page. The value
         * {@code e} only updates the URL head; any other non-numeric value
         * makes the link be ignored instead of aborting the parse.
         *
         * @param url the resolved link
         */
        public void filterBlockPageURL(URL url) {
            String link = url.toString();
            if (!link.contains("fid") || !link.contains("&search=&page")) {
                return;
            }
            // "&search=&page" contains '=', so lastEquals is never -1 here.
            int lastEquals = link.lastIndexOf('=');
            String pageText = link.substring(lastEquals + 1);

            boolean lastPageMarker = pageText.equals("e");
            Integer page = lastPageMarker ? null : parseIntOrNull(pageText);
            if (!lastPageMarker && page == null) {
                return;
            }
            urlhead = link.substring(0, lastEquals + 1);
            if (page != null) {
                getBlockPageNumList().add(page);
            }
        }

        /**
         * Records the topic id of a topic link, and for multi-page topics the
         * page numbers that belong to it.
         *
         * <p>Recognised shapes (parameter order does not matter):
         * <ul>
         * <li>{@code read.php?tid=112735&page=3&fpage=1} - multi-page topic</li>
         * <li>{@code read.php?tid=112735} or {@code read.php?tid=141834&fpage=2}
         *     - no page parameter, recorded in the single-page list</li>
         * </ul>
         * Links without a numeric {@code tid} parameter, and links whose page is
         * {@code e} or otherwise non-numeric, are ignored.
         *
         * @param url the resolved link
         */
        public void filterTopicPageURL(URL url) {
            String link = url.toString();
            Integer tid = parseIntOrNull(queryParam(link, "tid"));
            if (tid == null) {
                return;
            }

            String pageText = queryParam(link, "page");
            if (pageText == null) {
                getOnePageList().add(tid);
                return;
            }
            Integer page = parseIntOrNull(pageText);
            if (page == null) {
                return; // covers the "page=e" last-page marker
            }

            if (getTidList().contains(tid)) {
                // Only the topic currently being scanned may grow its page
                // list; a page for an already finalized topic would otherwise
                // be credited to the wrong topic.
                if (tid.equals(pendingTid)) {
                    getTopicPageNumList().add(page);
                }
                return;
            }

            // New topic: finalize the previous one, then start collecting.
            // The first page is recorded too, so a topic with a single paged
            // link still gets its own entry in maxPageList.
            flushPendingMaxPage();
            getTidList().add(tid);
            getTopicPageNumList().add(page);
            pendingTid = tid;
        }
    }

    /**
     * Builds the board page URLs for pages 1 up to the highest recorded page
     * number, adds them to the board list, and switches link filtering to
     * topic mode.
     *
     * <p>If no page number was recorded, no URL is added; the mode switch
     * still happens.
     */
    public void getBlockPageURL() {
        // Highest value rather than the last one, so a trailing "next page"
        // link cannot shrink the range.
        Integer highest = largest(getBlockPageNumList());
        int maxPage = (highest == null) ? 0 : highest.intValue();

        for (int page = 1; page <= maxPage; page++) {
            String blockURL = urlhead + page;
            try {
                getBlockList().add(new URL(blockURL));
            } catch (MalformedURLException e) {
                System.out.println("Found malformed URL: " + blockURL);
            }
        }
        blockListFlag = false;
    }

    /**
     * Builds the topic page URLs from the collected topic ids and adds them to
     * the topic list.
     *
     * <p>Topics from the single-page list get page 1, unless the same id is
     * also in the multi-page list. Multi-page topics get pages 1 up to their
     * recorded maximum.
     */
    public void getTopicPageURL() {
        for (Object tid : getOnePageList()) {
            if (!getTidList().contains(tid)) {
                addTopicPage(tid, 1);
            }
        }

        // Finalize the topic that was still being scanned; this also empties
        // the pending list so a repeated call does not add the entry twice.
        flushPendingMaxPage();

        Object[] maxPages = getMaxPageList().toArray();
        int position = 0;
        for (Object tid : getTidList()) {
            if (position >= maxPages.length) {
                break; // no maximum recorded for the remaining ids
            }
            Integer maxPage = parseIntOrNull(String.valueOf(maxPages[position++]));
            if (maxPage == null) {
                continue;
            }
            for (int page = 1; page <= maxPage.intValue(); page++) {
                addTopicPage(tid, page);
            }
        }
    }

    /**
     * Adds {@code url} to the topic list unless it is already present.
     *
     * <p>NOTE(review): the duplicate check relies on {@code URL.equals}, which
     * per the JDK documentation may resolve host names.
     *
     * @param url the topic page URL to add
     */
    public void addTopicURL(URL url) {
        if (getTopicList().contains(url)) {
            return;
        }
        getTopicList().add(url);
    }

    /**
     * Empties the topic id list, the pending page list, the max page list and
     * the single-page list.
     */
    public void clear() {
        getTidList().clear();
        getTopicPageNumList().clear();
        getMaxPageList().clear();
        getOnePageList().clear();
        pendingTid = null;
    }

    /** Moves the highest pending page number into maxPageList and resets the pending state. */
    private void flushPendingMaxPage() {
        Integer highest = largest(getTopicPageNumList());
        if (highest != null) {
            getMaxPageList().add(highest);
            getTopicPageNumList().clear();
        }
        pendingTid = null;
    }

    /** Builds the URL of one topic page and adds it to the topic list. */
    private void addTopicPage(Object tid, int page) {
        String address = TOPIC_URL_PREFIX + tid + "&page=" + page;
        try {
            addTopicURL(new URL(address));
        } catch (MalformedURLException e) {
            System.out.println("Found malformed URL: " + address);
        }
    }

    /** Returns the largest non-null element, or {@code null} if there is none. */
    private static Integer largest(Collection<Integer> numbers) {
        Integer best = null;
        for (Integer number : numbers) {
            if (number != null && (best == null || number.intValue() > best.intValue())) {
                best = number;
            }
        }
        return best;
    }

    /** Parses {@code text} as a decimal int; returns {@code null} if it is null or not a number. */
    private static Integer parseIntOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Returns the value of query parameter {@code name} in {@code link}, an
     * empty string if the parameter has no '=', or {@code null} if the link
     * has no such parameter.
     */
    private static String queryParam(String link, String name) {
        int queryStart = link.indexOf('?');
        if (queryStart < 0) {
            return null;
        }
        String[] pairs = link.substring(queryStart + 1).split("&");
        for (String pair : pairs) {
            int equals = pair.indexOf('=');
            String key = (equals < 0) ? pair : pair.substring(0, equals);
            if (key.equals(name)) {
                return (equals < 0) ? "" : pair.substring(equals + 1);
            }
        }
        return null;
    }
}
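// Code-viewer page residue (keyboard-shortcut help), not part of the program:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Switch theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -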