⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 makeupurl.java

📁 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫
💻 JAVA
字号:
/* 
 * FileName MakeUpUrl.java
 * Create Time 2006-5-17 17:06:05
 * Author shiwei
 * Description Sorts the URLs found on the current page into categories
 * Version 
 */

package com.snoics.reptile.regex.url.impl;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.snoics.reptile.link.createUrl.BuildUrl;
import com.snoics.reptile.link.createUrl.IBuildUrl;
import com.snoics.reptile.regex.filter.DownloadUrlFilter;
import com.snoics.reptile.regex.filter.ForbidUrlFilter;
import com.snoics.reptile.regex.filter.IRegexFilter;
import com.snoics.reptile.regex.filter.RangeUrlFilter;
import com.snoics.reptile.regex.filter.RemoteUrlFilter;
import com.snoics.reptile.regex.filter.UnDownloadUrlFilter;
import com.snoics.reptile.regex.url.IFilterAllUrl;
import com.snoics.reptile.regex.url.IMakeUpUrl;
import com.snoics.reptile.util.UrlUtil;

/**
 * Sorts the URLs found on one page into category lists (download, remote,
 * un-download, forbid, range) by running each URL through a fixed sequence
 * of {@link IRegexFilter} implementations.
 *
 * Typical use: call {@link #setParentUrl(String)} and
 * {@link #setAllHtmlUrl(Set)}, then {@link #makeUp()}, then read the result
 * lists through the getters.
 *
 * Notes visible from this class:
 * - The result lists are created once per instance and are never cleared, so
 *   calling {@link #makeUp()} more than once on the same instance appends to
 *   the earlier results.
 * - The getters return the internal lists themselves, not copies.
 * - htmlString and urlRegexList are only stored and returned here;
 *   {@link #makeUp()} does not read them.
 */
public class MakeUpUrl implements IMakeUpUrl{
	/** URL of the page being processed; handed to buildUrl together with each link. */
	private String parentUrl="";

	/** All URLs of the current page; elements are cast to String in makeUp(). */
	private Set allHtmlUrl=null;
	private List remoteUrl=new ArrayList();
	private List rangeUrl=new ArrayList();
	private List forbidUrl=new ArrayList();
	private List downloadUrl=new ArrayList();
	private List unDownloadUrl=new ArrayList();
	private List urlRegexList=new ArrayList();
	private String htmlString="";
	private IBuildUrl buildUrl=new BuildUrl();

	/**
	 * Returns the URL of the page whose links are being sorted.
	 * @return String
	 */
	public String getParentUrl() {
		return parentUrl;
	}

	/**
	 * Sets the URL of the page whose links are being sorted.
	 * @param parentUrl
	 */
	public void setParentUrl(String parentUrl) {
		this.parentUrl = parentUrl;
	}

	/**
	 * Returns the stored HTML text.
	 * @return String
	 */
	public String getHtmlString() {
		return htmlString;
	}

	/**
	 * Stores the HTML text.
	 * @param htmlString
	 */
	public void setHtmlString(String htmlString) {
		this.htmlString = htmlString;
	}

	/**
	 * Sets all URLs of the current page.
	 * @param allHtmlUrl
	 */
	public void setAllHtmlUrl(Set allHtmlUrl) {
		this.allHtmlUrl=allHtmlUrl;
	}

	/**
	 * Returns all URLs of the current page.
	 * @return Set
	 */
	public Set getAllHtmlUrl() {
		return allHtmlUrl;
	}

	/**
	 * Returns all remote URLs.
	 * @return List
	 */
	public List getRemoteUrl() {
		return remoteUrl;
	}

	/**
	 * Returns the URLs that lie inside the parsing range.
	 * @return List
	 */
	public List getRangeUrl() {
		return rangeUrl;
	}

	/**
	 * Returns the URLs that lie inside the parsing range but must not be fetched.
	 * @return List
	 */
	public List getForbidUrl() {
		return forbidUrl;
	}

	/**
	 * Returns the binary-file URLs that will not be fetched.
	 * @return List
	 */
	public List getUnDownloadUrl() {
		return unDownloadUrl;
	}

	/**
	 * Returns the binary-file URLs that will be fetched to local storage.
	 * @return List
	 */
	public List getDownloadUrl() {
		return downloadUrl;
	}

	/**
	 * Returns the regular expressions used to parse the current page.
	 * @return List
	 */
	public List getUrlRegexList(){
		return urlRegexList;
	}

	/**
	 * Sets the regular expressions used to parse the current page.
	 * @param urlRegexList
	 */
	public void setUrlRegexList(List urlRegexList){
		this.urlRegexList=urlRegexList;
	}

	/**
	 * Sorts every URL of the current page into at most one result list.
	 *
	 * Each URL is first resolved against the parent URL; the filters are
	 * evaluated on the resolved form, but the ORIGINAL (unresolved) URL is
	 * what gets stored in the result list. A URL that no filter accepts is
	 * not stored anywhere. Does nothing when no URL set has been supplied.
	 */
	public void makeUp() {
		if(allHtmlUrl==null) {
			return;
		}
		for(Iterator iterator=allHtmlUrl.iterator();iterator.hasNext();) {
			String rawUrl=(String)iterator.next();
			String resolvedUrl=buildUrl.buildResolvedUrl(rawUrl,parentUrl);
			List bucket=selectBucket(resolvedUrl);
			if(bucket!=null) {
				bucket.add(rawUrl);
			}
		}
	}

	/**
	 * Picks the result list for a resolved URL, or null when no filter accepts it.
	 *
	 * The first accepting filter wins, in this order: download, remote,
	 * un-download, forbid, range. A filter is only constructed once all
	 * filters before it have rejected the URL.
	 */
	private List selectBucket(String resolvedUrl) {
		// Binary file type that should be downloaded to local storage.
		if(matches(new DownloadUrlFilter(),resolvedUrl)) {
			return downloadUrl;
		}
		// Remote URL.
		if(matches(new RemoteUrlFilter(),resolvedUrl)) {
			return remoteUrl;
		}
		// Binary file type that should not be downloaded.
		if(matches(new UnDownloadUrlFilter(),resolvedUrl)) {
			return unDownloadUrl;
		}
		// Page that is forbidden from being fetched.
		if(matches(new ForbidUrlFilter(),resolvedUrl)) {
			return forbidUrl;
		}
		// Page inside the fetch range.
		if(matches(new RangeUrlFilter(),resolvedUrl)) {
			return rangeUrl;
		}
		return null;
	}

	/** Hands the URL to the filter and returns the filter's verdict. */
	private boolean matches(IRegexFilter filter,String url) {
		filter.setUrl(url);
		return filter.filter();
	}

	/** Prints the five result lists to standard output; used by main for manual inspection. */
	private void showInfo(){
		System.out.println("getDownloadUrl="+getDownloadUrl());

		System.out.println("getForbidUrl="+getForbidUrl());

		System.out.println("getRangeUrl="+getRangeUrl());

		System.out.println("getRemoteUrl="+getRemoteUrl());

		System.out.println("getUnDownloadUrl="+getUnDownloadUrl());
	}

	/**
	 * Manual smoke test: obtains the HTML for a hard-coded address through
	 * UrlUtil, extracts its URLs with FilterAllUrl, sorts them and prints
	 * the result lists.
	 */
	public static void main(String[] args){
		UrlUtil urlRegexUtil=new UrlUtil();
		String url="http://192.168.0.1/index.jsp";
		String htmlString=urlRegexUtil.getHtmlString(url);

		IFilterAllUrl filterAllUrl=new FilterAllUrl();
		filterAllUrl.setHtmlString(htmlString);
		Set pageUrls=filterAllUrl.getHtmlUrl();

		MakeUpUrl makeUpUrl=new MakeUpUrl();
		makeUpUrl.setParentUrl(url);
		makeUpUrl.setAllHtmlUrl(pageUrls);
		makeUpUrl.makeUp();

		makeUpUrl.showInfo();
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -