⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filterallurl.java

📁 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫
💻 JAVA
字号:
/*
 * FileName FilterAllUrl.java
 * Create Time 2006-5-17 12:41:18
 * Author shiwei
 * Description Filters out all URLs found in an HTML page
 * Version
 */

package com.snoics.reptile.regex.url.impl;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.snoics.base.util.UtilTool;
import com.snoics.reptile.regex.url.IFilterAllUrl;
import com.snoics.reptile.regex.url.ISingleRegexUrl;
import com.snoics.reptile.regex.url.IUrlRegex;
import com.snoics.reptile.regex.url.IUrlRegexMap;
import com.snoics.reptile.system.common.CommonObject;
import com.snoics.reptile.util.UrlUtil;

/**
 * Collects the URLs found in an HTML string by running every configured
 * URL regular expression over it.
 *
 * <p>The regular expressions are read from the configuration object exposed by
 * {@link CommonObject}; each one is applied through its own {@link SingleRegexlUrl}.
 * After {@link #getHtmlUrl()} returns, {@link #getUrlRegexList()} holds the
 * expressions whose filtering produced a non-null result during that call.
 */
public class FilterAllUrl implements IFilterAllUrl{
	/** HTML text that will be scanned for URLs. */
	private String htmlString="";
	/** Entry point used to reach the configured URL regular expressions. */
	private final CommonObject commonObject=new CommonObject();
	/** Regular expressions recorded by the most recent filtering pass. */
	private List urlRegexList=new ArrayList();
	
	/**
	 * @return Returns the htmlString.
	 */
	public String getHtmlString() {
		return htmlString;
	}
	
	/**
	 * @param htmlString The htmlString to set.
	 */
	public void setHtmlString(String htmlString) {
		this.htmlString = htmlString;
	}

	
	/**
	 * Returns the regular expressions that were used to parse the current page,
	 * i.e. those recorded by the most recent {@link #getHtmlUrl()} call.
	 * The list is empty until {@link #getHtmlUrl()} has been called.
	 * @return List of IUrlRegex
	 */
	public List getUrlRegexList(){
		return urlRegexList;
	}
	
	/**
	 * Applies every configured URL regular expression to the current HTML string.
	 *
	 * <p>As a side effect, replaces the list returned by {@link #getUrlRegexList()}
	 * with the expressions recorded during this pass.
	 *
	 * @return all URLs produced by the individual expressions, possibly with
	 *         duplicates; never null
	 */
	private List filterUrl() {
		List allUrlList=new ArrayList();
		// Collected locally and published at the end, so the recorded expressions always
		// describe exactly one pass instead of accumulating across repeated calls.
		List usedRegexList=new ArrayList();
		IUrlRegexMap urlRegexMap=commonObject.getConfigInfoObject().getUrlRegexMap();
		Map map=urlRegexMap.getUlrRegexMap();
		Object[][] entries=UtilTool.getMapKeyValue(map);
		if(entries!=null) {
			for(int i=0;i<entries.length;i++) {
				// Column 1 holds the IUrlRegex; column 0 is presumably the map key
				// (TODO confirm against UtilTool.getMapKeyValue).
				IUrlRegex urlRegex=(IUrlRegex)entries[i][1];
				ISingleRegexUrl singleRegexUrl=new SingleRegexlUrl();
				// The whole HTML text is handed over through the "url" property.
				singleRegexUrl.setUrl(htmlString);
				singleRegexUrl.setUrlRegex(urlRegex);
				List nowUrlList=singleRegexUrl.filterUrls();
				if(nowUrlList!=null) {
					usedRegexList.add(urlRegex);
					allUrlList.addAll(nowUrlList);
				}
			}
		}
		urlRegexList=usedRegexList;
		return allUrlList;
	}
	
	/**
	 * Filters the current HTML string and returns the distinct URLs found in it.
	 *
	 * @return set of URLs with duplicates removed; empty when nothing was found
	 */
	public Set getHtmlUrl() {
		// filterUrl() never returns null, so the list can be copied straight into a set.
		return new HashSet(filterUrl());
	}
	
	/**
	 * Manual smoke test: downloads one hard-coded page and prints the URLs found in it.
	 *
	 * @param args command line arguments, not used
	 */
	public static void main(String[] args) {
		UrlUtil urlRegexUtil=new UrlUtil();
		String url="http://192.168.0.1/index.jsp";
		String htmlString=urlRegexUtil.getHtmlString(url);
		// Offline sample that can be used instead of fetching the page:
		//String htmlString="<  a href = 11fdsjkf  ><a href = fdsjkf><a href = fdsjkf>    <a href = fdsjkf><a href = fdsjkf><a href = fdsjkf>";
		IFilterAllUrl filterAllUrl=new FilterAllUrl();
		filterAllUrl.setHtmlString(htmlString);
		Set urls=filterAllUrl.getHtmlUrl();
		System.out.println("----------------------------------------------");
		System.out.println("list="+urls);
		Iterator iterator=urls.iterator();
		while(iterator.hasNext()) {
			String urlS=(String)iterator.next();
			System.out.println(urlS);
		}
		System.out.println("----------------------------------------------");
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -