⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 srcgetregexurl.java

📁 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫
💻 JAVA
字号:
/*
 * FileName SrcGetRegexUrl.java
 * Create Time 2006-5-17 10:07:26
 * Author shiwei
 * Description Gets the links out of src / background attribute matches
 * Version
 */

package com.snoics.reptile.regex.url.impl;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import com.snoics.base.util.StringClass;
import com.snoics.base.util.regex.Regex;
import com.snoics.reptile.regex.url.IGetRegexUrl;
import com.snoics.reptile.system.common.Common;

/**
 * {@link IGetRegexUrl} implementation that collects candidate URLs from a
 * piece of text using a configurable regular expression, strips attribute
 * decoration (quotes, angle brackets, whitespace and a leading
 * <code>src=</code> / <code>background=</code>) from each candidate, and drops
 * the candidates that match any configured exclusion expression.
 */
public class SrcGetRegexUrl implements IGetRegexUrl{
	/**
	 * Fragments removed from every candidate before it is returned: any
	 * whitespace character, single and double quotes, '&lt;' and '&gt;', and a
	 * "src" or "background" keyword followed by at most one whitespace
	 * character and '='.
	 */
	private static final String STRIP_REGEX="(\\s|\"|'|(src|background)\\s?=|>|<)";

	private Regex regex=new Regex();
	private String regexString="";
	private String unIncludeRegexString="";

	/**
	 * Returns the regular expression used to extract URLs.
	 *
	 * @return the extraction expression
	 */
	public String getRegexString(){
		return regexString;
	}

	/**
	 * Sets the regular expression used to extract URLs.
	 *
	 * @param regexString the extraction expression to use
	 */
	public void setRegexString(String regexString) {
		this.regexString = regexString;
	}

	/**
	 * Returns the exclusion expressions as one string.
	 *
	 * @return the exclusion expression string
	 */
	public String getUnIncludeRegexString() {
		return unIncludeRegexString;
	}

	/**
	 * Sets the exclusion expressions as one string. {@link #getUrl(String)}
	 * splits it with {@code Common.STRING_SEPARATE_FLAG}.
	 *
	 * @param unIncludeRegexString the exclusion expression string to use
	 */
	public void setUnIncludeRegexString(String unIncludeRegexString) {
		this.unIncludeRegexString = unIncludeRegexString;
	}

	/**
	 * Extracts the URLs from the given text.
	 * <p>
	 * Candidates are produced by {@code Regex.group} with the configured
	 * extraction expression (case-insensitive). Each candidate is stripped
	 * with {@link #STRIP_REGEX}, skipped when it matches one of the exclusion
	 * expressions, and otherwise trimmed and added to the result.
	 *
	 * @param urlString the text to scan
	 * @return a new list of the accepted URL strings; empty when the candidate
	 *         list is {@code null} or nothing was accepted
	 */
	public List getUrl(String urlString) {
		List candidates=regex.group(urlString,regexString,Pattern.CASE_INSENSITIVE);
		List accepted=new ArrayList();

		// NOTE(review): the normalised value is written back to the field, so
		// this call changes what getUnIncludeRegexString() returns afterwards.
		// StringClass.getString presumably maps null to "" - confirm.
		unIncludeRegexString=StringClass.getString(unIncludeRegexString);

		List excludePatterns=null;
		if(unIncludeRegexString.length()>0) {
			excludePatterns=StringClass.getInterString(Common.STRING_SEPARATE_FLAG,unIncludeRegexString);
		}

		if(candidates==null) {
			return accepted;
		}

		for(Iterator candidateIt=candidates.iterator();candidateIt.hasNext();) {
			String rawMatch=(String)candidateIt.next();
			String cleaned=regex.getReplaceAll(rawMatch,"",STRIP_REGEX,Pattern.CASE_INSENSITIVE);
			if(isExcluded(cleaned,excludePatterns)) {
				continue;
			}
			accepted.add(cleaned.trim());
		}
		return accepted;
	}

	/**
	 * Tells whether a cleaned candidate matches any non-empty exclusion
	 * expression (case-insensitive).
	 *
	 * @param url the cleaned candidate
	 * @param excludePatterns the exclusion expressions, or {@code null} when
	 *        none are configured
	 * @return {@code true} if at least one expression is found in the candidate
	 */
	private boolean isExcluded(String url,List excludePatterns) {
		if(excludePatterns==null) {
			return false;
		}
		for(Iterator patternIt=excludePatterns.iterator();patternIt.hasNext();) {
			String pattern=(String)patternIt.next();
			if(pattern.length()==0) {
				// Empty entries are ignored rather than treated as a pattern.
				continue;
			}
			if(regex.find(url,pattern,Pattern.CASE_INSENSITIVE)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Manual smoke run: prints the result of {@link #getUrl(String)} for a
	 * sample tag, using the default (empty) expressions.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		SrcGetRegexUrl extractor=new SrcGetRegexUrl();
		List found=extractor.getUrl("<a href =fdsjkfsd fdsf>");
		System.out.println(found);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -