⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spiderconfig.java

📁 是个 Java 写的 spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
📖 第 1 页 / 共 2 页
字号:

    /**
     * Returns the current value of the {@code refreshImages} flag.
     *
     * @return the stored {@code refreshImages} value
     */
    public boolean refreshImages() {
        return this.refreshImages;
    }

    /**
     * Stores a new value for the {@code refreshOthers} flag.
     *
     * @param refreshOthers the value to store
     */
    public void setRefreshOthers(boolean refreshOthers) {
        this.refreshOthers = refreshOthers;
    }

    /**
     * Returns the current value of the {@code refreshOthers} flag.
     *
     * @return the stored {@code refreshOthers} value
     */
    public boolean refreshOthers() {
        return this.refreshOthers;
    }

	/**
	 * Returns the current value of the {@code downloadMonternet} flag.
	 *
	 * @return the stored {@code downloadMonternet} value
	 */
	public boolean downloadMonternet() {
		return this.downloadMonternet;
	}

	/**
	 * Returns the current value of the {@code filterBySecondDomain} flag.
	 *
	 * @return the stored {@code filterBySecondDomain} value
	 */
	public boolean filterBySecondDomain() {
		return this.filterBySecondDomain;
	}

	/**
	 * Returns the current value of the {@code overrideLog} flag.
	 *
	 * @return the stored {@code overrideLog} value
	 */
	public boolean overrideLog() {
		return this.overrideLog;
	}

	/**
	 * Stores a new value for the {@code downloadMonternet} flag.
	 *
	 * @param downloadMonternet the value to store
	 */
	public void setDownloadMonternet(boolean downloadMonternet) {
		this.downloadMonternet = downloadMonternet;
	}
	
    /**
     * Stores the given file as the {@code saveRootDirectory}.
     *
     * @param saveRootDirectory the value to store
     */
    public void setSaveRootDirectory(File saveRootDirectory) {
        this.saveRootDirectory = saveRootDirectory;
    }

    /**
     * Returns the stored {@code saveRootDirectory}.
     *
     * @return the current {@code saveRootDirectory} value
     */
    public File getSaveRootDirectory() {
        return this.saveRootDirectory;
    }

    /**
     * Stores the given file as the {@code saveAnchorDirectory}.
     *
     * @param saveAnchorDirectory the value to store
     */
    public void setSaveAnchorDirectory(File saveAnchorDirectory) {
        this.saveAnchorDirectory = saveAnchorDirectory;
    }

    /**
     * Returns the stored {@code saveAnchorDirectory}.
     *
     * @return the current {@code saveAnchorDirectory} value
     */
    public File getSaveAnchorDirectory() {
        return this.saveAnchorDirectory;
    }

    /**
     * Stores the given file as the {@code mailtoLogFile}.
     *
     * @param mailtoLogFile the value to store
     */
    public void setMailtoLogFile(File mailtoLogFile) {
        this.mailtoLogFile = mailtoLogFile;
    }

    /**
     * Returns the stored {@code mailtoLogFile}.
     *
     * @return the current {@code mailtoLogFile} value
     */
    public File getMailtoLogFile() {
        return this.mailtoLogFile;
    }

    /**
     * Stores the given URL as the {@code startLocation}.
     *
     * @param startLocation the value to store
     */
    public void setStartLocation(URL startLocation) {
        this.startLocation = startLocation;
    }

    /**
     * Returns the stored {@code startLocation}.
     *
     * @return the current {@code startLocation} value
     */
    public URL getStartLocation() {
        return this.startLocation;
    }

    /**
     * Stores the given string as the {@code urlMatch} value.
     *
     * @param urlMatch the value to store
     */
    public void setURLMatch(String urlMatch) {
        this.urlMatch = urlMatch;
    }

    /**
     * Returns the stored {@code urlMatch} value.
     *
     * @return the current {@code urlMatch} value
     */
    public String getURLMatch() {
        return this.urlMatch;
    }

    /**
     * Returns the list held in {@code interestingURLSubstrings}.
     * The stored list itself is returned, not a copy.
     *
     * @return the current {@code interestingURLSubstrings} list
     */
    public List getInterestingURLSubstrings() {
        return this.interestingURLSubstrings;
    }

    /**
     * Stores the given list as {@code interestingURLSubstrings}.
     * The reference is kept as-is; no copy is made.
     *
     * @param interestingURLSubstrings the list to store
     */
    public void setInterestingURLSubstrings(List interestingURLSubstrings) {
        this.interestingURLSubstrings = interestingURLSubstrings;
    }

    /**
     * Returns the list held in {@code boringURLSubstrings}.
     * The stored list itself is returned, not a copy.
     *
     * @return the current {@code boringURLSubstrings} list
     */
    public List getBoringURLSubstrings() {
        return this.boringURLSubstrings;
    }

    /**
     * Stores the given list as {@code boringURLSubstrings}.
     * The reference is kept as-is; no copy is made.
     *
     * @param boringURLSubstrings the list to store
     */
    public void setBoringURLSubstrings(List boringURLSubstrings) {
        this.boringURLSubstrings = boringURLSubstrings;
    }

    /**
     * Tests the given URL against the {@code interestingURLSubstrings} list
     * by delegating to {@code matchURL}.
     *
     * @param u the URL to test
     * @return the result of {@code matchURL(u, interestingURLSubstrings)}
     */
    public boolean isInteresting(URL u) {
        final boolean matched = matchURL(u, this.interestingURLSubstrings);
        return matched;
    }

    /**
     * Tests the given URL against the {@code boringURLSubstrings} list
     * by delegating to {@code matchURL}.
     *
     * @param u the URL to test
     * @return the result of {@code matchURL(u, boringURLSubstrings)}
     */
    public boolean isBoring(URL u) {
        final boolean matched = matchURL(u, this.boringURLSubstrings);
        return matched;
    }

    /**
     * Reports whether the external form of a URL contains at least one of the
     * given substrings.
     *
     * @param u          the URL whose {@code toExternalForm()} text is searched
     * @param substrings list whose elements are cast to {@code String} and
     *                   looked up in the URL text
     * @return {@code true} as soon as one element occurs in the URL text;
     *         {@code false} if none does (including for an empty list)
     */
    private boolean matchURL(URL u, List substrings) {
        final String external = u.toExternalForm();
        Iterator it = substrings.iterator();
        while (it.hasNext()) {
            String candidate = (String) it.next();
            // indexOf yields a non-negative position when the candidate occurs.
            if (external.indexOf(candidate) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stores a new value for the {@code depthFirst} flag.
     *
     * @param depthFirst the value to store
     */
    public void setDepthFirstSearch(boolean depthFirst) {
        this.depthFirst = depthFirst;
    }

    /**
     * Returns the current value of the {@code depthFirst} flag.
     *
     * @return the stored {@code depthFirst} value
     */
    public boolean isDepthFirstSearch() {
        return this.depthFirst;
    }

    /**
     * Stores a new {@code maxDepth} value. No range check is applied.
     *
     * @param maxDepth the value to store
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /**
     * Returns the stored {@code maxDepth} value.
     *
     * @return the current {@code maxDepth}
     */
    public int getMaxDepth() {
        return this.maxDepth;
    }

    /**
     * Stores the given string as the {@code userAgent} value.
     *
     * @param userAgent the value to store
     */
    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * Returns the stored {@code userAgent} value.
     *
     * @return the current {@code userAgent}
     */
    public String getUserAgent() {
        return this.userAgent;
    }

    /**
     * Stores the given string as the {@code basicAuthUser} value.
     *
     * @param basicAuthUser the value to store
     */
    public void setBasicAuthUser(String basicAuthUser) {
        this.basicAuthUser = basicAuthUser;
    }

    /**
     * Returns the stored {@code basicAuthUser} value.
     *
     * @return the current {@code basicAuthUser}
     */
    public String getBasicAuthUser() {
        return this.basicAuthUser;
    }

    /**
     * Stores the given string as the {@code basicAuthPassword} value.
     *
     * @param basicAuthPassword the value to store
     */
    public void setBasicAuthPassword(String basicAuthPassword) {
        this.basicAuthPassword = basicAuthPassword;
    }

    /**
     * Returns the stored {@code basicAuthPassword} value unmasked.
     *
     * @return the current {@code basicAuthPassword}
     */
    public String getBasicAuthPassword() {
        return this.basicAuthPassword;
    }

    /**
     * Stores a new {@code spiderThreads} value. No range check is applied.
     *
     * @param spiderThreads the value to store
     */
    public void setSpiderThreads(int spiderThreads) {
        this.spiderThreads = spiderThreads;
    }

    /**
     * Returns the stored {@code spiderThreads} value.
     *
     * @return the current {@code spiderThreads}
     */
    public int getSpiderThreads() {
        return this.spiderThreads;
    }

	/**
	 * Stores a new {@code maxDownloadTime} value. No range check is applied.
	 *
	 * @param maxDownloadTime the value to store
	 */
	public void setMaxDownloadTime(int maxDownloadTime) {
		this.maxDownloadTime = maxDownloadTime;
	}

	/**
	 * Returns the stored {@code maxDownloadTime} value.
	 *
	 * @return the current {@code maxDownloadTime}
	 */
	public int getMaxDownloadTime() {
		return this.maxDownloadTime;
	}

	/**
	 * Stores a new {@code maxDownloadPages} value. No range check is applied.
	 *
	 * @param maxDownloadPages the value to store
	 */
	public void setMaxDownloadPages(int maxDownloadPages) {
		this.maxDownloadPages = maxDownloadPages;
	}

	/**
	 * Returns the stored {@code maxDownloadPages} value.
	 *
	 * @return the current {@code maxDownloadPages}
	 */
	public int getMaxDownloadPages() {
		return this.maxDownloadPages;
	}

	/**
	 * Stores a new {@code maxTryCount} value. No range check is applied.
	 *
	 * @param maxTryCount the value to store
	 */
	public void setMaxTryCount(int maxTryCount) {
		this.maxTryCount = maxTryCount;
	}

	/**
	 * Returns the stored {@code maxTryCount} value.
	 *
	 * @return the current {@code maxTryCount}
	 */
	public int getMaxTryCount() {
		return this.maxTryCount;
	}

	/**
	 * Returns the stored {@code proxyHost} value.
	 *
	 * @return the current {@code proxyHost}
	 */
	public String getProxyHost() {
		return this.proxyHost;
	}

	/**
	 * Stores the given string as the {@code proxyHost} value.
	 *
	 * @param proxyHost the value to store
	 */
	public void setProxyHost(String proxyHost) {
		this.proxyHost = proxyHost;
	}

	/**
	 * Returns the stored {@code proxyPort} value; the port is held as a string.
	 *
	 * @return the current {@code proxyPort}
	 */
	public String getProxyPort() {
		return this.proxyPort;
	}

	/**
	 * Stores the given string as the {@code proxyPort} value. The string is
	 * stored without being parsed or validated.
	 *
	 * @param proxyPort the value to store
	 */
	public void setProxyPort(String proxyPort) {
		this.proxyPort = proxyPort;
	}
	
    /**
     * Stores a new {@code checkpointInterval} value. No range check is applied.
     *
     * @param interval the value to store
     */
    public void setCheckpointInterval(long interval) {
        checkpointInterval = interval;
    }

    /**
     * Returns the stored {@code checkpointInterval} value.
     *
     * @return the current {@code checkpointInterval}
     */
    public long getCheckpointInterval() {
        return this.checkpointInterval;
    }

	/**
	 * Returns the set held in {@code invalidExtensions}.
	 * The stored set itself is returned, not a copy.
	 *
	 * @return the current {@code invalidExtensions} set
	 */
	public Set getInvalidExtensions() {
		return this.invalidExtensions;
	}

	/**
	 * Returns the set held in {@code ringExtensions}.
	 * The stored set itself is returned, not a copy.
	 *
	 * @return the current {@code ringExtensions} set
	 */
	public Set getRingExtensions() {
		return this.ringExtensions;
	}

	/**
	 * Returns the set held in {@code gameExtensions}.
	 * The stored set itself is returned, not a copy.
	 *
	 * @return the current {@code gameExtensions} set
	 */
	public Set getGameExtensions() {
		return this.gameExtensions;
	}

	/**
	 * Returns the stored {@code saveRingFile} value.
	 *
	 * @return the current {@code saveRingFile}
	 */
	public String getSaveRingFile() {
		return this.saveRingFile;
	}

	/**
	 * Returns the stored {@code saveGameFile} value.
	 *
	 * @return the current {@code saveGameFile}
	 */
	public String getSaveGameFile() {
		return this.saveGameFile;
	}
	
    /**
     * Builds a multi-line dump of selected configuration fields, one
     * {@code name:<TAB>value} pair per line, separated by newlines.
     * Extension sets are rendered through {@code fromSet}, and the basic-auth
     * password is always printed as {@code ***} rather than its real value.
     *
     * @return the textual dump; there is no trailing newline
     */
    public String toString() {
        StringBuffer text = new StringBuffer();

        // Crawl strategy.
        text.append("depthFirst:\t").append(depthFirst);
        text.append("\nmaxDepth:\t").append(maxDepth);

        // Extension sets, comma-joined.
        text.append("\nhtmlExtensions:\t").append(fromSet(htmlExtensions));
        text.append("\nimageExtensions:\t").append(fromSet(imageExtensions));
        text.append("\nringExtensions:\t").append(fromSet(ringExtensions));
        text.append("\ngameExtensions:\t").append(fromSet(gameExtensions));
        text.append("\ninvalidExtensions:\t").append(fromSet(invalidExtensions));

        // Boolean switches.
        text.append("\nrefreshHTMLs:\t").append(refreshHTMLs);
        text.append("\nrefreshImages:\t").append(refreshImages);
        text.append("\nrefreshOthers:\t").append(refreshOthers);
        text.append("\nfilterBySecondDomain:\t").append(filterBySecondDomain);
        text.append("\ndownloadMonternet:\t").append(downloadMonternet);

        // Locations.
        text.append("\nsaveRootDirectory:\t").append(saveRootDirectory);
        text.append("\nsaveAnchorDirectory:\t").append(saveAnchorDirectory);
        text.append("\nstartLocation:\t").append(startLocation);
        text.append("\nurlMatch:\t").append(urlMatch);

        // HTTP identity; the password is deliberately masked.
        text.append("\nuserAgent:\t").append(userAgent);
        text.append("\nbasicAuthUser:\t").append(basicAuthUser);
        text.append("\nbasicAuthPassword:\t").append("***");

        // Runtime tuning.
        text.append("\nspiderThreads:\t").append(spiderThreads);
        text.append("\ncheckpointInterval:\t").append(checkpointInterval);

        return text.toString();
    }

    /**
     * Splits a comma-separated string into a set of trimmed tokens.
     * The input is logged at debug level before parsing.
     *
     * @param str the comma-separated text to split
     * @return a new {@code HashSet} holding each token after {@code trim()}
     */
    private Set parseSet(String str) {
        _logClass.debug("parseSet(" + str + ")");
        Set tokens = new HashSet();
        for (StringTokenizer tokenizer = new StringTokenizer(str, ","); tokenizer.hasMoreTokens(); ) {
            tokens.add(tokenizer.nextToken().trim());
        }
        return tokens;
    }

    /**
     * Joins the elements of a set into one comma-separated string, in the
     * set's iteration order. Each element is cast to {@code String}.
     *
     * @param s the set whose elements are joined
     * @return the elements separated by {@code ","}; an empty string for an
     *         empty set
     */
    private String fromSet(Set s) {
        StringBuffer joined = new StringBuffer();
        Iterator it = s.iterator();
        while (it.hasNext()) {
            String element = (String) it.next();
            joined.append(element);
            // A separator goes between elements only, never after the last.
            if (it.hasNext()) {
                joined.append(",");
            }
        }
        return joined.toString();
    }
	/**
	 * Parses the website to download from a properties file (new variant).
	 *
	 * <p>The file is loaded with {@link java.util.Properties#load}. Its
	 * {@code Url} property becomes the site URL of a new {@code SiteConfig},
	 * and its optional {@code Filter} property, when present and non-empty,
	 * becomes that config's filter pattern. {@code _websiteHash} is replaced
	 * with a fresh set on every call, including calls that return
	 * {@code false}.
	 *
	 * <p>NOTE(review): if the file is missing or unreadable this method logs
	 * the error and terminates the JVM via {@code System.exit(1)}. That is kept
	 * because callers may rely on it.
	 *
	 * @param websiteFile path of the properties file describing the website
	 * @return {@code true} if a {@code SiteConfig} was added to
	 *         {@code _websiteHash}; {@code false} if the file has no
	 *         {@code Url} property
	 */
	public boolean parseWebsites2(String websiteFile)
	{
		_websiteHash = new HashSet();

		// Load the website properties file.
		Properties webSite = new Properties();
		try
		{
			FileInputStream propsWs = new FileInputStream(websiteFile);
			webSite.load(propsWs);
			propsWs.close();
		}
		catch(FileNotFoundException fnfe)
		{
			_logClass.error("Website file not found: " + websiteFile, fnfe);
			System.exit(1);
		}
		catch(IOException ioe)
		{
			_logClass.error("IO Exception caught reading config file: " + ioe.getMessage(), ioe);
			System.exit(1);
		}

		// Read the site URL; without it there is nothing to register.
		String siteUrl = webSite.getProperty("Url");
		if (siteUrl == null)
		{
			return false;
		}

		// Record the URL in a new SiteConfig.
		SiteConfig sc = new SiteConfig();
		sc.setSiteUrl(siteUrl);

		// Read the optional URL filter. getProperty returns null when the key
		// is absent, so guard against that before checking the length.
		String filter = webSite.getProperty("Filter");
		if (filter != null && filter.length() > 0)
		{
			sc.setFilterPattern(filter);
		}

		_websiteHash.add(sc);

		return true;
	}
    
    /**
     * Rebuilds the set of websites to download from an array of game sites.
     *
     * <p>{@code _websiteHash} is replaced with a fresh set, and one
     * {@code SiteConfig} is added per array element, carrying that element's
     * first URL and URL filter. Both values are logged at info level.
     *
     * @param gss the game sites to register
     * @return always {@code true}
     */
    public boolean setWebsites(GameSite[] gss)
    {
        _logClass.info("start to fetch sites...");

        // Read the website list into a fresh set.
        _websiteHash = new HashSet();

        int index = 0;
        while (index < gss.length)
        {
            SiteConfig config = new SiteConfig();
            GameSite site = gss[index];

            config.setSiteUrl(site.get_firstUrl());
            _logClass.info("first url: " + site.get_firstUrl());

            config.setFilterPattern(site.get_urlFilter());
            _logClass.info("url filter: " + site.get_urlFilter());

            _websiteHash.add(config);
            index++;
        }

        return true;
    }

	/**
	 * Returns the set held in {@code _hsPreLinks}.
	 * The stored set itself is returned, not a copy.
	 *
	 * @return the current {@code _hsPreLinks} set
	 */
	public HashSet getHsPreLinks()
	{
		return this._hsPreLinks;
	}

	/**
	 * Stores the given set as {@code _hsPreLinks}.
	 * The reference is kept as-is; no copy is made.
	 *
	 * @param hsPreLinks the set to store
	 */
	public void setHsPreLinks(HashSet hsPreLinks)
	{
		_hsPreLinks = hsPreLinks;
	}
} // End class SpiderConfig

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -