⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
					}
					else {
						continue;
					}
				}
				downloadsInProgress++;
		    }

			// 如果下载网页已发生错误,不再下载
			synchronized(urlsError)
			{
				if (urlsError.contains(nextURL.getURL().getHost())) {
					_logClass.info("error url --> " + nextURL);
					downloadsInProgress--;
					continue;
				}
			}

			// 更新当前下载网页队列
			synchronized(urlsDownloading)
			{
				urlsDownloading.add(nextURL);
			}
			 
		    int newDepth = nextURL.getDepth() +	1;
		    int maxDepth = config.getMaxDepth();
		    synchronized(urlsDownloading)
		    {
				urlsDownloading.remove(nextURL);
		    }

		    // 获取需下载的新URL数据,设置serviceID,并对其过滤
		    List newURLs = downloadURL(nextURL,	urlGetter, htmlParser, newDepth);
			setServiceID(newURLs, nextURL);
		    newURLs = filterURLs(newURLs, nextURL);

			// 向待下载队列添加本次分析网页得到的所有新链接
		    ArrayList u2dsToQueue = new	ArrayList();
		    for(Iterator i = newURLs.iterator(); i.hasNext(); )
		    {
				URLToDownload u	= (URLToDownload) i.next();

				// Download if not yet downloaded, and the new depth is	less than the maximum
				synchronized(urlsDownloadedOrScheduled)
				{
				    if(!urlsDownloadedOrScheduled.contains(u.getURL().toExternalForm())
					    && (maxDepth == 0 || newDepth <= maxDepth))
				    {
						u2dsToQueue.add(new URLToDownload(u.getURL(), nextURL.getURL(),	newDepth, u.getServiceID(), u.getFilterPattern(), u.getIsBeforeSubs()));
						urlsDownloadedOrScheduled.add(u.getURL().toExternalForm());
				    }
				}
		    }
		    nextURL = null;
		    newURLs = null;

		    synchronized(queue)
		    {
				queue.queueURLs(u2dsToQueue);
				downloadsInProgress--;
		    }				
		}
		_logClass.info("Spider thread stopping");
		NonPageLinkLog.tagFinish();
		running--;
    }

    /**
     * Returns the number of entries currently reported by the download queue.
     * The size is read while holding the queue's monitor.
     *
     * @return the value of {@code queue.size()} at the time of the call
     */
    private int queueSize()
    {
        int pending;
        synchronized(queue)
        {
            pending = queue.size();
        }
        return pending;
    }

    /**
     * Obtains the object behind {@code url} and returns the links parsed from it.
     * <p>
     * The object is fetched through {@code urlGetter} unless one of these holds:
     * <ul>
     *   <li>a copy exists on disk - it is re-fetched only when refreshing is
     *       configured and the local copy is a valid document;</li>
     *   <li>the spider is resuming and the link is present in the previous log -
     *       nothing is fetched; when refreshing is off, the previous log entry is
     *       re-recorded and the object is marked with {@code _OTHER_EXCEPTION}.</li>
     * </ul>
     *
     * @param url        the link to process; when an existing on-disk copy is
     *                   reused its end URL is set to its own URL
     * @param urlGetter  used to fetch the object when a fetch is required
     * @param htmlParser used to extract the links from a valid document
     * @param depth      not read by this method
     * @return the list produced by {@code htmlParser.parseLinksInDocument} for a
     *         valid document; an empty list when the object is invalid or is not
     *         a valid document
     */
    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser, int depth)
    {
        _logClass.info("downloadURL(" + url + ")");
        List noLinks = new ArrayList();

        URLObject target = new URLObject(url.getURL(), config, url.getServiceID(), url.getIsBeforeSubs());

        // Decide whether the object has to be (re)fetched.
        boolean mustFetch;
        if (target.existsOnDisk())
        {
            // Already stored locally: refresh only if configured and the copy is a valid document.
            mustFetch = config.refreshHTMLs() && target.isValidDoc();
        }
        else if (FileSpider.is_resume() && target.existsOnPrevLog())
        {
            // The link is present in the previous log file.
            mustFetch = false;
            if (!config.refreshHTMLs())
            {
                Object previousEntry = FileSpider.get_prevLinkht().get(url.getURL().toExternalForm());
                NonPageLinkLog.add((HeaderContent) previousEntry);
                target.setErrorType(URLObject._OTHER_EXCEPTION);
            }
        }
        else
        {
            mustFetch = true;
        }

        if (mustFetch)
        {
            target = urlGetter.getURL(url);
        }

        // The object could not be obtained: record the failing URL and give up.
        if (!target.isValidObj())
        {
            StoreDiffUrl.saveExceptionUrl(target.getSourceURL().toExternalForm());
            if (target.getErrorType() == URLObject._CONNECT_TIMEOUT_EXCEPTION)
            {
                synchronized(urlsError)
                {
                    // Only the log line is emitted here; adding the host to
                    // urlsError is currently disabled.
                    _logClass.info("add error host --> " + url.getURL().getHost());
                }
            }
            return noLinks;
        }

        // Only valid documents are stored and parsed.
        if (!target.isValidDoc())
        {
            return noLinks;
        }

        boolean reuseStoredCopy = !config.refreshHTMLs() && target.existsOnDisk();
        if (reuseStoredCopy)
        {
            url.setEndURL(url.getURL());
        }
        else
        {
            // Persist the downloaded content to the local file.
            target.writeToFile();
        }

        // Extract the links contained in the page.
        return htmlParser.parseLinksInDocument(url, target.getStringContent());
    }

	/**
	 * Selects the candidate links that are still worth scheduling.
	 * <p>
	 * A candidate is dropped when its external form is already recorded in
	 * {@code urlsDownloadedOrScheduled}, or when its {@code isValidUrl()} check
	 * fails (the latter case is logged). The whole pass runs while holding the
	 * monitor of {@code urlsDownloadedOrScheduled}.
	 *
	 * @param URLs      list of {@code URLToDownload} candidates
	 * @param sourceUrl not read by this method
	 * @return a new list holding the surviving candidates in their original order
	 */
    private List filterURLs(List URLs, URLToDownload sourceUrl)
    {
        ArrayList accepted = new ArrayList();

        synchronized(urlsDownloadedOrScheduled)
        {
            Iterator cursor = URLs.iterator();
            while (cursor.hasNext())
            {
                URLToDownload candidate = (URLToDownload) cursor.next();
                String externalForm = candidate.getURL().toExternalForm();

                // Already downloaded or queued.
                if (urlsDownloadedOrScheduled.contains(externalForm))
                {
                    continue;
                }

                // Reject illegal URLs.
                if (!candidate.isValidUrl())
                {
                    _logClass.info("invalid url --> " + externalForm);
                    continue;
                }

                accepted.add(candidate);
            }
        }
        return accepted;
    }

	/**
	 * Assigns a service ID and a filter pattern to every link in {@code URLs}.
	 * <p>
	 * When {@code config.downloadMonternet()} is true the ID is taken from the
	 * link's own URL via {@code fetchServiceIDFromUrl()}, falling back to the
	 * source page's service ID when that yields {@code null}. Otherwise the ID is
	 * the result of {@code ParseUrl.getDomainName} when
	 * {@code config.filterBySecondDomain()} is true, or the URL's host when it is
	 * not. The filter pattern is always copied from {@code sourceURL}.
	 *
	 * @param URLs      list of {@code URLToDownload} objects to update in place
	 * @param sourceURL the page the links were found on
	 */
	private void setServiceID(List URLs, URLToDownload sourceURL)
	{
		for (Iterator cursor = URLs.iterator(); cursor.hasNext(); )
		{
			URLToDownload link = (URLToDownload) cursor.next();

			// Derive the service ID from the URL: for Monternet search it is the
			// SP service identifier, for free search it is the second- or
			// third-level domain.
			String id;
			if (!config.downloadMonternet())
			{
				id = config.filterBySecondDomain()
					? ParseUrl.getDomainName(link.getURL())
					: link.getURL().getHost();
			}
			else
			{
				id = link.fetchServiceIDFromUrl();
				if (id == null)
				{
					// Inherit the service ID of the parent URL.
					id = sourceURL.getServiceID();
				}
			}

			link.setServiceID(id);
			link.setFilterPattern(sourceURL.getFilterPattern());
		}
	}
	
	/**
	 * Debug helper: logs every entry currently held in {@code urlsError}.
	 * <p>
	 * The iteration runs while holding the monitor of {@code urlsError}, the
	 * same lock the other accesses to that collection in this class take, so
	 * the collection cannot be modified by a lock-holding thread while it is
	 * being walked.
	 */
	private void printErrorUrls()
	{
		synchronized(urlsError)
		{
			Iterator it = urlsError.iterator();
			while (it.hasNext()) {
				String errorUrl = (String) it.next();
				_logClass.info("print error url --> " + errorUrl);
			}
		}
	}

	/**
	 * Manual smoke test: loads {@code config/spider.properties}, downloads one
	 * hard-coded URL and runs the link parser over its content.
	 * <p>
	 * The process exits with status 1 when the properties file cannot be found
	 * or read. Any failure during the download/parse step is logged.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args)
	{
		// Read the configuration file.
		final String propsFile = "config/spider.properties";
		Properties props = new Properties();
		try
		{
			FileInputStream propsIn = new FileInputStream(propsFile);
			try
			{
				props.load(propsIn);
			}
			finally
			{
				// Always release the stream, even when load() fails.
				propsIn.close();
			}
		}
		catch(FileNotFoundException fnfe)
		{
			// Report the path that was actually opened; args may be empty, so
			// indexing it here would throw and hide the real error.
			_logClass.error("File not found: " + propsFile, fnfe);
			System.exit(1);
		}
		catch(IOException ioe)
		{
			_logClass.error("IO Exception caught reading config file: " + ioe.getMessage(), ioe);
			System.exit(1);
		}

		try {
			SpiderConfig config = new SpiderConfig(props);

			// URL to fetch.
			String url = "http://waps.cn/site_new.wap?page=161";
			URLToDownload urlDown = new URLToDownload(new URL(url), 0, "");

			URLGetter urlGetter = new URLGetter(config);
			HTMLParser htmlParser =	new HTMLParser(config);

			URLObject urlObj = urlGetter.getURL(urlDown);
			htmlParser.parseLinksInDocument(urlDown, urlObj.getStringContent());
		}
		catch (Exception ex) {
			// Route the failure through the logger like the handlers above.
			_logClass.error("Exception caught in spider test run: " + ex.getMessage(), ex);
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -