⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gather.java

📁 使用Eclipse编写的java的网络图片爬虫
💻 JAVA
字号:
package gather;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.util.Date;
import java.util.Locale;

/**
 * Worker thread that fetches a single URL and hands a successful connection
 * to {@code Writer.writeData}.
 *
 * <p>Each instance handles exactly one URL ({@link #strurl}). Failures are
 * reported on standard output, prefixed with the URL and a timestamp.
 *
 * <p>NOTE(review): {@code controler.nThreadCount} is updated with plain
 * {@code ++}/{@code --} from several threads; {@code controler} is not visible
 * here, so confirm whether that counter needs to be atomic.
 */
public class gather extends Thread
{
	// -------------------------------------------------------
	// Fields. The package-private ones are kept unchanged because other
	// classes in the package may read or assign them directly.
	static int nRunCount = 0;

	int nID = 0;

	String strurl;

	URL url;

	URLConnection urlconnection;

	InputStream inputstream;

	BufferedReader bufreader;

	boolean bSaveToFile;

	// Time of the most recent connection attempt (or of construction).
	private Date date;

	// Written by exitThread() from another thread and read in run(), hence volatile.
	private volatile boolean exit;

	// -------------------------------------------------------
	// Number of slots in the status-code -> message table.
	private static final int RESPONSE_TABLE_SIZE = 1024;

	// Chinese explanation for each HTTP status code; "" when none is defined.
	// The table is read-only after class initialisation and shared by all instances.
	private static final String[] strCNMsgOfResponse = buildResponseMessages();

	// Upper bound for the adaptive network timeout, in milliseconds.
	private static final int nMaxTimeOfTimeout = 15 * 60000;

	// Current network timeout in milliseconds. Shared by all workers so that the
	// back-off in recordTimeout() can actually accumulate: every instance performs
	// only one connection, so a per-instance counter could never reach its threshold.
	private static int nTimeOfTimeout = 60000;

	// Number of timeouts seen since the timeout was last adjusted (shared, see above).
	private static int nCountOfTimeout = 0;

	// -------------------------------------------------------
	/**
	 * Creates a worker with no URL; assign {@link #strurl} before starting it.
	 */
	public gather()
	{
		initgather();
	}

	/**
	 * Creates a worker for the given URL.
	 *
	 * @param strInUrl URL to fetch
	 * @param bIn      forwarded unchanged to {@code Writer.writeData} as its
	 *                 third argument
	 */
	public gather(String strInUrl,boolean bIn)
	{
		this.strurl=strInUrl;
		this.bSaveToFile=bIn;
		initgather();
	}

	/**
	 * Resets the per-instance state (currently only the timestamp).
	 * The status-message table is built once at class initialisation.
	 */
	public void initgather()
	{
		date = new Date();
	}

	/**
	 * Builds the status-code to message table.
	 *
	 * @return an array of {@link #RESPONSE_TABLE_SIZE} non-null strings
	 */
	private static String[] buildResponseMessages()
	{
		String[] messages = new String[RESPONSE_TABLE_SIZE];
		for (int i = 0; i < messages.length; i++)
		{
			messages[i] = "";
		}
		messages[202] = "已经接受请求,但处理尚未完成";
		messages[203] = "文档已经正常地返回,但一些应答头可能不正确,因为使用的是文档的拷贝";
		messages[204] = "没有新文档,浏览器应该继续显示原来的文档";
		messages[404] = "无法找到指定位置的资源,该页面不存在";
		messages[999] = "服务器返回:999。不明错误。";
		return messages;
	}

	/**
	 * Requests that this worker stop. The flag is honoured when {@link #run()}
	 * begins; a connection that is already in progress is not interrupted.
	 */
	public void exitThread()
	{
		exit = true;
	}

	/**
	 * @return the network timeout currently in force, in milliseconds
	 */
	private static synchronized int currentTimeout()
	{
		return nTimeOfTimeout;
	}

	/**
	 * Records one timeout. After every 100 timeouts the shared timeout is
	 * doubled, but never beyond {@link #nMaxTimeOfTimeout}.
	 */
	private static synchronized void recordTimeout()
	{
		nCountOfTimeout++;
		if (nCountOfTimeout >= 100)
		{
			nCountOfTimeout = 0;
			if (nTimeOfTimeout < nMaxTimeOfTimeout)
			{
				// Clamp: plain doubling would step from 480000 to 960000, past the maximum.
				nTimeOfTimeout = Math.min(nTimeOfTimeout * 2, nMaxTimeOfTimeout);
				System.out.println("已调整网络链接超时时间为:"+nTimeOfTimeout);
			}
		}
	}

	/**
	 * Tells whether an I/O failure has a dedicated handler in
	 * {@link #establishConnection()} and must therefore not be reported
	 * as a generic I/O error.
	 */
	private static boolean isClassifiedNetworkError(IOException ex)
	{
		return ex instanceof ConnectException
				|| ex instanceof UnknownHostException
				|| ex instanceof SocketTimeoutException
				|| ex instanceof FileNotFoundException;
	}

	/**
	 * Case-insensitive test for a leading scheme prefix such as {@code "http://"}.
	 */
	private static boolean hasScheme(String candidate, String prefix)
	{
		return candidate.regionMatches(true, 0, prefix, 0, prefix.length());
	}

	/**
	 * Normalises a user-supplied URL: backslashes become slashes,
	 * {@code http://} is prepended when no http/https/ftp scheme is present,
	 * and the scheme and host are lower-cased.
	 *
	 * <p>User-info, path, query and fragment keep their case because they are
	 * case-sensitive; lower-casing them would request a different resource.
	 *
	 * @param rawUrl non-null, non-empty URL text
	 * @return the normalised URL
	 */
	static String normalizeUrl(String rawUrl)
	{
		// Convert separators first so that "http:\\host" is recognised as having a scheme.
		String normalized = rawUrl.replace('\\', '/');
		if (!hasScheme(normalized, "http://")
				&& !hasScheme(normalized, "https://")
				&& !hasScheme(normalized, "ftp://"))
		{
			normalized = "http://" + normalized;
		}

		// The string now starts with "<scheme>://", so the first "://" ends the scheme.
		int authorityStart = normalized.indexOf("://") + 3;
		int authorityEnd = normalized.length();
		for (int i = authorityStart; i < normalized.length(); i++)
		{
			char c = normalized.charAt(i);
			if (c == '/' || c == '?' || c == '#')
			{
				authorityEnd = i;
				break;
			}
		}
		// Skip an optional "user:password@" part, which must keep its case.
		int hostStart = normalized.lastIndexOf('@', authorityEnd - 1) + 1;
		if (hostStart < authorityStart)
		{
			hostStart = authorityStart;
		}

		// Locale.ROOT avoids locale-specific mappings (e.g. Turkish dotless i).
		return normalized.substring(0, authorityStart).toLowerCase(Locale.ROOT)
				+ normalized.substring(authorityStart, hostStart)
				+ normalized.substring(hostStart, authorityEnd).toLowerCase(Locale.ROOT)
				+ normalized.substring(authorityEnd);
	}

	/**
	 * Opens a connection to {@link #strurl}, reads the HTTP status and, on
	 * success (status 200 or reason phrase "OK"), passes the connection to
	 * {@code Writer.writeData}. Every failure is turned into a message that
	 * is printed together with the URL and the time of the attempt.
	 *
	 * <p>NOTE(review): a non-HTTP connection (e.g. {@code ftp://}) never
	 * satisfies the success test and is always reported as a failed
	 * connection, as in the original code — confirm whether FTP is meant
	 * to be supported.
	 *
	 * @throws Exception declared for compatibility; failures raised inside
	 *                   the method are reported rather than propagated
	 */
	private void establishConnection()throws Exception
	{
		int nHttpResponseCode = -1;
		String strHttpResponse = "";
		String strResponseMsg = "";
		String strErrorMsg = "";
		String strDateString = "";
		try
		{
			// Take the timestamp now so that the log shows when the attempt happened,
			// not when the worker object was created.
			date = new Date();
			strDateString = "[" + DateFormat.getDateTimeInstance().format(date) + "] ";

			url = new URL(this.strurl);
			// -------------------------------------------------------
			// Open the connection (no proxy).
			urlconnection = url.openConnection();

			// Bound both connection establishment and reads.
			int timeout = currentTimeout();
			urlconnection.setConnectTimeout(timeout);
			urlconnection.setReadTimeout(timeout);
			// -------------------------------------------------------
			// Read the status code and reason phrase.
			if (urlconnection instanceof HttpURLConnection)
			{
				try
				{
					HttpURLConnection httpurlconn = (HttpURLConnection) urlconnection;
					nHttpResponseCode = httpurlconn.getResponseCode();
					// May be null when the reply is not valid HTTP.
					strHttpResponse = httpurlconn.getResponseMessage();

					if (nHttpResponseCode >= RESPONSE_TABLE_SIZE)
						strErrorMsg = "服务器返回值大于1024.[" + this.strurl + "]";
					else if (nHttpResponseCode < 0)
						strErrorMsg = "服务器返回值小于0.[" + this.strurl + "]";
					else
						strResponseMsg = "服务器返回:[" + nHttpResponseCode + "," + strHttpResponse + "] " + strCNMsgOfResponse[nHttpResponseCode];
					if (nHttpResponseCode > 600)// diagnostic output for unusual status codes
						System.out.println(strResponseMsg + "URL:" + this.strurl);
				}
				catch (IOException ex)
				{
					// Let the dedicated handlers below deal with connect/host/timeout/
					// not-found failures; they would otherwise be hidden behind this
					// generic message and the timeout back-off would never run.
					if (isClassifiedNetworkError(ex))
						throw ex;
					strErrorMsg = " 读取服务器返回值时发生IO错误,需重新连接。[" + ex + "]";
				}
				catch (Exception ex)
				{
					strErrorMsg = " 读取服务器返回值时发生不明错误,---->[" + ex + "]";
				}
			}
			// -------------------------------------------------------
			// On success, let Writer record the page data.
			if ((nHttpResponseCode == 200) || "ok".equalsIgnoreCase(strHttpResponse))
			{
				new Writer().writeData(urlconnection, this.strurl, this.bSaveToFile);
			}
			else
			{
				// Keep a specific error recorded above; only fall back to the
				// server's reply or the generic text when nothing was recorded.
				if (strErrorMsg.isEmpty())
				{
					if (strResponseMsg.isEmpty())
						strResponseMsg = "连接失败! 程序运行正常,请检查网络或者代理或者所输入的URL";
					strErrorMsg = strResponseMsg;
				}
				// Nobody will read this response, so release the connection.
				if (urlconnection instanceof HttpURLConnection)
					((HttpURLConnection) urlconnection).disconnect();
			}
		}
		catch (MalformedURLException e_url)
		{
			strErrorMsg = " 输入的URL不正确!";
		}
		catch (ConnectException e_con)
		{// connection refused / unreachable
			strErrorMsg = " 连接错误:不能连接到服务器!请检查网络连接或代理设置";
		}
		catch (UnknownHostException e_con)
		{// host name could not be resolved
			strErrorMsg = " 错误的服务器或不明主机! 请检查URL是否正确";
		}
		catch (SocketTimeoutException e_con)
		{// connect or read timed out
			strErrorMsg = " 连接超时!请检查网络连接或代理设置!";
			recordTimeout();
		}
		catch (FileNotFoundException e_url)
		{
			strErrorMsg = " 该页面不存在! ";
		}
		catch (Exception e_url)
		{
			strErrorMsg = " !不明错误![" + e_url + "]";
			e_url.printStackTrace();
		}
		if (!strErrorMsg.isEmpty())
		{// Report the failure together with the time of the attempt.
			strErrorMsg = strDateString + strErrorMsg;
			System.out.println("URL:"+this.strurl + strErrorMsg);
		}
	}

	/**
	 * Thread entry point: normalises {@link #strurl} and fetches it.
	 * Does nothing when no URL is set or when {@link #exitThread()} was
	 * called before the thread started running.
	 */
	@Override
	public void run()
	{
		if (exit || this.strurl == null || this.strurl.isEmpty())
			return;
		controler.nThreadCount++;
		try
		{
			this.strurl = normalizeUrl(this.strurl);
			// ------------------------------------------------------
			establishConnection();
		}
		catch (Exception e)
		{
			System.out.println("ERROR:-->" + e);// report the crawler error
		}
		finally
		{
			// Always give the slot back, even after a failure.
			controler.nThreadCount--;
		}
	}

	/**
	 * Program entry point: registers the seed URL and starts one worker for it.
	 */
	public static void main(String[] args) throws Exception
	{
		// The seed URL is set here.
		try
		{
			gather gg = new gather("http://www.ivsky.com/", false);
			// Register the seed before the worker starts so the two steps cannot race.
			controler.URLList.add(gg.strurl);
			gg.start();
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -