// File: gather.java
// (Code-viewer residue: "font size" control label.)
package gather;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.Date;
/**
 * Worker thread that opens a connection to one URL and, when the server
 * answers with HTTP 200 / "OK", hands the open connection to
 * {@code Writer.writeData} for recording.
 *
 * <p>Depends on two types that are declared outside this file:
 * {@code controler} (shared thread counter and URL list) and {@code Writer}.
 *
 * <p>NOTE(review): {@code controler.nThreadCount} is updated with plain
 * {@code ++}/{@code --} from worker threads; whether that counter is safe
 * for concurrent updates cannot be determined from this file - confirm
 * against the declaration of {@code controler}.
 */
public class gather extends Thread
{
    // -------------------------------------------------------
    // Fields. The package-private ones are kept as-is because other
    // classes in the package may read or write them.
    static int nRunCount = 0;
    int nID = 0;
    String strurl;
    URL url;
    URLConnection urlconnection;
    InputStream inputstream;
    BufferedReader bufreader;
    boolean bSaveToFile;
    private Date date;
    private boolean exit;
    // -------------------------------------------------------
    /** Chinese explanation for each HTTP status code, indexed by the code itself. */
    private String[] strCNMsgOfResponse = new String[1024];
    /** Upper bound for the adaptive network timeout, in milliseconds. */
    private final int nMaxTimeOfTimeout=15*60000;
    /** Current network timeout (connect and read), in milliseconds. */
    private int nTimeOfTimeout=60000;
    /**
     * Number of timeouts seen since the timeout was last adjusted.
     * NOTE(review): this counter is per instance, and a Thread can only be
     * started once, so it reaches 100 only if establishConnection() is run
     * repeatedly on the same object - confirm whether it was meant to be
     * shared between workers.
     */
    private int nCountOfTimeout=0;
    // -------------------------------------------------------

    /** Creates a worker with no URL; the caller must assign {@code strurl} before starting it. */
    public gather()
    {
        initgather();
    }

    /**
     * Creates a worker for the given URL.
     *
     * @param strInUrl URL to fetch; a missing scheme is defaulted to {@code http://} in {@link #run()}
     * @param bIn      forwarded unchanged to {@code Writer.writeData} as its save-to-file flag
     */
    public gather(String strInUrl,boolean bIn)
    {
        this.strurl=strInUrl;
        this.bSaveToFile=bIn;
        initgather();
    }

    /**
     * Resets the timestamp and rebuilds the status-code message table.
     * Every code without an explicit entry maps to the empty string.
     */
    public void initgather()
    {
        date = new Date();
        java.util.Arrays.fill(strCNMsgOfResponse, "");
        strCNMsgOfResponse[202] = "已经接受请求,但处理尚未完成";
        strCNMsgOfResponse[203] = "文档已经正常地返回,但一些应答头可能不正确,因为使用的是文档的拷贝";
        strCNMsgOfResponse[204] = "没有新文档,浏览器应该继续显示原来的文档";
        strCNMsgOfResponse[404] = "无法找到指定位置的资源,该页面不存在";
        strCNMsgOfResponse[999] = "服务器返回:999。不明错误。";
    }

    /**
     * Sets the exit flag. Nothing in this class reads the flag, so calling
     * this does not by itself interrupt a fetch that is in progress.
     */
    public void exitThread()
    {
        exit = true;
    }

    /**
     * Tells whether an I/O failure is one of the connection-level errors
     * that have a dedicated handler in {@link #establishConnection()}.
     */
    private static boolean isConnectionLevelFailure(IOException ex)
    {
        return ex instanceof ConnectException
                || ex instanceof UnknownHostException
                || ex instanceof SocketTimeoutException
                || ex instanceof FileNotFoundException;
    }

    /** Case-insensitive test for a scheme prefix such as {@code "http://"}. */
    private static boolean hasScheme(String strCandidate, String strPrefix)
    {
        return strCandidate.regionMatches(true, 0, strPrefix, 0, strPrefix.length());
    }

    /**
     * Converts backslashes to forward slashes and prepends {@code http://}
     * when no http/https/ftp scheme is present. The case of the URL is
     * preserved because paths and query strings are case-sensitive.
     */
    private static String normalizeUrl(String strRaw)
    {
        // Slashes are fixed first so that "http:\\host" is recognised as already having a scheme.
        String strFixed = strRaw.replace('\\', '/');
        if (!hasScheme(strFixed, "http://")
                && !hasScheme(strFixed, "https://")
                && !hasScheme(strFixed, "ftp://"))
        {
            strFixed = "http://" + strFixed;
        }
        return strFixed;
    }

    /**
     * Opens a connection to {@code strurl}, reads the HTTP status and either
     * passes the connection to {@code Writer.writeData} (status 200 or
     * message "OK") or prints a timestamped error line to standard output.
     *
     * @throws Exception declared for compatibility; failures raised inside
     *                   the method body are caught and reported here
     */
    private void establishConnection()throws Exception
    {
        int nHttpResponseCode = -1;
        String strHttpResponse = "";
        String strResponseMsg = "";
        String strErrorMsg = "";
        // Take a fresh timestamp so a logged error carries the time of this attempt.
        date = new Date();
        String strDateString = "[" + java.text.DateFormat.getDateTimeInstance().format(date) + "] ";
        try
        {
            url = new URL(this.strurl);
            // Open the connection without a proxy argument.
            urlconnection = url.openConnection();
            // Bound both the connect phase and each read with the current timeout.
            urlconnection.setConnectTimeout(nTimeOfTimeout);
            urlconnection.setReadTimeout(nTimeOfTimeout);
            // -------------------------------------------------------
            // For HTTP connections, fetch the status code and message.
            if (urlconnection instanceof HttpURLConnection)
            {
                HttpURLConnection httpurlconn = (HttpURLConnection) urlconnection;
                try
                {
                    nHttpResponseCode = httpurlconn.getResponseCode();
                    String strMsg = httpurlconn.getResponseMessage();
                    // getResponseMessage() may return null; normalise to "" for the checks below.
                    strHttpResponse = (strMsg == null) ? "" : strMsg;
                    if (nHttpResponseCode >= 1024)
                        strErrorMsg = "服务器返回值大于1024.[" + this.strurl + "]";
                    else if (nHttpResponseCode < 0)
                        strErrorMsg = "服务器返回值小于0.[" + this.strurl + "]";
                    else
                        strResponseMsg = "服务器返回:[" + nHttpResponseCode + "," + strHttpResponse + "] " + strCNMsgOfResponse[nHttpResponseCode];
                    if (nHttpResponseCode > 600)// debugging aid kept from the original [2007-10-20]
                        System.out.println(strResponseMsg + "URL:" + this.strurl);
                }
                catch (IOException ex)
                {
                    // The socket is actually opened inside getResponseCode(), so
                    // connect/timeout/unknown-host failures surface here. Pass them
                    // on to the dedicated handlers below instead of masking them.
                    if (isConnectionLevelFailure(ex))
                        throw ex;
                    strErrorMsg = " 读取服务器返回值时发生IO错误,需重新连接。[" + ex + "]";
                }
                catch (Exception ex)
                {
                    strErrorMsg = " 读取服务器返回值时发生不明错误,---->[" + ex + "]";
                }
            }
            // -------------------------------------------------------
            // On success, let Writer record the page data.
            if ((nHttpResponseCode == 200) || (strHttpResponse.equalsIgnoreCase("ok")))
            {
                new Writer().writeData(urlconnection, this.strurl, this.bSaveToFile);
            }
            else
            {
                // Keep a specific error if one was recorded; otherwise fall back to
                // the server's response text, then to the generic failure text.
                if (strErrorMsg.isEmpty())
                {
                    if (strResponseMsg.isEmpty())
                        strResponseMsg = "连接失败! 程序运行正常,请检查网络或者代理或者所输入的URL";
                    strErrorMsg = strResponseMsg;
                }
                // Nothing will read this response, so release the connection.
                if (urlconnection instanceof HttpURLConnection)
                    ((HttpURLConnection) urlconnection).disconnect();
            }
        }
        catch (MalformedURLException e_url)
        {
            strErrorMsg = " 输入的URL不正确!";
        }
        catch (ConnectException e_con)
        {
            strErrorMsg = " 连接错误:不能连接到服务器!请检查网络连接或代理设置";
        }
        catch (UnknownHostException e_con)
        {
            strErrorMsg = " 错误的服务器或不明主机! 请检查URL是否正确";
        }
        catch (SocketTimeoutException e_con)
        {
            strErrorMsg = " 连接超时!请检查网络连接或代理设置!";
            nCountOfTimeout++;
            // After 100 timeouts, double the timeout, never exceeding the maximum.
            if(nCountOfTimeout>=100)
            {
                nCountOfTimeout=0;
                if(nTimeOfTimeout<nMaxTimeOfTimeout)
                {
                    nTimeOfTimeout=Math.min(nTimeOfTimeout*2, nMaxTimeOfTimeout);
                    System.out.println("已调整网络链接超时时间为:"+nTimeOfTimeout);
                }
            }
        }
        catch (FileNotFoundException e_url)
        {
            strErrorMsg = " 该页面不存在! ";
        }
        catch (Exception e_url)
        {
            strErrorMsg = " !不明错误![" + e_url + "]";
            e_url.printStackTrace();
        }
        if (!strErrorMsg.isEmpty())
        {
            // Prefix the error with the time of this attempt.
            strErrorMsg = strDateString + strErrorMsg;
            System.out.println("URL:"+this.strurl + strErrorMsg);
        }
    }

    /**
     * Thread entry point: normalises {@code strurl} and fetches it. Returns
     * immediately when no URL is set. {@code controler.nThreadCount} is
     * incremented for the duration of the fetch and always decremented again.
     */
    @Override
    public void run()
    {
        if (this.strurl == null || this.strurl.isEmpty())
            return;
        controler.nThreadCount++;
        try
        {
            this.strurl = normalizeUrl(this.strurl);
            establishConnection();
        }
        catch (Exception e)
        {
            System.out.println("ERROR:-->" + e);
        }
        finally
        {
            controler.nThreadCount--;
        }
    }

    /**
     * Program entry point: starts one worker on the seed URL and records the
     * seed in {@code controler.URLList}.
     *
     * @param args unused
     * @throws Exception declared for compatibility; failures are caught and printed
     */
    public static void main(String[] args) throws Exception
    {
        try
        {
            gather gg = new gather();
            // Seed URL for the crawl.
            gg.strurl = "http://www.ivsky.com/";
            gg.bSaveToFile=false;
            gg.start();
            controler.URLList.add(gg.strurl);
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }
}
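/*
 * Code-viewer residue (keyboard shortcut help), not part of the program:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */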