// mainfetch.java
// (Extracted from a web code viewer; the viewer's "font size" control label appeared here.)
package com.code10.fetch;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.code10.access.DbAccess;
import com.code10.basecomponent.Entry;
import com.code10.basecomponent.UrlEntry;
import com.code10.basecomponent.strDeal;
/**
 * Crawls a web site starting from one URL and collects the e-mail addresses found on
 * the visited pages.
 *
 * <p>Two worker threads cooperate once {@link #start()} is called:
 * <ul>
 *   <li>{@code ThreadFetchURL} downloads each queued page, extracts its links and queues
 *       every new in-site link for both workers, until an entry at the configured
 *       maximum layer is reached or no URL is left;</li>
 *   <li>{@code ThreadParse} downloads each queued page again, extracts e-mail addresses
 *       into the result list and, once the fetcher has finished and its own queue is
 *       drained, hands the result list to {@link DbAccess#insertInto}.</li>
 * </ul>
 *
 * <p>Thread-safety: both URL queues and the set of already-queued URLs are guarded by
 * {@code queueLock}; HTTP requests are serialized on the shared static client; the
 * result list is only mutated by the parse thread and becomes visible to callers of
 * {@link #getEntry()} through the volatile {@code intStatus} flag.
 */
public class MainFetch {

    private static final Logger logger = Logger.getLogger(MainFetch.class.getName());

    /** Shared by every instance; all requests go through {@link #getContent(String)}. */
    private static final HttpClient client = new HttpClient();

    /** Value of {@code intStatus} while the crawl has not finished. */
    private static final int STATUS_RUNNING = 0;
    /** Value of {@code intStatus} once the parse thread has terminated. */
    private static final int STATUS_DONE = 1;

    /** Layer at which the fetch thread stops following links. */
    private final int intLayer;

    /** Guards {@code queueStrTmp1}, {@code queueStrTmp2} and {@code seenUrls}. */
    private final Object queueLock = new Object();
    /** URLs waiting to be scanned for links (consumed by the fetch thread). */
    private final Queue<UrlEntry> queueStrTmp1;
    /** URLs waiting to be scanned for e-mail addresses (consumed by the parse thread). */
    private final Queue<UrlEntry> queueStrTmp2;
    /**
     * Every URL that has ever been queued. Uses the same case-insensitive ordering as
     * {@link String#compareToIgnoreCase(String)} so a page is never queued twice, even
     * after it has already been taken out of the queues.
     */
    private final Set<String> seenUrls = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);

    /** Collected results; mutated only by the parse thread. */
    private final ArrayList<Entry> aryRs;

    /** Volatile so that {@link #getEntry()} reliably observes the parse thread's completion. */
    private volatile int intStatus = STATUS_RUNNING;

    private ThreadFetchURL threadFetchURL;
    private ThreadParse threadParse;

    /**
     * Prepares a crawl; nothing is downloaded until {@link #start()} is called.
     *
     * @param strStartURL URL the crawl starts from; also passed to
     *                    {@code UrlEntry.setStrHost} (static, so it affects all instances)
     * @param intLayer    layer at which the fetch thread stops; the start URL is layer 0
     */
    public MainFetch(String strStartURL, int intLayer) {
        client.getParams().setContentCharset("GB2312");
        logger.setLevel(Level.ALL);
        UrlEntry.setStrHost(strStartURL);
        this.intLayer = intLayer;

        UrlEntry root = new UrlEntry();
        root.setUrl(strStartURL);
        root.setILayer(0);

        queueStrTmp1 = new LinkedList<UrlEntry>();
        queueStrTmp2 = new LinkedList<UrlEntry>();
        aryRs = new ArrayList<Entry>();

        queueStrTmp1.add(root);
        queueStrTmp2.add(root);

        // Remember the start page so that links pointing back to it are not queued again.
        String rootUrl = root.getUrl();
        if (rootUrl != null) {
            seenUrls.add(rootUrl);
        }
    }

    /** Starts the fetch thread and then the parse thread. */
    public void start() {
        logger.debug("开始抓取 .....");
        threadFetchURL = new ThreadFetchURL();
        threadFetchURL.start();
        threadParse = new ThreadParse();
        threadParse.start();
    }

    /**
     * Blocks until the parse thread has terminated, polling once per second, and returns
     * the collected entries.
     *
     * <p>If {@link #start()} was never called this method blocks indefinitely. If the
     * calling thread is interrupted while waiting, its interrupt flag is restored and the
     * entries collected so far are returned immediately.
     *
     * @return the internal result list (not a copy)
     */
    public ArrayList<Entry> getEntry() {
        while (intStatus != STATUS_DONE) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the flag and stop waiting: looping on would make every
                // following sleep() throw immediately and degrade into a busy loop.
                Thread.currentThread().interrupt();
                logger.warn("getEntry : interrupted while waiting for the crawl to finish", e);
                break;
            }
        }
        return this.aryRs;
    }

    /** Worker that downloads queued pages and queues the links found on them. */
    private class ThreadFetchURL extends Thread {
        @Override
        public void run() {
            logger.debug("ThreadFetchURL is running!");
            while (true) {
                logger.debug("ThreadFetchURL : in while");
                UrlEntry current = getURL1();
                String url = current.getUrl();
                if (url == null) {
                    // Queue 1 is only ever filled by this thread, so an empty queue means
                    // no further URL can arrive: terminate instead of spinning forever.
                    break;
                }
                if (current.getILayer() == intLayer) {
                    // Maximum layer reached: stop following links.
                    break;
                }
                String strContent = getContent(url);
                if (strContent == null) {
                    continue;
                }
                parseAndaddURL(strContent, current);
            }
        }
    }

    /**
     * Worker that downloads queued pages, extracts e-mail addresses and finally imports
     * the collected entries into the database.
     */
    private class ThreadParse extends Thread {
        @Override
        public void run() {
            logger.debug("ThreadParse : emailparse is starting...");
            try {
                while (true) {
                    logger.debug("ThreadParse : emailparse while is running...");
                    // Sample the fetcher's liveness BEFORE polling: if it was already dead
                    // and the queue is empty afterwards, nothing more can ever be queued.
                    // Checking in the opposite order could drop URLs queued in between.
                    boolean fetcherAlive = threadFetchURL.isAlive();
                    UrlEntry current = getURL2();
                    String url = current.getUrl();

                    if (url != null) {
                        String strTemp = getContent(url);
                        if (strTemp != null) {
                            parseAndAddEmail(strTemp, current);
                        }
                        continue;
                    }

                    if (fetcherAlive) {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // Treat interruption as a request to stop; the database
                            // import is skipped in that case.
                            Thread.currentThread().interrupt();
                            logger.warn("ThreadParse : interrupted while waiting for URLs", e);
                            break;
                        }
                        continue;
                    }

                    logger.debug("*******************************************\n" +
                            "抓取结束,开始导入数据库......\n");
                    DbAccess dba = new DbAccess();
                    dba.insertInto(aryRs);
                    logger.debug("成功导入数据库 \n" +
                            "*******************************************\n");
                    break;
                }
            } finally {
                // Always release callers blocked in getEntry(), even if the import failed.
                intStatus = STATUS_DONE;
            }
        }
    }

    /**
     * Downloads the content of a web page.
     *
     * <p>Requests are serialized on the shared static client, and the connection is
     * always released so the client can reuse it for the next request.
     *
     * @param strURL URL to download
     * @return the response body if the server answered 200 OK; {@code null} for any other
     *         status or if the request failed
     */
    private String getContent(String strURL) {
        synchronized (client) {
            GetMethod getMethod = null;
            try {
                logger.debug("ThreadFetchURL : start get the URL : " + strURL);
                getMethod = new GetMethod(strURL);
                client.executeMethod(getMethod);
                if (getMethod.getStatusCode() == HttpStatus.SC_OK) {
                    return getMethod.getResponseBodyAsString();
                }
            } catch (HttpException e) {
                logger.warn("ThreadFetchURL : get the URL : " + strURL + "failed! (HttpException)", e);
            } catch (IOException e) {
                logger.warn("ThreadFetchURL : get the URL : " + strURL + "failed! (IOException)", e);
            } catch (Exception e) {
                // Also covers malformed URLs rejected by the GetMethod constructor.
                logger.warn("ThreadFetchURL : get the URL : " + strURL + "failed! (Exception)", e);
            } finally {
                if (getMethod != null) {
                    getMethod.releaseConnection();
                }
            }
            return null;
        }
    }

    /**
     * Extracts the links from a page and queues every new, legal, in-site URL for both
     * worker threads.
     *
     * @param strContent content of the downloaded page
     * @param urlParent  entry of the page the content was downloaded from
     */
    private void parseAndaddURL(String strContent, UrlEntry urlParent) {
        ArrayList<String> links = strDeal.parseURL(strContent);
        logger.debug("ThreadFetchURL : in parseAndaddURL");
        if (links == null) {
            return;
        }
        for (String link : links) {
            // Discard links that leave the site or are not legal URLs.
            if (!UrlEntry.isInWebSite(link) || !UrlEntry.isLegURL(link)) {
                continue;
            }
            UrlEntry child = new UrlEntry();
            child.setStrParentUrl(urlParent.getUrl());
            child.setIntParentNum(urlParent.getIntSelfNum());
            // NOTE(review): the constructor uses setILayer while this call uses setLayer;
            // confirm that UrlEntry declares both and that they are equivalent.
            child.setLayer(urlParent.getILayer() + 1);
            child.creatCompeleteURL(link);

            String childUrl = child.getUrl();
            if (childUrl == null) {
                // A null URL is what the workers interpret as "queue empty"; never queue it.
                continue;
            }

            boolean isNew;
            synchronized (queueLock) {
                // add() both tests and records the URL, so the check cannot race.
                isNew = seenUrls.add(childUrl);
                if (isNew) {
                    child.setIntSelfNum(urlParent.getIntSonNum() + 1);
                    queueStrTmp1.add(child);
                    queueStrTmp2.add(child);
                }
            }
            if (isNew) {
                logger.debug("==========================================================\n");
                logger.debug("ThreadFetchURL : 层数 :" + child.getILayer() + " 父URL编号 :" +
                        child.getIntParentNum() + " 编号 :" + child.getIntSelfNum() + " 内容 :" + child.getUrl());
            }
        }
    }

    /**
     * Extracts the e-mail addresses from a page and appends every address that is not yet
     * in the result list, with an empty user name.
     *
     * @param strContent content of the downloaded page
     * @param urlEntry   entry of the page the content was downloaded from (used for logging)
     */
    private void parseAndAddEmail(String strContent, UrlEntry urlEntry) {
        ArrayList<String> emails = strDeal.parseEmail(strContent);
        logger.debug("ThreadParse : in parseAndAddEmail");
        if (emails != null) {
            for (String email : emails) {
                Entry entry = new Entry();
                entry.setEmail(email);
                entry.setUsername("");
                String stored = entry.getEmail();
                if (stored == null || isExistToo(stored)) {
                    continue;
                }
                logger.debug("ThreadParse : Parsing the URL : " + urlEntry.getIntSelfNum() + " " + urlEntry.getUrl());
                logger.debug("ThreadParse : the geted email is : " + email);
                aryRs.add(entry);
                logger.debug("ThreadParse : the size of aryRs is " + aryRs.size());
            }
        }
        logger.debug("ThreadParse : out parseAndAddEmail");
    }

    /**
     * Takes the next URL for the fetch thread.
     *
     * @return the head of queue 1, or a freshly constructed {@link UrlEntry} when the
     *         queue is empty (callers detect this through its {@code null} URL)
     */
    private UrlEntry getURL1() {
        synchronized (queueLock) {
            UrlEntry next = queueStrTmp1.poll();
            return next != null ? next : new UrlEntry();
        }
    }

    /**
     * Takes the next URL for the parse thread.
     *
     * @return the head of queue 2, or a freshly constructed {@link UrlEntry} when the
     *         queue is empty (callers detect this through its {@code null} URL)
     */
    private UrlEntry getURL2() {
        synchronized (queueLock) {
            UrlEntry next = queueStrTmp2.poll();
            return next != null ? next : new UrlEntry();
        }
    }

    /**
     * Tells whether a URL has already been queued by this crawl, ignoring case. URLs that
     * have since been taken out of the queue still count as existing.
     *
     * @param strUrl URL to look up; {@code null} is never considered existing
     * @return {@code true} if the URL was queued before
     */
    public boolean isExist(String strUrl) {
        if (strUrl == null) {
            return false;
        }
        synchronized (queueLock) {
            return seenUrls.contains(strUrl);
        }
    }

    /**
     * Tells whether an e-mail address is already in the result list, ignoring case.
     *
     * @param strEmail address to look up; {@code null} is never considered existing
     * @return {@code true} if an entry with that address was collected before
     */
    public boolean isExistToo(String strEmail) {
        if (strEmail == null) {
            return false;
        }
        for (Entry existing : aryRs) {
            String email = existing.getEmail();
            if (email != null && email.compareToIgnoreCase(strEmail) == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Currently a no-op; the forwarding of messages to the UI is disabled.
     *
     * @param str message that would be appended
     */
    public synchronized void appendMsg(String str) {
        // Intentionally empty.
    }

    /**
     * Currently a no-op: the earlier implementation relied on {@code Thread.stop()} and
     * was disabled, so a running crawl cannot be stopped through this method.
     */
    public void stopFetch() {
        // Intentionally empty.
    }
}
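/*
 * Code-viewer keyboard shortcuts (web page residue, not part of the program):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */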