mycrawlerframe.java
});
thread.start();
try
{
Thread.sleep(2000);
}
catch(InterruptedException e)
{
//Ignore an interruption of the start-up delay
}
}
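//Overview: crawlAction() loops over the to-crawl list; for each URL it
//calls verifyUrl(), isRobotAllowed(), downloadPage() and writePage(),
//then allowFollow() and retrieveLinks() to queue further links.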
//Perform the actual crawl
private void crawlAction(String startUrl,int maxUrls)
{
//Set up the crawl lists
HashSet crawledList = new HashSet();
LinkedHashSet toCrawlList = new LinkedHashSet();
HashSet notHostLink = new HashSet();
HashSet gt30kbList = new HashSet();
//Add the start URL to the to-crawl list
toCrawlList.add(startUrl);
String urlActionTime = getTimeStamp();
addResult(urlActionTime,"Add to list",startUrl);
int i = 0;
//Perform actual crawling by looping through the To Crawl list
while(crawling && toCrawlList.size() > 0)
{
//check to see if the max url count has been reached
if(maxUrls != -1)
{
if(crawledList.size() >= maxUrls)
{
break;
}
}
//Get the URL at the head of the to-crawl list
String url = null;
Iterator iter = toCrawlList.iterator();
if(iter.hasNext())
{
url = (String)iter.next();
}
//Remove the URL from the to-crawl list
toCrawlList.remove(url);
//Convert the string URL to a URL object; skip it if invalid
URL verifiedUrl = verifyUrl(url);
if(verifiedUrl == null)
{
continue;
}
String verifiedUrlActionTime = getTimeStamp();
addResult(verifiedUrlActionTime,"verify",url);
//Skip this URL if robots are not allowed to access it
if(!isRobotAllowed(verifiedUrl))
{
continue;
}
//Update crawling stats
updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls,notHostLink.size(),gt30kbList.size());
//Add page to the crawled list
crawledList.add(url);
//Download the page at the given Url
String pageContents = downloadPage(verifiedUrl);
String processUrlActionTime = getTimeStamp();
addResult(processUrlActionTime,"process",url);
writePage(pageContents,verifiedUrl,Integer.toString(i));
i++;
//If the page was downloaded successfully, retrieve all of
//its links
if(pageContents != null && pageContents.length()>0)
{
//First check the robots meta tag to see whether this page allows its links to be followed
if(!allowFollow(pageContents,verifiedUrl))
{
continue;
}
ArrayList links = retrieveLinks(verifiedUrl,pageContents,crawledList,notHostLink,gt30kbList);
//Add links to the To Crawl list
toCrawlList.addAll(links);
}
//Update crawling stats
updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls,notHostLink.size(),gt30kbList.size());
}
}
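//For reference, a robots meta tag typically looks like:
//  <meta name="robots" content="index,follow">
//Values such as "nofollow" or "none" ask crawlers not to follow the
//links on a page, while "all" or "follow" permit it.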
//Check the robots meta tag
private boolean allowFollow(String pageContents,URL verifiedUrl)
{
Pattern p = Pattern.compile("<meta\\s+name\\s*=\\s*\"robots\"\\s*\"?(.*?)[\"|>]",
Pattern.CASE_INSENSITIVE);
Matcher m= p.matcher(pageContents);
while(m.find())
{
String content = m.group(1).trim().toLowerCase();
//Check "nofollow" and "none" first: "nofollow" also contains
//the substring "follow", so the order of the tests matters
if(content.indexOf("nofollow")!= -1 || content.indexOf("none")!= -1)
{
return false;
}
return true;
}
return true;
}
//add the result to the result table and log file
private void addResult(String actionTime,String action,String url)
{
//Add action time,action,url to result table
DefaultTableModel model =
(DefaultTableModel) resultTable.getModel();
model.addRow(new Object[]{actionTime,action,url});
//Add action time, action, and URL to the log file
String result = actionTime +"\t"+action+"\t"+url;
try
{
logFileWriter.println(result);
}
catch(Exception e)
{
showError("Unable to log file");
}
}
private void addResult(String actionTime,String action,URL url)
{
//Add action time,action,url to result table
DefaultTableModel model =
(DefaultTableModel) resultTable.getModel();
model.addRow(new Object[]{actionTime,action,url});
//Add action time, action, and URL to the log file
String result = actionTime +"\t"+action+"\t"+url;
try
{
logFileWriter.println(result);
}
catch(Exception e)
{
showError("Unable to log file");
}
}
//Get the current time stamp, e.g. "2009-05-14 10:23:45.125"
private String getTimeStamp()
{
return new Timestamp(System.currentTimeMillis()).toString();
}
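//For reference, the pattern in retrieveLinks() captures the href value
//of anchor tags such as:
//  <a href="page.html">here</a>   or   <a href=page.html>
//A regular expression is only a rough substitute for a real HTML
//parser, but it is adequate for this simple crawler.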
//Parse through page contents and retrieve links
private ArrayList retrieveLinks(URL pageUrl,String pageContents,HashSet crawledList,HashSet notHostLink,
HashSet gt30kbList)
{
//Compile the link matching pattern
Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",Pattern.CASE_INSENSITIVE);
Matcher m= p.matcher(pageContents);
//Create list of link matches
ArrayList linkList = new ArrayList();
while(m.find())
{
String link = m.group(1).trim();
//Skip empty links
if(link.length()<1)
{
continue;
}
//skip links that are just page anchors
if(link.charAt(0)=='#')
{
continue;
}
//skip mailto links
if(link.indexOf("mailto")!= -1)
{
continue;
}
//Skip javascript links
if(link.toLowerCase().indexOf("javascript")!=-1)
{
continue;
}
//Prefix relative URLs with the page's protocol, host, and path if necessary
if(link.indexOf("://")==-1)
{
//Directory of the current page, with a trailing slash
String file = pageUrl.getFile();
String path = (file.lastIndexOf('/')==-1) ? "/" : file.substring(0,file.lastIndexOf('/')+1);
if(link.charAt(0)=='/')
{
//Handle root-relative URLs
link = "http://"+pageUrl.getHost()+link;
}
else if(link.startsWith("./"))
{
//Resolve against the current page's directory
link = "http://"+pageUrl.getHost()+path+link.substring(2);
}
else if(link.startsWith("../"))
{
//Go up one directory from the current page's directory
int up = path.lastIndexOf('/',path.length()-2);
String parent = (up==-1) ? "/" : path.substring(0,up+1);
link = "http://"+pageUrl.getHost()+parent+link.substring(3);
}
else
{
//Resolve plain relative links against the current page's directory
link = "http://"+pageUrl.getHost()+path+link;
}
}
//Remove anchors from link
int index = link.indexOf('#');
if(index!=-1)
{
link = link.substring(0,index);
}
//Verify the link and skip it if invalid
URL verifiedLink = verifyUrl(link);
if(verifiedLink == null)
{
continue;
}
//The link must be on the same host as the current page; if a link
//points to another host, just record it and do not process it
String linkHost = verifiedLink.getHost().toLowerCase();
int indexofpoint;
//Strip the first label (e.g. "www.") from each host, so that hosts on
//the same domain compare as equal; this simple heuristic assumes both
//hosts carry a subdomain prefix
if((indexofpoint=linkHost.indexOf("."))!= -1)
{
linkHost=linkHost.substring(indexofpoint+1);
}
String pageHost = pageUrl.getHost().toLowerCase();
if((indexofpoint=pageHost.indexOf("."))!= -1)
{
pageHost = pageHost.substring(indexofpoint+1);
}
if(!pageHost.equalsIgnoreCase(linkHost))
{
if(! notHostLink.contains(verifiedLink))
{
notHostLink.add(verifiedLink);
String notHostLinkAction = getTimeStamp();
addResult(notHostLinkAction,"link not this host",verifiedLink);
}
continue;
}
//If the page is larger than 30 KB, record the URL and do not download it
try{
//Ask the server for the content length; getContentLength() returns -1
//when the size is unknown, in which case the page is not skipped
if(verifiedLink.openConnection().getContentLength()>1024*30)
{
if(!gt30kbList.contains(verifiedLink))
{
gt30kbList.add(verifiedLink);
String gt30kbLinkAction = getTimeStamp();
addResult(gt30kbLinkAction,"page size > 30 KB",verifiedLink);
}
continue;
}
}
catch(IOException e)
{
e.printStackTrace();
}
//Skip the link if it has already been crawled or already collected
//from this page
if(crawledList.contains(link) || linkList.contains(link))
{
continue;
}
//Add link to list
linkList.add(link);
String urlActionTime = getTimeStamp();
addResult(urlActionTime,"Add to list",link);
}
return (linkList);
}
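//For reference, a robots.txt file lists paths that crawlers are asked
//not to visit, for example:
//  User-agent: *
//  Disallow: /cgi-bin/
//  Disallow: /private/   # internal pages
//The simple parser below ignores User-agent sections and applies every
//Disallow line it finds to this crawler.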
//Check whether robots are allowed to access the given URL
private boolean isRobotAllowed(URL urlToCheck)
{
if(urlToCheck == null)
{
return false;
}
String host = urlToCheck.getHost().toLowerCase();
//Retrieve the host's disallow list from the cache
ArrayList disallowList = (ArrayList)disallowListCache.get(host);
//If list is not in the cache,download and cache it
if(disallowList == null)
{
disallowList = new ArrayList();
try
{
URL robotsFileUrl = new URL("http://"+host+"/robots.txt");
//Open connection to robot file for reading
BufferedReader reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
//Read the robots file, creating a list of disallowed paths
String line;
while((line=reader.readLine())!=null)
{
if(line.indexOf("Disallow:")== 0)
{
String disallowPath = line.substring("Disallow:".length());
//Check disallow path for comments and remove if present
int commentIndex = disallowPath.indexOf("#");
if(commentIndex != -1)
{
disallowPath = disallowPath.substring(0,commentIndex);
}
//Remove leading and trailing spaces from the disallow path
disallowPath = disallowPath.trim();
//Add disallow path to list
disallowList.add(disallowPath);
}
}
reader.close();
//Cache the disallow list so robots.txt is fetched only once per host
disallowListCache.put(host,disallowList);
}
catch(Exception e)
{
//Assume the robot is allowed, since an exception is thrown
//when the robots.txt file doesn't exist
return true;
}
}
//Loop through the disallow list to see if crawling is allowed; each
//entry is applied as a simple path-prefix test
String file = urlToCheck.getFile();
for(int i= 0;i<disallowList.size();i++)
{
String disallow = (String)disallowList.get(i);
if(file.startsWith(disallow))
{
return false;
}
}
return true;
}
//Update the crawling status display
private void updateStats(String crawling,int crawled,int toCrawl,int maxUrls,int notHostLinkSize,int gt30kbListSize)
{
crawlingLabel2.setText(crawling);
crawledLabel2.setText(" "+ crawled);
toCrawlLabel2.setText(" "+ toCrawl);
notHostLinkLabel2.setText(" "+notHostLinkSize);
gt30kbLinkLabel2.setText(" "+gt30kbListSize);
}
//Verify the URL format
private URL verifyUrl(String url)
{
//Only allow http URLs
if(url == null || !url.toLowerCase().startsWith("http://"))
{
return null;
}
//Make sure the URL is well formed
URL verifiedUrl = null;
try
{
verifiedUrl = new URL(url);
}
catch(Exception e)
{
return null;
}
return verifiedUrl;
}
//Display an error message
private void showError(String message)
{
JOptionPane.showMessageDialog(this,message,"Error",JOptionPane.ERROR_MESSAGE);
}
//Download the given page
private String downloadPage(URL pageUrl)
{
try
{
//Open a connection to the URL for reading, identifying the crawler
//via the User-Agent header
URLConnection connection = pageUrl.openConnection();
connection.setRequestProperty("User-Agent","Test Crawler for Course IR");
BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
//Read page into buffer
String line;
StringBuffer pageBuffer = new StringBuffer();
while((line=reader.readLine())!= null)
{
//Keep line breaks so the page text is not run together
pageBuffer.append(line).append('\n');
}
reader.close();
return pageBuffer.toString();
}
catch(Exception e)
{
return null;
}
}
//Write the downloaded page to a numbered file under ./tmp
private void writePage(String bufferString,URL pageUrl,String fileName)
{
String path = System.getProperty("user.dir") + System.getProperty("file.separator")+"tmp"+
System.getProperty("file.separator");
try
{
//Create the tmp directory if it does not yet exist
new File(path).mkdirs();
PrintWriter pw = new PrintWriter(new FileWriter(path+fileName+".txt"));
pw.println(pageUrl.toString());
pw.print(bufferString);
pw.close( );
}
catch(IOException e)
{
showError("Unable to write page file "+fileName+".txt");
}
}
}