// crawler.java
import java.text.*;
import java.awt.*;
import java.awt.event.*;
import java.sql.*;
import java.util.*;
import java.net.*;
import java.io.*;
public class Crawler implements Runnable {
public static final String DISALLOW = "Disallow:";
//public static final int SEARCH_LIMIT = 150;
public static int fileCounter=1;
// public static count=1;
CrawlTable tab;
/**
 * Builds a crawler: creates its {@link CrawlTable}, disables interactive
 * prompts for all URL connections, and installs HTTP proxy settings
 * (host "webcache-cup", port 8080) as JVM-wide system properties.
 */
public Crawler() {
    tab = new CrawlTable();
    URLConnection.setDefaultAllowUserInteraction(false);

    // First layer: the current system properties act as defaults, the
    // proxy keys are stored on top of them.
    Properties proxySettings = new Properties(System.getProperties());
    proxySettings.setProperty("http.proxySet", "true");
    proxySettings.setProperty("http.proxyHost", "webcache-cup");
    proxySettings.setProperty("http.proxyPort", "8080");

    // Second layer: an empty table whose defaults are the proxy settings;
    // this is what becomes the new system property set.
    Properties installed = new Properties(proxySettings);
    System.setProperties(installed);
}
//////////////////////////////////////////////////////////////////////////////////ROBOT SAFE
/**
 * Checks the host's robots.txt to decide whether the given URL may be crawled.
 *
 * Every "Disallow:" entry in the file is honoured regardless of which
 * user-agent section it belongs to.
 *
 * @param url the page that is about to be fetched
 * @return {@code false} if the robots.txt URL cannot be formed or the URL's
 *         file part starts with a disallowed path; {@code true} otherwise,
 *         including when robots.txt cannot be read
 */
boolean robotSafe(URL url) {
    String strHost = url.getHost();
    // form URL of the robots.txt file
    String strRobot = "http://" + strHost + "/robots.txt";
    URL urlRobot;
    try {
        urlRobot = new URL(strRobot);
    }
    catch (MalformedURLException e) {
        return false;
    }

    String strCommands;
    try {
        InputStream urlRobotStream = urlRobot.openStream();
        try {
            // Collect raw bytes and decode once at the end: an empty file
            // yields "" instead of an exception, and a multi-byte character
            // split across two reads is decoded correctly.
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] b = new byte[1000];
            int numRead;
            while ((numRead = urlRobotStream.read(b)) != -1) {
                collected.write(b, 0, numRead);
                // stop reading once this thread is no longer the active crawler
                if (Thread.currentThread() != CrawlerFrame.clThread)
                    break;
            }
            strCommands = collected.toString();
        }
        finally {
            // always release the connection, even when a read fails
            urlRobotStream.close();
        }
    }
    catch (IOException e) {
        // if there is no robots.txt file, it is OK to search
        return true;
    }

    // assume that this robots.txt refers to us and
    // search for "Disallow:" commands.
    String strURL = url.getFile();
    int index = 0;
    while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
        index += DISALLOW.length();
        String strPath = strCommands.substring(index);
        StringTokenizer st = new StringTokenizer(strPath);
        if (!st.hasMoreTokens())
            break;
        String strBadPath = st.nextToken();
        // if the URL starts with a disallowed path, it is not safe
        if (strURL.startsWith(strBadPath))
            return false;
    }
    return true;
}//end robot safe
/**
 * Replaces every {@code <...>} span in the text with a single space.
 *
 * A tag runs from a '<' to the first '>' that follows it. A '>' that is not
 * preceded by an opening '<' is ordinary text and is kept. If a '<' has no
 * closing '>', "ERROR:HTML FORMAT" is added to the crawler's URL list once
 * and the rest of the text is returned unchanged; the scan always moves
 * forward, so it terminates on any input.
 *
 * @param s text that may contain markup; must not be null
 * @return the text with each complete tag replaced by one space
 */
String avoidHTMLTag(String s){
    StringBuffer sb = new StringBuffer(s.length());
    int pos = 0;
    while (pos < s.length()) {
        int open = s.indexOf('<', pos);
        if (open == -1) {
            // no more tags: keep the remaining text as is
            sb.append(s.substring(pos));
            break;
        }
        int close = s.indexOf('>', open);
        if (close == -1) {
            // unterminated tag: report it and keep the tail instead of
            // retrying the same position forever
            CrawlerFrame.jListURL.add("ERROR:HTML FORMAT");
            sb.append(s.substring(pos));
            break;
        }
        // text before the tag, then one space standing in for the tag
        sb.append(s.substring(pos, open));
        sb.append(' ');
        pos = close + 1;
    }
    return sb.toString();
}//end of htmlavoid
/**
 * Crawl loop run on the crawler thread.
 *
 * Steps, in order:
 * 1. Load the document counter from c:/search/resources/crawlcount.txt.
 * 2. Register the URL typed into the address field, if it is valid, not yet
 *    in the crawl table and robot-safe.
 * 3. While the table reports pending records and this thread is still
 *    {@code CrawlerFrame.clThread}: fetch the next URL, store its text with
 *    punctuation and tags blanked out as
 *    c:/search/repository/doc&lt;counter&gt;.txt, and insert every reachable
 *    http link whose name maps to text/html into the table.
 * 4. Reset the buttons, write the counter back and clear
 *    {@code CrawlerFrame.clThread}.
 *
 * Any exception not handled inside the loop ends the crawl and is shown in
 * the status field as "ERROR:" plus its message.
 */
public void run()
{
    try
    {
        loadFileCounter();
        String strURL = CrawlerFrame.jTextFieldUrlAddress.getText();
        setStatus("CRAWLER STARTING....");
        CrawlerFrame.jListURL.removeAll();
        boolean condition;
        URL url;
        try
        {
            url = new URL(strURL);
            if (!tab.contains(strURL))
            {
                // test to make sure it is robot-safe!
                if (robotSafe(url))
                    tab.insertRecord(strURL);
            }
        }
        catch (MalformedURLException e)
        {
            if (!strURL.equals("")) {
                setStatus("ERROR: invalid URL " + strURL);
                CrawlerFrame.jTextFieldUrlAddress.setText("");
            }
        }
        while (((condition = tab.isRecordFalse()) || strURL.length() != 0)
                && (Thread.currentThread() == CrawlerFrame.clThread))
        {
            if (condition)
            {
                // presumably isRecordFalse() means "an uncrawled record
                // exists" -- confirm against CrawlTable
                strURL = tab.retrieveFirst();
                CrawlerFrame.jTextFieldUrlAddress.setText(strURL);
                tab.updateRecord(strURL);
                setStatus("searching " + strURL);
                CrawlerFrame.jListURL.add(strURL);
            }
            else
                strURL = "";
            if (strURL.length() == 0)
            {
                setStatus("Enter a starting URL then press RUN");
                break;
            }
            try
            {
                url = new URL(strURL);
            }
            catch (MalformedURLException e)
            {
                setStatus("ERROR: invalid URL " + strURL);
                tab.delete(strURL);
                strURL = "";
                continue;
            }
            tab.updateRecord(strURL);
            CrawlerFrame.jListURL.add(strURL);
            // can only search http: protocol URLs
            // NOTE(review): this break (and the three below) ends the whole
            // crawl rather than skipping one URL -- confirm that is intended.
            if (url.getProtocol().compareTo("http") != 0)
                break;
            // test to make sure it is robot-safe before searching
            if (!robotSafe(url))
                break;
            try
            {
                // Read through the configured connection so that the
                // user-interaction flag applies to the request actually made.
                URLConnection urlConnection = url.openConnection();
                urlConnection.setAllowUserInteraction(false);
                InputStream urlStream = urlConnection.getInputStream();
                String content;
                try
                {
                    String type
                        = URLConnection.guessContentTypeFromName(url.getFile());
                    if (type == null)
                        break;
                    if (type.compareTo("text/html") != 0)
                        break;
                    content = readPage(urlStream);
                }
                finally
                {
                    // close on every exit path, including the breaks above
                    try
                    {
                        urlStream.close();
                    }
                    catch (IOException ignored)
                    {
                        // the page is already read or abandoned; a failed
                        // close must not discard it
                    }
                }

                // blank out punctuation before the text is stored
                String fileString = content;
                fileString = fileString.replace('(', ' ');
                fileString = fileString.replace(')', ' ');
                fileString = fileString.replace(',', ' ');
                fileString = fileString.replace('.', ' ');
                fileString = fileString.replace(':', ' ');
                fileString = fileString.replace('?', ' ');
                fileString = fileString.replace('!', ' ');
                fileString = fileString.replace('@', ' ');
                fileString = fileString.replace('\'', ' ');
                fileString = fileString.replace('\"', ' ');
                fileString = strURL + " " + fileString;
                File htmlDoc = new File("c:/search/repository/doc" + fileCounter + ".txt");
                FileWriter fp = new FileWriter(htmlDoc);
                try
                {
                    fp.write(avoidHTMLTag(fileString));
                }
                finally
                {
                    fp.close();
                }
                fileCounter++;
                if (Thread.currentThread() != CrawlerFrame.clThread)
                    break;

                // scan the page for <a ... href=...> links
                String lowerCaseContent = content.toLowerCase();
                int index = 0;
                while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
                {
                    if ((index = lowerCaseContent.indexOf("href", index)) == -1)
                        break;
                    if ((index = lowerCaseContent.indexOf("=", index)) == -1)
                        break;
                    if (Thread.currentThread() != CrawlerFrame.clThread)
                        break;
                    index++;
                    CrawlTable.count++;
                    String remaining = content.substring(index);
                    StringTokenizer st
                        = new StringTokenizer(remaining, "\t\n\r\">#");
                    // "href=" may be the last thing on the page; there is
                    // then no link text left to read
                    if (!st.hasMoreTokens())
                        break;
                    String strLink = st.nextToken();
                    URL urlLink;
                    try
                    {
                        urlLink = new URL(url, strLink);
                        strLink = urlLink.toString();
                    }
                    catch (MalformedURLException e)
                    {
                        setStatus("ERROR: bad URL " + strLink);
                        tab.delete(strLink);
                        strURL = "";
                        continue;
                    }
                    // NOTE(review): a single non-http link (e.g. mailto:)
                    // stops link extraction for the rest of this page.
                    if (urlLink.getProtocol().compareTo("http") != 0)
                        break;
                    if (Thread.currentThread() != CrawlerFrame.clThread)
                        break;
                    try
                    {
                        // open the link only to verify that it is reachable
                        URLConnection urlLinkConnection
                            = urlLink.openConnection();
                        urlLinkConnection.setAllowUserInteraction(false);
                        InputStream linkStream = urlLinkConnection.getInputStream();
                        linkStream.close();
                        String strType
                            = URLConnection.guessContentTypeFromName(urlLink.getFile());
                        if (strType == null)
                            break;
                        // if another page, add to the end of search list
                        if (strType.compareTo("text/html") == 0) {
                            // skip URLs already searched or already queued
                            if (!tab.contains(strLink))
                            {
                                tab.insertRecord(strLink);
                            }
                        }
                    }
                    catch (IOException e)
                    {
                        setStatus("ERROR: couldn't open URL " + strLink);
                        continue;
                    }
                    if (strURL.length() == 0)
                    {
                        setStatus("Enter a starting URL then press RUN");
                        break;
                    }
                }//end of link loop
            }
            catch (IOException e)
            {
                setStatus("ERROR1: couldn't open URL " + strURL);
                tab.delete(strURL);
                strURL = "";
                continue;
            }
        }//end while
        setStatus("done");
        CrawlerFrame.jButtonStop.setEnabled(false);
        CrawlerFrame.jButtonRun.setEnabled(true);
        storeFileCounter();
        CrawlerFrame.clThread = null;
        // No Thread.stop() here: returning from run() ends the thread, and
        // stop() is deprecated and unsupported on current JDKs.
    }//end of try
    catch (Exception e)
    {
        setStatus("ERROR:" + e.getMessage());
    }
}//end of run

/** Reads the persisted document counter into fileCounter and Indexer.filePointerb4. */
private void loadFileCounter() throws IOException
{
    FileReader clCountRead = new FileReader("c:/search/resources/crawlcount.txt");
    try
    {
        StreamTokenizer countTok = new StreamTokenizer(clCountRead);
        // treat everything above the space character as one word
        countTok.resetSyntax();
        countTok.wordChars(33, 65535);
        countTok.whitespaceChars(0, ' ');
        countTok.eolIsSignificant(false);
        countTok.nextToken();
        Crawler.fileCounter = Integer.parseInt(countTok.sval);
        Indexer.filePointerb4 = Crawler.fileCounter;
    }
    finally
    {
        clCountRead.close();
    }
}

/** Writes fileCounter back to the counter file and records it in Indexer.filePointerafter. */
private void storeFileCounter() throws IOException
{
    FileWriter clCountWrite = new FileWriter("c:/search/resources/crawlcount.txt", false);
    try
    {
        Indexer.filePointerafter = fileCounter;
        clCountWrite.write(String.valueOf(fileCounter));
    }
    finally
    {
        clCountWrite.close();
    }
}

/** Reads the stream to its end, or until this thread stops being the active crawler, and decodes it once. */
private String readPage(InputStream in) throws IOException
{
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    byte[] chunk = new byte[1000];
    int numRead;
    while ((numRead = in.read(chunk)) != -1)
    {
        collected.write(chunk, 0, numRead);
        if (Thread.currentThread() != CrawlerFrame.clThread)
            break;
    }
    // an empty body yields "" rather than an exception
    return collected.toString();
}
/**
 * Sets the text of {@code CrawlerFrame.textStatus} to the given message.
 *
 * @param status message to display
 */
static void setStatus(final String status) {
    CrawlerFrame.textStatus.setText(status);
}
}//end of classCrawler
class CrawlTable
{
public static int count=1;
String connectionAddress1=
"jdbc:odbc:search";
String connectionAddress2=
"jdbc:odbc:search1";
String connectionAddress3=
"jdbc:odbc:search2";
String connectionAddress4=
"jdbc:odbc:search3";
Connection con1;
Connection con2;
Connection con3;
Connection con4;
Statement stmt;
ResultSet rs;
public void insertRecord(String urlAddress)
{
String insertString;
insertString="insert into CRAWLTABLE (URLADDRESS,ISCRAWLED)"+
" values('"+urlAddress+"','f')";
try
{
Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
}
catch(java.lang.ClassNotFoundException e)
{
System.err.print("ClassNotFoundException: ");
System.err.println(e.getMessage());
}
try
{
if(count == 1)
{
con1=DriverManager.getConnection(connectionAddress1,"","");
stmt=con1.createStatement();
stmt.executeUpdate(insertString);
stmt.close();
con1.close();
}
else if(count == 2)
{
con2=DriverManager.getConnection(connectionAddress2,"","");
stmt=con2.createStatement();
stmt.executeUpdate(insertString);
stmt.close();
con2.close();
}
else if(count == 3)
{
con3=DriverManager.getConnection(connectionAddress3,"","");
stmt=con3.createStatement();
stmt.executeUpdate(insertString);
stmt.close();
con3.close();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -