⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawler.java

📁 Grid based Search Engine
💻 JAVA
📖 第 1 页 / 共 2 页
字号:


import java.text.*;
import java.awt.*;
import java.awt.event.*;
import java.sql.*;
import java.util.*;
import java.net.*;
import java.io.*;




/**
 * Simple single-threaded web crawler driven from an AWT GUI (CrawlerFrame).
 * Starting from the URL typed into CrawlerFrame's text field, it fetches
 * pages, strips HTML tags, writes plain-text copies to
 * c:/search/repository/doc&lt;N&gt;.txt for a separate Indexer, and records
 * discovered links in a database-backed URL table (CrawlTable).
 *
 * NOTE(review): tightly coupled to CrawlerFrame's static widgets and to
 * hard-coded c:/search/... paths; the crawl runs on the single thread stored
 * in CrawlerFrame.clThread, and every loop checks thread identity so that
 * replacing clThread acts as a stop signal.
 */
public class Crawler implements  Runnable {

    // robots.txt directive keyword scanned for in robotSafe().
    public static final String DISALLOW = "Disallow:";
    //public static final int SEARCH_LIMIT = 150;
    // Sequence number for the next repository file (doc<N>.txt); loaded from
    // and persisted to c:/search/resources/crawlcount.txt across runs.
    public static int fileCounter=1;
    // public static count=1;
    // Shared table (JDBC/ODBC backed) of URLs already crawled or pending.
    CrawlTable tab;
    /**
     * Creates the URL table and installs HTTP proxy settings
     * (webcache-cup:8080) as JVM-wide system properties.
     * NOTE(review): http.proxySet is not a standard JDK property; presumably
     * harmless, but only http.proxyHost/http.proxyPort take effect — confirm.
     */
   	public Crawler() {

	tab=new CrawlTable();
	// Never pop up authentication dialogs while crawling.
	URLConnection.setDefaultAllowUserInteraction(false);
    Properties props= new Properties(System.getProperties());
    props.put("http.proxySet", "true");
    props.put("http.proxyHost", "webcache-cup");
    props.put("http.proxyPort", "8080");
    Properties newprops = new Properties(props);
    System.setProperties(newprops);

    }




//////////////////////////////////////////////////////////////////////////////////ROBOT SAFE


    /**
     * Returns true if robots.txt on url's host permits fetching url.
     * Downloads http://<host>/robots.txt and scans it for "Disallow:" paths;
     * url is rejected if its file part starts with any disallowed path.
     *
     * NOTE(review): ignores User-agent sections entirely — every Disallow in
     * the file is treated as applying to this crawler (conservative).
     * If robots.txt cannot be fetched (IOException) the URL is allowed.
     */
    boolean robotSafe(URL url) {
		String strHost = url.getHost();
		String strRobot = "http://" + strHost + "/robots.txt";// form URL of the robots.txt file
		URL urlRobot;
		try {
		    urlRobot = new URL(strRobot);
		} 
		catch (MalformedURLException e) {
	
		    // Malformed host: treat as unsafe rather than guessing.
		    return false;
		}
	
		String strCommands;
		try {
		    // Accumulate the whole robots.txt into strCommands, 1000 bytes
		    // at a time, bailing out early if the crawl thread was replaced.
		    // NOTE(review): new String(b, 0, numRead) throws if the very
		    // first read() returns -1 (empty stream) — the IOException catch
		    // below does not cover that IndexOutOfBounds case; confirm.
		    InputStream urlRobotStream = urlRobot.openStream();
		    byte b[] = new byte[1000];
		    int numRead = urlRobotStream.read(b);
		    strCommands = new String(b, 0, numRead);
		    while (numRead != -1) {
			if (Thread.currentThread() != CrawlerFrame.clThread)
			    break;
			numRead = urlRobotStream.read(b);
			if (numRead != -1) {
			    String newCommands = new String(b, 0, numRead);
			    strCommands += newCommands;
			}
		    }
		    urlRobotStream.close();
		} 
		catch (IOException e)
		{
		    // if there is no robots.txt file, it is OK to search
		    return true;
		}
	
		// assume that this robots.txt refers to us and
		// search for "Disallow:" commands.
		String strURL = url.getFile();
		int index = 0;
		while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
		    index += DISALLOW.length();
		    String strPath = strCommands.substring(index);
		    // First whitespace-delimited token after "Disallow:" is the path.
		    StringTokenizer st = new StringTokenizer(strPath);
	
		    if (!st.hasMoreTokens())
			break;
	
		    String strBadPath = st.nextToken();
	
		    // if the URL starts with a disallowed path, it is not safe
		    if (strURL.indexOf(strBadPath) == 0)
			return false;
		}//while end
	
		return true;
    }//end robot safe


    /**
     * Strips every <...> tag from s, replacing each tag with a single space,
     * and returns the result. On any failure an error entry is appended to
     * the GUI URL list and "" is returned.
     *
     * NOTE(review): the loop condition uses non-short-circuit '|', so both
     * indexOf calls always run; if a "<" has no matching ">" after it,
     * end becomes -1, end<start triggers 'continue' without advancing start,
     * and the loop appears able to spin forever — verify against a page with
     * an unterminated tag.
     */
    String avoidHTMLTag(String s){
                StringBuffer sb=new StringBuffer();
                sb.ensureCapacity((s.length())*2);
                sb.append(s);
                int start = 0;
                int end = 0;
                try{

					while (((start = sb.indexOf("<",start)) != -1)|((end = sb.indexOf(">",start)) != -1))
					{		try{
								if(end<start)
									continue;
						
								// Replace the whole tag (start..end inclusive)
								// with one space, then back start up so the
								// next search resumes just before the gap.
			                    sb.replace(start,end+1," ");
			                    start--;
			                    end=start;
			                   }
			                catch( Exception ex)
			                	{
			                		  CrawlerFrame.jListURL.add("ERROR:HTML FORMAT");
			                	}
			                }
			
			
					String s1=new String (sb);
                	return s1;
			    }
                catch(Exception e)
                {
                CrawlerFrame.jListURL.add("WRONG HTML FORMAT");
                return "";
                }
                
                
      }//end of htmlavoid


   
   
/**
 * Main crawl loop (Runnable entry point). Steps:
 *  1. Restore fileCounter from crawlcount.txt and publish it to Indexer.
 *  2. Seed the CrawlTable with the URL from the GUI text field (if robot-safe).
 *  3. While uncrawled URLs remain and this thread is still clThread:
 *     fetch the next http text/html page, strip punctuation and tags, save it
 *     as doc<fileCounter>.txt, then scan the raw content for <a href=...>
 *     links and insert unseen text/html links back into the table.
 *  4. Persist the final fileCounter and stop the thread.
 *
 * NOTE(review): Thread.currentThread().stop() is deprecated/unsafe; the
 * thread-identity checks already provide cooperative shutdown — confirm the
 * stop() call is needed at all.
 * NOTE(review): urlStream/fp are not closed on exception paths (leaks), and
 * several 'break's abandon the current page without closing its stream.
 */
public void run() 
{

	
	try
	{
                    	
	// --- Step 1: restore the persisted file counter. ---
    FileReader clCountRead=new FileReader("c:/search/resources/crawlcount.txt");
    StreamTokenizer countTok=new StreamTokenizer(clCountRead);
    countTok.resetSyntax();
	countTok.wordChars(33,65535);
	countTok.whitespaceChars(0,' ');
	countTok.eolIsSignificant(false);
    countTok.nextToken();
    Crawler.fileCounter=Integer.parseInt(countTok.sval);
    int filecount=Crawler.fileCounter;
    // Tell the Indexer where this crawl's documents begin.
    Indexer.filePointerb4=filecount;
	clCountRead.close();
	String strURL= CrawlerFrame.jTextFieldUrlAddress.getText();
	setStatus("CRAWLER STARTING....");
	CrawlerFrame.jListURL.removeAll();
	int counter=0;
	boolean condition;
	URL url;
	// --- Step 2: seed the table with the user-supplied start URL. ---
    try
    {
		url = new URL(strURL);
		if (!tab.contains(strURL)) 
		{
	    	// test to make sure it is robot-safe!
			if (robotSafe(url))
		    tab.insertRecord(strURL);
    	}
    }
    catch (MalformedURLException e) 
    {
		if(!strURL.equals("")){
		setStatus("ERROR: invalid URL " + strURL);
		CrawlerFrame.jTextFieldUrlAddress.setText("");
		}
	}
	

		
	// --- Step 3: crawl until no uncrawled records remain, or until the GUI
	// swaps out clThread (cooperative stop). ---
	while(((condition=tab.isRecordFalse())||strURL.length()!=0)&& (Thread.currentThread() == CrawlerFrame.clThread))
	{
			if(condition)
			{
				// Pull the next pending URL and mark it crawled.
				strURL = tab.retrieveFirst();
				CrawlerFrame.jTextFieldUrlAddress.setText(strURL);
				tab.updateRecord(strURL);
				setStatus("searching " + strURL);
		    	CrawlerFrame.jListURL.add(strURL);
		    				
			}
			else
				strURL="";			
			
			if (strURL.length() == 0) 
			{
		       	setStatus("Enter a starting URL then press RUN");
			    break;
			}
		
					    
		    try
		    {
			url = new URL(strURL);
		    }
		    catch (MalformedURLException e) 
		    {
			// Bad record in the table: drop it and move on.
			setStatus("ERROR: invalid URL " + strURL);
			tab.delete(strURL);
			//CrawlerFrame.jTextFieldUrlAddress.setText("");
			strURL="";
			continue;
		    }
	
		     tab.updateRecord(strURL);//sss
		     CrawlerFrame.jListURL.add(strURL); //ss
		    
		    // can only search http: protocol URLs
		    // NOTE(review): these 'break's abort the entire crawl rather
		    // than skipping just this URL — confirm that is intended.
		    if (url.getProtocol().compareTo("http") != 0)
			break;
	
		    // test to make sure it is before searching
		    if (!robotSafe(url))
			break;
	
		    try
		    {
			// try opening the URL
			URLConnection urlConnection = url.openConnection();
			urlConnection.setAllowUserInteraction(false);
			InputStream urlStream = url.openStream();
			// Only pages whose name looks like text/html are processed.
			String type
			  = URLConnection.guessContentTypeFromName(url.getFile());
			  if (type == null)
			    break;
			if (type.compareTo("text/html") != 0)
			    break;
	
			// Read the whole page into 'content', 1000 bytes at a time.
			// NOTE(review): same first-read==-1 hazard as in robotSafe();
			// also byte-to-String conversion uses the platform charset and
			// can split multi-byte characters at buffer boundaries.
			byte b[] = new byte[1000];
			int numRead = urlStream.read(b);
			String content = new String(b, 0, numRead);
			while (numRead != -1)
			{
			    if (Thread.currentThread() != CrawlerFrame.clThread)
				break;
			    numRead = urlStream.read(b);
			    if (numRead != -1)
			    {
				String newContent = new String(b, 0, numRead);
				content += newContent;
	            }
			}
			// Normalize punctuation to spaces so the Indexer sees bare
			// words, prefix the source URL, strip tags, and persist the
			// page as the next repository document.
	        String fileString=content;
            fileString=fileString.replace('(',' ');
            fileString=fileString.replace(')',' ');
            fileString=fileString.replace(',',' ');
            fileString=fileString.replace('.',' ');
            fileString=fileString.replace(':',' ');
            fileString=fileString.replace('?',' ');
            fileString=fileString.replace('!',' ');
            fileString=fileString.replace('@',' ');
            fileString=fileString.replace('\'',' ');
            fileString=fileString.replace('\"',' ');
					  fileString=strURL+" "+fileString;
            //fileString.replace('',' ');
            File htmlDoc=new File("c:/search/repository/doc"+fileCounter+".txt");
            FileWriter fp=new FileWriter(htmlDoc);
            fp.write(avoidHTMLTag(fileString));
            //fp.write(fileString);
            fp.close();
            fileCounter++;
			urlStream.close();
			if (Thread.currentThread() != CrawlerFrame.clThread)
			    break;
			// Scan the (un-normalized) page for <a href=...> links and
			// queue any unseen text/html targets.
			String lowerCaseContent = content.toLowerCase();
			int index = 0;
			while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
			{  
			    if ((index = lowerCaseContent.indexOf("href", index)) == -1)
				break;
			
			    if ((index = lowerCaseContent.indexOf("=", index)) == -1)
				break;
			
			    if (Thread.currentThread() !=CrawlerFrame.clThread)
				break;
			
			    index++;
			    CrawlTable.count++;
			    String remaining = content.substring(index);
			    
	
			    // The link text runs up to whitespace, a quote, '>' or '#'.
			    StringTokenizer st
			      = new StringTokenizer(remaining, "\t\n\r\">#");
			    String strLink = st.nextToken();
	
			    URL urlLink;
			    try
			    {
				// Resolve relative links against the current page.
				urlLink = new URL(url, strLink);
				strLink = urlLink.toString();
			    }
			    catch (MalformedURLException e) 
			    {
				setStatus("ERROR: bad URL " + strLink);
				tab.delete(strLink);
				//CrawlerFrame.jTextFieldUrlAddress.setText("");
				strURL="";
				continue;
				}
		
		
				if (urlLink.getProtocol().compareTo("http") != 0)
					break;
		
				if (Thread.currentThread() != CrawlerFrame.clThread)
					break;
		
				try 
				{
					// try opening the URL
					URLConnection urlLinkConnection
					  = urlLink.openConnection();
					urlLinkConnection.setAllowUserInteraction(false);
					InputStream linkStream = urlLink.openStream();
					// NOTE(review): guessContentTypeFromName is a static
					// method of URLConnection invoked via an instance here;
					// it inspects only the file name, not the response.
					String strType
					  = urlLinkConnection.guessContentTypeFromName(urlLink.getFile());
					linkStream.close();
		
					// if another page, add to the end of search list
					if (strType == null)
					    break;
					if (strType.compareTo("text/html") == 0) {
					    // check to see if this URL has already been
					    // searched or is going to be searched
					    //////////////////////////////////////////////////////
					if (!tab.contains(strLink)) 
					{
					    	
						// test to make sure it is robot-safe!
						//if (robotSafe(urlLink))
						    tab.insertRecord(strLink);
			        }
				}
		
				   
				} 
				catch (IOException e) 
				{
					setStatus("ERROR: couldn't open URL " + strLink);
					continue;
				}
				if (strURL.length() == 0) 
				{
		       		setStatus("Enter a starting URL then press RUN");
			    	/////return;
			    	break;
				}
					
				}//end of try
			    } catch (IOException e) 
			    {
				// Page fetch failed: drop the record and continue crawling.
				setStatus("ERROR1: couldn't open URL " + strURL);
		        tab.delete(strURL);
		        //CrawlerFrame.jTextFieldUrlAddress.setText("");
		        strURL="";
		        continue;
			    }
		

	}//end while
                    
	// --- Step 4: persist the counter, reset the GUI, and stop. ---
	    setStatus("done");
        CrawlerFrame.jButtonStop.setEnabled(false);
        CrawlerFrame.jButtonRun.setEnabled(true);
        FileWriter clCountWrite=new FileWriter("c:/search/resources/crawlcount.txt",false);
		Integer count=new Integer(fileCounter);
		Indexer.filePointerafter=fileCounter;
		clCountWrite.write(count.toString(),0,count.toString().length());
		clCountWrite.close();	   
        CrawlerFrame.clThread = null;
	Thread.currentThread().stop();
	}//end of try
	catch(Exception e)
	{
		setStatus("ERROR:"+e.getMessage());
	}	

  }//end of run

    // Shows a status message in the GUI status bar.
    static void setStatus(String status) 
    {
	CrawlerFrame.textStatus.setText(status);
    }

}//end of classCrawler


class CrawlTable
	{
	public static int count=1;
	String connectionAddress1=
		"jdbc:odbc:search";
	String connectionAddress2=
		"jdbc:odbc:search1";
	String connectionAddress3=
		"jdbc:odbc:search2";
	String connectionAddress4=
		"jdbc:odbc:search3";
	Connection con1;
	Connection con2;
	Connection con3;
	Connection con4;
	Statement stmt;
	ResultSet rs;
  	public void insertRecord(String urlAddress)
   		{

		String insertString;
		insertString="insert into CRAWLTABLE (URLADDRESS,ISCRAWLED)"+
		" values('"+urlAddress+"','f')";
		

        try
        	{
        		Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
        	}
        catch(java.lang.ClassNotFoundException e)
        	{
        		System.err.print("ClassNotFoundException: ");
        		System.err.println(e.getMessage());
            }

        try
        	{               
	     if(count == 1)
	           {
        		con1=DriverManager.getConnection(connectionAddress1,"","");
        		stmt=con1.createStatement();
		 
        		stmt.executeUpdate(insertString);

        		stmt.close();
        		con1.close();
	           }

	     else if(count == 2)
	           {
        		con2=DriverManager.getConnection(connectionAddress2,"","");
        		stmt=con2.createStatement();
		
        		stmt.executeUpdate(insertString);
        		stmt.close();
        		con2.close();
	           }
	     else if(count == 3)
	           {
        		con3=DriverManager.getConnection(connectionAddress3,"","");
        		stmt=con3.createStatement();
		 
        		stmt.executeUpdate(insertString);
        		stmt.close();
        		con3.close();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -