
📄 crawler.java

📁 Crawls all web pages within a given scope using breadth-first traversal; can be used to build a search engine or to find network errors (broken links).
💻 JAVA
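The class posted below depends on helper types (Link, HtmlPage, LinkProducerConsumer, PgDBLogger, PsqlPool, MyPrintStream) that are not shown on this page. For orientation only, here is a minimal, self-contained sketch of the breadth-first idea the description refers to, using just the JDK (requires Java 9+); the queue-plus-visited-set structure and the regex link extraction are illustrative assumptions, not code from this project:

// BfsCrawlSketch.java -- illustrative only, not part of crawler.java
import java.net.URI;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BfsCrawlSketch {
	// Very rough href extractor; a real crawler would use an HTML parser.
	private static final Pattern HREF = Pattern.compile("href=\"(http[^\"]+)\"");

	public static void crawl(String startUrl, int maxPages) {
		Queue<String> frontier = new ArrayDeque<String>();	// FIFO queue => breadth-first order
		Set<String> visited = new HashSet<String>();
		frontier.add(startUrl);
		while (!frontier.isEmpty() && visited.size() < maxPages) {
			String url = frontier.poll();
			if (!visited.add(url)) continue;	// already fetched
			String html;
			try (java.io.InputStream in = URI.create(url).toURL().openStream()) {
				html = new String(in.readAllBytes());
			} catch (Exception e) {
				System.out.println("broken link / network error: " + url);
				continue;
			}
			Matcher m = HREF.matcher(html);
			while (m.find()) frontier.add(m.group(1));	// enqueue newly discovered links
		}
	}

	public static void main(String[] args) {
		crawl(args.length > 0 ? args[0] : "http://example.com/", 50);
	}
}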
package WebCrawler;


import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.net.*;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
import java.util.Vector;

import log.LogItem;
import log.PgDBLogger;

import html.HtmlPage;
import io.MyPrintStream;
import db.PsqlPool;

/**
 * Represents a Web crawler.
 *
 * @author  SeungJin Lim
 * @version 1.0, 2006/10/23
 * @since   JDK1.5
 */
public class Crawler {
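	/**
	 * Builds a crawler configured from the given properties resource.
	 * The database password is supplied separately (from the command line).
	 */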
	
	public Crawler( String properties, String pwd ) 
	throws MalformedURLException {
		this.dbPassword = pwd;
		// obtain program parameters.
		processProperties( properties );
		// initialize PsqlPool.
		checkDatabase();
		// create an instance of PgDBLogger.
		logger = new PgDBLogger();
	}
	public void setPrintStream( PrintStream out ) {
		this.out = out;
	}
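	/**
	 * Starts crawling from the configured start URL when one is set;
	 * otherwise falls back to the link entries already stored in the database.
	 */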
	public void startWithURL() {
		if( null!=getSourceURL() ) {
			//this.startWith( slink.getContext(), slink.getURL().toString() );
			this.startWith( getSourceContext(), getSourceURL().toString() );
		}
		else {
			startWithDBEntries();
		}
	}
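	/**
	 * Parses the page at the starting URL, logs every same-domain HTTP/HTML
	 * link found on it through the PgDBLogger, and then continues crawling
	 * from the database entries.
	 */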
	public void startWith( URL context, String source ) {
		//this.source = new Link( context, source, true );
		
		// 1. Parse the starting URL:
		HtmlPage hp = null;
		try {
			hp = new HtmlPage( getSourceURL(), out );
		} catch (IOException e) {
			MyPrintStream.out.println("[Crawler.startWith] HtmlPage error: " + getSourceURL());
			return;	// cannot parse the starting page; nothing to crawl from
		}
		Vector<Link> links = hp.links();
		if( null==links || null==getSourceURL() ) {
			MyPrintStream.out.println("[Crawler.startWith] No links found in "+source );
			return;
		} // if
		
		// 2. Store the links found in the starting URL into the database:
		// The source link is the same for every target found on this page.
		Link sourceLink = new Link( getSourceContext(), getSourceURL().toString() );
		for( int i=0; i<links.size(); i++ ) {
			Link target = links.elementAt(i);
			
			// Skip missing targets, non-HTML/non-HTTP URLs, and links outside the source domain:
			if( null==target || null==target.getURL()
					|| !target.isHTML() || !target.isHTTP()
					|| !sourceLink.getDomain().equals(target.getDomain()) ) continue;
			
			LogItem item = new LogItem( sourceLink, target, "null" );
			try {
				logger.put( item );
				MyPrintStream.out.println("[Crawler.startWith] put: " + item.toString());
			} catch (InterruptedException e) {
				MyPrintStream.out.println("[Crawler.startWith] put error: " + item.toString());
			}
		} // for i
		
		// 3. Continue to crawl the rest world:
		startWithDBEntries();
	}
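	/**
	 * Crawls the link entries stored in the database with a pool of
	 * producer/consumer worker threads, lets them run for a fixed interval,
	 * and then interrupts them so they shut down.
	 */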
	public void startWithDBEntries() {
		getLogger().setAvailable( true );
		
		// 3. Continue to crawl the rest world:
		LinkProducerConsumer p1 = new LinkProducerConsumer(getLogger(), 1);
		LinkProducerConsumer p2 = new LinkProducerConsumer(getLogger(), 2);
		LinkProducerConsumer p3 = new LinkProducerConsumer(getLogger(), 3);
		LinkProducerConsumer p4 = new LinkProducerConsumer(getLogger(), 4);
		LinkProducerConsumer p5 = new LinkProducerConsumer(getLogger(), 5);

		p1.start();
		p2.start();
		p3.start();
		p4.start();
		p5.start();

		try {
			// Let the crawl run for 5 seconds.
			Thread.sleep(5000L);
		} catch (InterruptedException ie) {
			// If this thread is interrupted before the timeout elapses,
			// fall through and shut down the worker threads below.
		} finally {
			// Instruct the producer and consumer threads to
			// complete.
			p1.interrupt();
			p2.interrupt();
			p3.interrupt();
			p4.interrupt();
			p5.interrupt();
		}
	}
	public String getDatabaseServerName() {
		return dbServerName;
	}
	public String getDatabasePort() {
		return dbPort;
	}
	public String getDatabaseName() {
		return dbName;
	}
	public String getDatabaseUserName() {
		return dbUserName;
	}
	public String getDatabasePassword() {
		return dbPassword;
	}
	public PgDBLogger getLogger() {
		return logger;
	}
//	public Link getSourceLink() {
//		return source;
//	}
	public URL getSourceURL() {
		return startUrl;
	}
	public URL getSourceContext() {
		return startUrlContext;
	}
	
	public String toString() {
		if( getSourceURL()!=null ) return getSourceURL().toString();
		else return "";	// toString() should not return null
	}
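	/**
	 * Reads the crawler settings (database connection parameters, start URL
	 * and context, optional log file) from the named resource bundle.
	 */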
	

	private void processProperties( String resource )
	throws MalformedURLException {
		System.out.println("[Crawler] Processing properties ...");
		Locale currentLocale = Locale.getDefault();
		ResourceBundle properties = null;
		try {
			properties = ResourceBundle.getBundle(resource, currentLocale);
			dbServerName = properties.getString("database_server_name");
			System.out.println( "\tdatabase server: "+dbServerName );
			dbPort = properties.getString("database_server_port");
			System.out.println( "\tdatabase port: "+getDatabasePort() );
			dbName = properties.getString("database_name");
			System.out.println( "\tdatabase name: "+getDatabaseName() );
			dbUserName = properties.getString("database_user_name");
			System.out.println( "\tdatabase user name: "+getDatabaseUserName() );
			// The database password is intentionally not echoed to the console.
		} catch (MissingResourceException ex) {
			System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
			System.exit(1);
		}

		try {
			startUrlContext = new URL(properties.getString("start_url_context"));
			System.out.println( "\tstart URL context: "+startUrlContext );
			
			//this.source = new Link( new URL(startUrlContext), startUrl, false );
		} catch (MissingResourceException ex) {
			System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
			System.exit(1);
		}
		
		try {
			startUrl = new URL(startUrlContext, properties.getString("start_url"));
			System.out.println( "\tstart URL: "+startUrl );
		} catch (MissingResourceException ex) {
			startUrl = null;
			System.out.println( "\tstart URL: NULL. Will process DB link entries." );
		}
		
		try {
			String logFile = properties.getString("log_file");
			System.out.println( "\tlog file: "+logFile );
			MyPrintStream.set( new PrintStream(logFile) );
		} catch (MissingResourceException ex) {
			//System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
			MyPrintStream.set( System.out );
		} catch (FileNotFoundException e) {
			e.printStackTrace( System.out );
		}
	}
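	/**
	 * Initializes the PostgreSQL connection pool with the configured
	 * connection parameters.
	 */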
	private void checkDatabase() {
		System.out.print("[Crawler] Checking database ... ");
		// Initialize PsqlPool:
		PsqlPool.init("jdbc:postgresql://" + getDatabaseServerName() + ":" + getDatabasePort() + "/" + getDatabaseName(), getDatabaseUserName(), getDatabasePassword());
		System.out.println("OK.");
	}
		
	
	public static void usage() {
		System.out.println(">>>>>Usage:");
		System.out.println("  java -cp \"crawler.jar\" Crawler <option>");
//		System.out.println("	-s<starting URL>");
		System.out.println("	-p<password to the crawler database>");
		System.out.println("	-c<properties file name if it is not crawler.properties>");
		System.out.println("	-o<output log file name (optional)>");
	}

	/*------------------------------------------------------------------------
	**	Program starts here
	*/
    public static void main( String [] args )
    throws MalformedURLException,IOException {
		/*-------------------------------------------------------------------
		**	Parse command-line arguments
		*/
	    String	outputfile  = null;
	    String	propertiesfile  = null;
//	    String	startURL  = null;
	    String	pwd  = null;
	    //boolean resolveFilename = false;

   		for (int i=0; i<args.length; i++) {
   			// password
			if( args[i].startsWith("-p") ) 
			{
				pwd = args[i].substring(2);
			}
//			// starting URL
//			else if( args[i].startsWith("-s") ) 
//			{
//				startURL = args[i].substring(2);
//			}
			// output log file
			else if( args[i].startsWith("-o") ) 
			{
				outputfile = args[i].substring(2);
			}
			// properties file
			else if( args[i].startsWith("-c") ) 
			{
				propertiesfile = args[i].substring(2);
			}
	    }


		/*--------------------------------------------------------------------
		**	Error checking
		*/
   		//if(pwd==null || startURL==null) {
   		if( pwd==null ) {
   			usage();
			System.exit(1);
   		}

   		
		/*--------------------------------------------------------------------
		**	Process
		*/
   		PrintStream out = null;
		if(outputfile!=null) {
			out = new PrintStream(outputfile);
		}
		else {
			out = System.out;
		}

		//BufferedReader reader = new BufferedReader(new InputStreamReader( System.in ));
		//System.out.print( "Enter database password: ");
		//String pwd = reader.readLine(); 
		//reader.close();

		Crawler crawler;
		crawler = new Crawler( null==propertiesfile ? "crawler" : propertiesfile, pwd.trim() );
		crawler.setPrintStream(out);
		if( null!=crawler.getSourceURL() )
			crawler.startWithURL();
		else
			crawler.startWithDBEntries();
		
		if( out!=System.out ) out.close();	// do not close System.out
    } // main()
	
	/**
	 * Link representing the starting URL.
	 */
	//private Link source;
	private String dbServerName;
	private String dbPort;
	private String dbName;
	private String dbUserName;
	private String dbPassword;
	private URL startUrl;
	private URL startUrlContext;
	private PrintStream out;
	private PgDBLogger logger; 
	//private boolean _DEBUG = false;
	
}
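For reference, processProperties() reads its settings from a resource bundle (crawler.properties on the classpath when no -c option is given). The keys below are the ones the code looks up; all values are placeholders, and start_url / log_file may be omitted (without start_url the crawler works from the link entries already in the database, and without log_file output goes to System.out):

# crawler.properties -- sample values only
database_server_name=localhost
database_server_port=5432
database_name=crawlerdb
database_user_name=crawler
start_url_context=http://example.com/
start_url=index.html
log_file=crawler.log

The database password is not stored in the properties file; it is passed on the command line with -p, for example:

java -cp "crawler.jar" WebCrawler.Crawler -pSECRET -ccrawler -ocrawler.out

Note that the -c value is handed to ResourceBundle.getBundle(), so it should be a bundle base name such as crawler rather than a file name, and since the class is declared in package WebCrawler the fully qualified name WebCrawler.Crawler is needed even though usage() prints only Crawler.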
