📄 crawler.java
字号:
/**
* Represents a Web crawler.
*
* @author SeungJin Lim
* @version 1.0, 2006/10/23
* @since JDK1.5
*/
package WebCrawler;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.net.*;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
import java.util.Vector;
import log.LogItem;
import log.PgDBLogger;
import html.HtmlPage;
import io.MyPrintStream;
import db.PsqlPool;
public class Crawler {
public Crawler( String properties, String pwd )
throws MalformedURLException {
this.dbPassword = pwd;
// obtain program parameters.
processProperties( properties );
// initialize PsqlPool.
checkDatabase();
// create an instance of PgDBLogger.
logger = new PgDBLogger();
}
public void setPrintStream( PrintStream out ) {
this.out = out;
}
public void startWithURL() {
if( null!=getSourceURL() ) {
//this.startWith( slink.getContext(), slink.getURL().toString() );
this.startWith( getSourceContext(), getSourceURL().toString() );
}
else {
startWithDBEntries();
}
}
public void startWith( URL context, String source ) {
//this.source = new Link( context, source, true );
// 1. Parse the starting URL:
HtmlPage hp = null;
try {
//hp = new HtmlPage( getSourceLink().getURL(), out );
hp = new HtmlPage( getSourceURL(), out );
} catch (IOException e) {
MyPrintStream.out.println("[Crawler.startWith] HtmlPage error: " + getSourceURL());
}
Vector<Link> links = hp.links();
if( null==links || null==getSourceURL() ) {
MyPrintStream.out.println("[Crawler.startWith] No links found in "+source );
return;
} // if
// 2. Store the links found in the starting URL into the database:
for( int i=0; i<links.size(); i++ ) {
Link target = links.elementAt(i);
// Do not process non HTML URLs nor non HTTP URLs:
Link sourceLink = new Link( getSourceContext(), getSourceURL().toString() );
if( null!=target && (!target.isHTML() || !target.isHTTP() || !sourceLink.getDomain().equals(target.getDomain())) ) continue;
if( null!=target.getURL() ) {
LogItem item = new LogItem( sourceLink, target, "null" );
try {
logger.put( item );
MyPrintStream.out.println("[Crawler.startWith] put: " + item.toString());
} catch (InterruptedException e) {
MyPrintStream.out.println("[Crawler.startWith] put error: " + item.toString());
}
} // if null
} // for i
// 3. Continue to crawl the rest world:
startWithDBEntries();
}
public void startWithDBEntries() {
getLogger().setAvailable( true );
// 3. Continue to crawl the rest world:
LinkProducerConsumer p1 = new LinkProducerConsumer(getLogger(), 1);
LinkProducerConsumer p2 = new LinkProducerConsumer(getLogger(), 2);
LinkProducerConsumer p3 = new LinkProducerConsumer(getLogger(), 3);
LinkProducerConsumer p4 = new LinkProducerConsumer(getLogger(), 4);
LinkProducerConsumer p5 = new LinkProducerConsumer(getLogger(), 5);
//p1.start();
//p2.start();
try {
// Let the experiment run for 10 seconds.
Thread.sleep(5000L);
} catch (InterruptedException ie) {
// If this thread is terminated within 60 seconds
// then shutdown the producer and consumer threads
// that are currently running.
} finally {
// Instruct the producer and consumer threads to
// complete.
p1.interrupt();
p2.interrupt();
p3.interrupt();
p4.interrupt();
p5.interrupt();
}
}
public String getDatabaseServerName() {
return dbServerName;
}
public String getDatabasePort() {
return dbPort;
}
public String getDatabaseName() {
return dbName;
}
public String getDatabaseUserName() {
return dbUserName;
}
public String getDatabasePassword() {
return dbPassword;
}
public PgDBLogger getLogger() {
return logger;
}
// public Link getSourceLink() {
// return source;
// }
public URL getSourceURL() {
return startUrl;
}
public URL getSourceContext() {
return startUrlContext;
}
public String toString() {
//if( getSourceLink()!=null ) return getSourceLink().toString();
if( getSourceURL()!=null ) return getSourceURL().toString();
else return null;
}
private void processProperties( String resource )
throws MalformedURLException {
System.out.println("[Crawler] Processing properties ...");
Locale currentLocale = Locale.getDefault();
ResourceBundle properties = null;
try {
properties = ResourceBundle.getBundle(resource, currentLocale);
dbServerName = properties.getString("database_server_name");
System.out.println( "\tdatabase server: "+dbServerName );
dbPort = properties.getString("database_server_port");
System.out.println( "\tdatabase port: "+getDatabasePort() );
dbName = properties.getString("database_name");
System.out.println( "\tdatabase name: "+getDatabaseName() );
dbUserName = properties.getString("database_user_name");
System.out.println( "\tdatabase user name: "+getDatabaseUserName() );
System.out.println( "\tdatabase password: "+getDatabasePassword() );
} catch (MissingResourceException ex) {
System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
System.exit(1);
}
try {
startUrlContext = new URL(properties.getString("start_url_context"));
System.out.println( "\tstart URL context: "+startUrlContext );
//this.source = new Link( new URL(startUrlContext), startUrl, false );
} catch (MissingResourceException ex) {
System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
System.exit(1);
}
try {
startUrl = new URL(startUrlContext, properties.getString("start_url"));
System.out.println( "\tstart URL: "+startUrl );
} catch (MissingResourceException ex) {
startUrl = null;
System.out.println( "\tstart URL: NULL. Will process DB link entries." );
}
try {
String logFile = properties.getString("log_file");
System.out.println( "\tlog file: "+logFile );
MyPrintStream.set( new PrintStream(logFile) );
} catch (MissingResourceException ex) {
//System.err.println( "[Crawler.processProperties] MissingResourceException: "+ex.getMessage());
MyPrintStream.set( System.out );
} catch (FileNotFoundException e) {
e.printStackTrace( System.out );
}
}
private void checkDatabase() {
System.out.print("[Crawler] Checking database ... ");
// Initialize PsqlPool:
PsqlPool.init("jdbc:postgresql://" + getDatabaseServerName() + ":" + getDatabasePort() + "/" + getDatabaseName(), getDatabaseUserName(), getDatabasePassword());
System.out.println("OK.");
}
public static void usage() {
System.out.println(">>>>>Usage:");
System.out.println(" java -cp \"crawler.jar\" Crawler <option>");
// System.out.println(" -s<starting URL>");
System.out.println(" -p<password to the crawler database>");
System.out.println(" -c<properties file name if it is not crawler.properties>");
System.out.println(" -o<output log file name (optional)>");
}
/*------------------------------------------------------------------------
** Program starts here
*/
public static void main( String [] args )
throws MalformedURLException,IOException {
/*-------------------------------------------------------------------
** Parse command-line arguments
*/
String outputfile = null;
String propertiesfile = null;
// String startURL = null;
String pwd = null;
//boolean resolveFilename = false;
for (int i=0; i<args.length; i++) {
// password
if( args[i].startsWith("-p") )
{
pwd = args[i].substring(2);
}
// // starting URL
// else if( args[i].startsWith("-s") )
// {
// startURL = args[i].substring(2);
// }
// output log file
else if( args[i].startsWith("-o") )
{
outputfile = args[i].substring(2);
}
// properties file
else if( args[i].startsWith("-c") )
{
propertiesfile = args[i].substring(2);
}
}
/*--------------------------------------------------------------------
** Error checking
*/
//if(pwd==null || startURL==null) {
if( pwd==null ) {
usage();
System.exit(1);
}
/*--------------------------------------------------------------------
** Process
*/
PrintStream out = null;
if(outputfile!=null) {
out = new PrintStream(outputfile);
}
else {
out = System.out;
}
//BufferedReader reader = new BufferedReader(new InputStreamReader( System.in ));
//System.out.print( "Enter database password: ");
//String pwd = reader.readLine();
//reader.close();
Crawler crawler;
crawler = new Crawler( null==propertiesfile ? "crawler" : propertiesfile, pwd.trim() );
crawler.setPrintStream(out);
if( null!=crawler.getSourceURL() )
crawler.startWithURL();
else
crawler.startWithDBEntries();
out.close();
} // main()
/**
* Link representing the starting URL.
*/
//private Link source;
private String dbServerName;
private String dbPort;
private String dbName;
private String dbUserName;
private String dbPassword;
private URL startUrl;
private URL startUrlContext;
private PrintStream out;
private PgDBLogger logger;
//private boolean _DEBUG = false;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -