📄 sqlworkloadmanager.java
字号:
/* * Encog Neural Network and Bot Library for Java v1.x * http://www.heatonresearch.com/encog/ * http://code.google.com/p/encog-java/ * * Copyright 2008, Heaton Research Inc., and individual contributors. * See the copyright.txt in the distribution for a full listing of * individual contributors. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */package org.encog.bot.spider.workload.sql;import java.net.MalformedURLException;import java.net.URL;import java.sql.ResultSet;import java.sql.SQLException;import java.util.concurrent.CountDownLatch;import java.util.concurrent.Semaphore;import java.util.concurrent.TimeUnit;import java.util.logging.Level;import java.util.logging.Logger;import org.encog.bot.spider.Spider;import org.encog.bot.spider.workload.WorkloadError;import org.encog.bot.spider.workload.WorkloadManager;import org.encog.util.db.DBError;import org.encog.util.db.RepeatableStatement;import org.encog.util.db.RepeatableConnection;/** * SQLWorkloadManager: This workload manager stores the URL lists in an SQL * database. 
This workload manager uses two tables, which can be created as
 * follows:
 *
 * CREATE TABLE 'spider_host' ( 'host_id' int(10) unsigned NOT NULL
 * auto_increment, 'host' varchar(255) NOT NULL default '', 'status' varchar(1)
 * NOT NULL default '', 'urls_done' int(11) NOT NULL, 'urls_error' int(11) NOT
 * NULL, PRIMARY KEY ('host_id') )
 *
 * CREATE TABLE 'spider_workload' ( 'workload_id' int(10) unsigned NOT NULL
 * auto_increment, 'host' int(10) unsigned NOT NULL, 'url' varchar(2083) NOT
 * NULL default '', 'status' varchar(1) NOT NULL default '', 'depth' int(10)
 * unsigned NOT NULL, 'url_hash' int(11) NOT NULL, 'source_id' int(11) NOT NULL,
 * PRIMARY KEY ('workload_id'), KEY 'status' ('status'), KEY 'url_hash'
 * ('url_hash'), KEY 'host' ('host') )
 */
public class SQLWorkloadManager implements WorkloadManager {

    /**
     * The logger. NOTE(review): the logger name below does not match this
     * class's package (org.encog.bot.spider.workload.sql) - confirm that is
     * intended.
     */
    private static Logger logger = Logger.getLogger(
            "com.heatonresearch.httprecipes.spider.workload.sql.SQLWorkloadManager");

    /**
     * The value used to reduce URL hash codes. computeHash() applies it with
     * the % operator rather than as a bit mask.
     */
    public static final int HASH_MASK = 0xffff;

    /**
     * The SQL holder to use. NOTE(review): created directly here rather than
     * through createSQLHolder() - confirm that subclass overrides are honoured
     * elsewhere.
     */
    private final SQLHolder holder = new SQLHolder();

    /**
     * Prepared statement to clear the workload; executed first by clear().
     */
    private RepeatableStatement stmtClear;

    /**
     * Prepared statement to clear the hosts; executed second by clear().
     */
    private RepeatableStatement stmtClear2;

    /**
     * Prepared statement to add a workload entry. add() executes it with the
     * host id, URL, status, depth, URL hash and source id.
     */
    private RepeatableStatement stmtAdd;

    /**
     * Prepared statement to add a host. add() executes it with the host name,
     * a waiting status and two zero values when no host id exists yet.
     */
    private RepeatableStatement stmtAdd2;

    /**
     * Prepared statement to get work.
     */
    private RepeatableStatement stmtGetWork;

    /**
     * Prepared statement to get work (second statement; its usage is not
     * visible in this part of the file).
     */
    private RepeatableStatement stmtGetWork2;

    /**
     * Prepared statement related to the workload being empty - presumably it
     * tests for emptiness rather than deleting rows; TODO confirm against its
     * usage.
     */
    private RepeatableStatement stmtWorkloadEmpty;

    /**
     * Prepared statement to set the status of a URL - presumably, going by the
     * field name; verify against its SQL.
     */
    private RepeatableStatement stmtSetWorkloadStatus;

    /**
     * Prepared statement to set the status of a URL.
     */
    private RepeatableStatement stmtSetWorkloadStatus2;

    /**
     * Prepared statement to get the depth of a URL. getDepth() queries it by
     * URL hash and reads the URL from column 1 and the depth from column 2.
     */
    private RepeatableStatement stmtGetDepth;

    /**
     * Prepared statement to get the source of a URL.
     */
    private RepeatableStatement stmtGetSource;

    /**
     * Prepared statement to resume.
     */
    private RepeatableStatement stmtResume;

    /**
     * Prepared statement to resume (second statement; its usage is not visible
     * in this part of the file).
     */
    private RepeatableStatement stmtResume2;

    /**
     * Prepared statement to get a URL's id.
     */
    private RepeatableStatement stmtGetWorkloadID;

    /**
     * Prepared statement to get a host id.
     */
    private RepeatableStatement stmtGetHostID;

    /**
     * Prepared statement to get the next host.
     */
    private RepeatableStatement stmtGetNextHost;

    /**
     * Prepared statement to set a host's status. add() executes it with the
     * new status and the host id.
     */
    private RepeatableStatement stmtSetHostStatus;

    /**
     * Prepared statement to get a host. getHost() queries it by host id and
     * reads the host name from column 1.
     */
    private RepeatableStatement stmtGetHost;

    /**
     * Only one thread at a time is allowed to add to the workload. add()
     * acquires this semaphore on entry and releases it on exit.
     */
    private Semaphore addLock;

    /**
     * Is there any work? add() counts this latch down after inserting a URL.
     */
    private CountDownLatch workLatch;

    /**
     * The maximum size a URL can be. convertURL() treats -1 as "no limit".
     */
    private int maxURLSize;

    /**
     * The maximum size that a host can be.
     */
    private int maxHostSize;

    /**
     * Used to obtain the next URL.
     */
    private RepeatableStatement.Results workResultSet = null;

    /**
     * Used to obtain the next host.
     */
    private RepeatableStatement.Results hostResultSet = null;

    /**
     * A connection to a JDBC database.
     */
    private RepeatableConnection connection;

    /**
     * The current host.
     */
    private String currentHost;

    /**
     * The ID of the current host; -1 until add() assigns the first host.
     */
    private int currentHostID = -1;

    /**
     * Add the specified URL to the workload.
     *
     * @param url
     *            The URL to be added.
     * @param source
     *            The page that contains this URL.
     * @param depth
     *            The depth of this URL.
     * @return True if the URL was added, false otherwise.
* @throws WorkloadException */ public boolean add(final URL url, final URL source, final int depth) { boolean result = false; try { this.addLock.acquire(); if (!contains(url)) { final String strURL = truncate(url.toString(), this.maxURLSize); final String strHost = truncate(url.getHost(), this.maxHostSize) .toLowerCase(); result = true; // get the host int hostID = getHostID(url, false); if (hostID == -1) { this.stmtAdd2.execute(strHost, Status.STATUS_WAITING, 0, 0); hostID = getHostID(url, true); } // need to set the current host for the first time? if (this.currentHostID == -1) { this.currentHostID = hostID; this.currentHost = strHost; this.stmtSetHostStatus.execute(Status.STATUS_PROCESSING, this.currentHostID); } // now add workload element if (source != null) { final int sourceID = getWorkloadID(source, true); this.stmtAdd.execute(hostID, strURL, Status.STATUS_WAITING, depth, computeHash(url), sourceID); } else { this.stmtAdd.execute(hostID, strURL, Status.STATUS_WAITING, depth, computeHash(url), 0); } this.workLatch.countDown(); } } catch (final InterruptedException e) { throw new WorkloadError(e); } catch (final SQLException e) { throw new WorkloadError(e); } finally { this.addLock.release(); } return result; } /** * Clear the workload. */ public void clear() { this.stmtClear.execute(); this.stmtClear2.execute(); } /** * Close the workload manager. */ public void close() { if (this.workResultSet != null) { try { this.workResultSet.close(); } catch (final Exception e) { logger .log(Level.SEVERE, "Error trying to close workload result set, ignoring..."); } this.workResultSet = null; } if (this.connection != null) { this.connection.close(); } } /** * Compute a hash for a URL. * * @param url * The URL to compute the hash for. * @return The hash code. 
*/ private int computeHash(final URL url) { final String str = url.toString().trim(); int result = str.hashCode(); result = result % SQLWorkloadManager.HASH_MASK; return result; } /** * Determine if the workload contains the specified URL. * * @param url * The URL to search the workload for. * @return True of the workload contains the specified URL. @ */ public boolean contains(final URL url) { try { return getWorkloadID(url, false) != -1; } catch (final SQLException e) { throw new WorkloadError(e); } } /** * Convert the specified String to a URL. If the string is too long or has * other issues, throw a WorkloadException. * * @param aurl * A String to convert into a URL. * @return The URL. @ Thrown if, The String could not be converted. */ public URL convertURL(final String aurl) { URL result = null; final String url = aurl.trim(); if (this.maxURLSize != -1 && url.length() > this.maxURLSize) { throw new WorkloadError("URL size is too big, must be under " + this.maxURLSize + " bytes."); } try { result = new URL(url); } catch (final MalformedURLException e) { throw new WorkloadError(e); } return result; } /** * Create the correct type of SQL holder for this workload managers. * This will likely be overridden by subclasses. * @return A SQL holder. */ public SQLHolder createSQLHolder() { return new SQLHolder(); } /** * Return the size of the specified column. * * @param table * The table that contains the column. * @param column * The column to get the size for. * @return The size of the column. 
*/ public int getColumnSize(final String table, final String column) { try { final ResultSet rs = this.connection.getConnection().getMetaData() .getColumns(null, null, table, null); while (rs.next()) { final String c = rs.getString("COLUMN_NAME"); final int size = rs.getInt("COLUMN_SIZE"); if (c.equalsIgnoreCase(column)) { return size; } } return -1; } catch (final SQLException e) { throw new DBError(e); } } /** * @return the connection */ public RepeatableConnection getConnection() { return this.connection; } /** * Get the current host. * * @return The current host. */ public String getCurrentHost() { return this.currentHost; } /** * Get the depth of the specified URL. * * @param url * The URL to get the depth of. * @return The depth of the specified URL. @ Thrown if the depth could not * be found. */ public int getDepth(final URL url) { RepeatableStatement.Results rs = null; try { rs = this.stmtGetDepth.executeQuery(computeHash(url)); while (rs.getResultSet().next()) { final String u = rs.getResultSet().getString(1); if (u.equals(url.toString())) { return rs.getResultSet().getInt(2); } } return 1; } catch (final SQLException e) { throw new WorkloadError(e); } finally { if (rs != null) { rs.close(); } } } /** * Get the host name associated with the specified host id. * * @param hostID * The host id to look up. * @return The name of the host. @ Thrown if unable to obtain the host name. */ private String getHost(final int hostID) { RepeatableStatement.Results rs = null; try { rs = this.stmtGetHost.executeQuery(hostID); if (!rs.getResultSet().next()) { throw new WorkloadError("Can't find previously created host."); } return rs.getResultSet().getString(1); } catch (final SQLException e) { throw new WorkloadError(e); } finally { if (rs != null) { rs.close(); } } } /** * Get the id for the specified host name. * * @param host * The host to lookup. * @param require * Should an exception be thrown if the host is not located. * @return The id of the specified host name. 
@ Thrown if the host id is not * found, and is required. * @throws SQLException * Thrown if a SQL error occurs.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -