📄 persistprocessor.java
字号:
/* PersistProcessor.java * * Created on Feb 17, 2005 * * Copyright (C) 2007 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler.processor.recrawl;import java.io.BufferedReader;import java.io.File;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.net.MalformedURLException;import java.net.URL;import java.util.Iterator;import java.util.Map.Entry;import java.util.logging.ConsoleHandler;import java.util.logging.Handler;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.codec.binary.Base64;import org.apache.commons.io.IOUtils;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.framework.Processor;import org.archive.crawler.io.CrawlerJournal;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.OneLineSimpleLogger;import org.archive.util.SURT;import org.archive.util.bdbje.EnhancedEnvironment;import org.archive.util.iterator.LineReadingIterator;import st.ata.util.AList;import com.sleepycat.bind.serial.SerialBinding;import com.sleepycat.bind.serial.StoredClassCatalog;import com.sleepycat.bind.tuple.StringBinding;import com.sleepycat.collections.StoredIterator;import com.sleepycat.collections.StoredSortedMap;import com.sleepycat.je.Database;import com.sleepycat.je.DatabaseConfig;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.EnvironmentConfig;/** * Superclass for Processors which utilize BDB-JE for URI state * (including most notably history) persistence. * * @author gojomo */public abstract class PersistProcessor extends Processor { private static final long serialVersionUID = 1L; private static final Logger logger = Logger.getLogger(PersistProcessor.class.getName()); /** name of history Database */ public static final String URI_HISTORY_DBNAME = "uri_history"; /** * @return DatabaseConfig for history Database */ protected static DatabaseConfig historyDatabaseConfig() { DatabaseConfig dbConfig = new DatabaseConfig(); dbConfig.setTransactional(false); dbConfig.setAllowCreate(true); dbConfig.setDeferredWrite(true); return dbConfig; } /** * Usual constructor * * @param name * @param string */ public PersistProcessor(String name, String string) { super(name,string); } /** * Return a preferred String key for persisting the given CrawlURI's * AList state. * * @param curi CrawlURI * @return String key */ public String persistKeyFor(CrawlURI curi) { // use a case-sensitive SURT for uniqueness and sorting benefits return SURT.fromURI(curi.getUURI().toString(),true); } /** * Whether the current CrawlURI's state should be persisted (to log or * direct to database) * * @param curi CrawlURI * @return true if state should be stored; false to skip persistence */ protected boolean shouldStore(CrawlURI curi) { // TODO: don't store some codes, such as 304 unchanged? return curi.isSuccess(); } /** * Whether the current CrawlURI's state should be loaded * * @param curi CrawlURI * @return true if state should be loaded; false to skip loading */ protected boolean shouldLoad(CrawlURI curi) { // TODO: don't load some (prereqs?) return true; } /** * Copies entries from an existing environment db to a new one. If * historyMap is not provided, only logs the entries that would have been * copied. * * @param sourceDir existing environment database directory * @param historyMap new environment db (or null for a dry run) * @return number of records * @throws DatabaseException */ private static int copyPersistEnv(File sourceDir, StoredSortedMap<String,AList> historyMap) throws DatabaseException { int count = 0; // open the source env history DB, copying entries to target env EnhancedEnvironment sourceEnv = setupEnvironment(sourceDir); StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog(); Database sourceHistoryDB = sourceEnv.openDatabase( null, URI_HISTORY_DBNAME, historyDatabaseConfig()); StoredSortedMap<String,AList> sourceHistoryMap = new StoredSortedMap<String,AList>(sourceHistoryDB, new StringBinding(), new SerialBinding<AList>(sourceClassCatalog, AList.class), true); Iterator<Entry<String,AList>> iter = sourceHistoryMap.entrySet().iterator(); while (iter.hasNext()) { Entry<String,AList> item = iter.next(); logger.fine(item.getKey() + " " + item.getValue().toPrettyString()); if (historyMap != null) { historyMap.put(item.getKey(), item.getValue()); } count++; } StoredIterator.close(iter); sourceHistoryDB.close(); sourceEnv.close(); return count; } /** * Populates an environment db from a persist log. If historyMap is * not provided, only logs the entries that would have been populated. * * @param persistLogReader * persist log * @param historyMap * new environment db (or null for a dry run) * @return number of records * @throws UnsupportedEncodingException * @throws DatabaseException */ private static int populatePersistEnvFromLog(BufferedReader persistLogReader, StoredSortedMap<String,AList> historyMap) throws UnsupportedEncodingException, DatabaseException { int count = 0; Iterator<String> iter = new LineReadingIterator(persistLogReader); while (iter.hasNext()) { String line = iter.next(); if (line.length() == 0) { continue; } String[] splits = line.split(" "); if (splits.length != 2) { logger.severe("bad line: " + line); continue; } try { AList alist = (AList) IoUtils.deserializeFromByteArray(Base64.decodeBase64(splits[1].getBytes("UTF8"))); logger.fine(splits[0] + " " + alist.toPrettyString()); if (historyMap != null) { historyMap.put(splits[0], alist); } } catch (RuntimeException e) { logger.log(Level.SEVERE, "problem with line: " + line, e); } count++; } IOUtils.closeQuietly(persistLogReader); return count; } /** * Populates a new environment db from an old environment db or a persist * log. If path to new environment is not provided, only logs the entries * that would have been populated. * * @param sourcePath * source of old entries: can be a path to an existing * environment db, or a URL or path to a persist log * @param envFile * path to new environment db (or null for a dry run) * @return number of records * @throws DatabaseException * @throws IOException */ public static int populatePersistEnv(String sourcePath, File envFile) throws DatabaseException, IOException { int count = 0; StoredSortedMap<String,AList> historyMap = null; EnhancedEnvironment targetEnv = null; StoredClassCatalog classCatalog = null; Database historyDB = null; if (envFile != null) { // set up target environment if (!envFile.exists()) { envFile.mkdirs(); } targetEnv = setupEnvironment(envFile); classCatalog = targetEnv.getClassCatalog(); historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME, historyDatabaseConfig()); historyMap = new StoredSortedMap<String,AList>(historyDB, new StringBinding(), new SerialBinding<AList>(classCatalog, AList.class), true); } try { count = copyPersistSourceToHistoryMap(null, sourcePath, historyMap); } finally { // in finally block so that we unlock the target env even if we // failed to populate it if (envFile != null) { logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile); historyDB.sync(); historyDB.close(); targetEnv.close(); } else { logger.info(count + " records found in " + sourcePath); } } return count; } /** * Populates a given StoredSortedMap (history map) from an old * environment db or a persist log. If a map is not provided, only * logs the entries that would have been populated. * * @param sourcePath * source of old entries: can be a path to an existing * environment db, or a URL or path to a persist log * @param historyMap * map to populate (or null for a dry run) * @return number of records * @throws DatabaseException * @throws IOException */ public static int copyPersistSourceToHistoryMap(File context, String sourcePath, StoredSortedMap<String, AList> historyMap) throws DatabaseException, IOException, MalformedURLException, UnsupportedEncodingException { int count; // delegate depending on the source File sourceFile = FileUtils.maybeRelative(context, sourcePath); if (sourceFile.isDirectory()) { count = copyPersistEnv(sourceFile, historyMap); } else { BufferedReader persistLogReader = null; if (sourceFile.isFile()) { persistLogReader = CrawlerJournal.getBufferedReader(sourceFile); } else { URL sourceUrl = new URL(sourcePath); persistLogReader = CrawlerJournal.getBufferedReader(sourceUrl); } count = populatePersistEnvFromLog(persistLogReader, historyMap); } return count; } /** * Utility main for importing a log into a BDB-JE environment or moving a * database between environments (2 arguments), or simply dumping a log * to stderr in a more readable format (1 argument). * * @param args command-line arguments * @throws DatabaseException * @throws IOException */ public static void main(String[] args) throws DatabaseException, IOException { Handler handler = new ConsoleHandler(); handler.setLevel(Level.ALL); handler.setFormatter(new OneLineSimpleLogger()); logger.addHandler(handler); logger.setUseParentHandlers(false); if (args.length == 2) { logger.setLevel(Level.INFO); populatePersistEnv(args[0], new File(args[1])); } else if (args.length == 1) { logger.setLevel(Level.FINE); populatePersistEnv(args[0], null); } else { System.out.println("Arguments: "); System.out.println(" source [target]"); System.out.println( "...where source is either a txtser log file or BDB env dir"); System.out.println( "and target, if present, is a BDB env dir. "); return; } } private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException { EnvironmentConfig envConfig = new EnvironmentConfig(); envConfig.setAllowCreate(true); return new EnhancedEnvironment(env, envConfig); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -