⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 persistprocessor.java

📁 高性能分词算法
💻 JAVA
字号:
/* PersistProcessor.java *  * Created on Feb 17, 2005 * * Copyright (C) 2007 Internet Archive. *  * This file is part of the Heritrix web crawler (crawler.archive.org). *  * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. *  * Heritrix is distributed in the hope that it will be useful,  * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. *  * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.processor.recrawl;import java.io.BufferedReader;import java.io.File;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.net.MalformedURLException;import java.net.URL;import java.util.Iterator;import java.util.Map.Entry;import java.util.logging.ConsoleHandler;import java.util.logging.Handler;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.codec.binary.Base64;import org.apache.commons.io.IOUtils;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.framework.Processor;import org.archive.crawler.io.CrawlerJournal;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.OneLineSimpleLogger;import org.archive.util.SURT;import org.archive.util.bdbje.EnhancedEnvironment;import org.archive.util.iterator.LineReadingIterator;import st.ata.util.AList;import com.sleepycat.bind.serial.SerialBinding;import com.sleepycat.bind.serial.StoredClassCatalog;import com.sleepycat.bind.tuple.StringBinding;import com.sleepycat.collections.StoredIterator;import com.sleepycat.collections.StoredSortedMap;import com.sleepycat.je.Database;import com.sleepycat.je.DatabaseConfig;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.EnvironmentConfig;/** * Superclass for Processors which utilize BDB-JE for URI state * (including most notably history) persistence. *  * @author gojomo */public abstract class PersistProcessor extends Processor {        private static final long serialVersionUID = 1L;    private static final Logger logger =        Logger.getLogger(PersistProcessor.class.getName());    /** name of history Database */    public static final String URI_HISTORY_DBNAME = "uri_history";        /**     * @return DatabaseConfig for history Database     */    protected static DatabaseConfig historyDatabaseConfig() {        DatabaseConfig dbConfig = new DatabaseConfig();        dbConfig.setTransactional(false);        dbConfig.setAllowCreate(true);        dbConfig.setDeferredWrite(true);        return dbConfig;    }    /**     * Usual constructor     *      * @param name     * @param string     */    public PersistProcessor(String name, String string) {        super(name,string);    }    /**     * Return a preferred String key for persisting the given CrawlURI's     * AList state.      *      * @param curi CrawlURI     * @return String key     */    public String persistKeyFor(CrawlURI curi) {        // use a case-sensitive SURT for uniqueness and sorting benefits        return SURT.fromURI(curi.getUURI().toString(),true);    }    /**     * Whether the current CrawlURI's state should be persisted (to log or     * direct to database)     *      * @param curi CrawlURI     * @return true if state should be stored; false to skip persistence     */    protected boolean shouldStore(CrawlURI curi) {        // TODO: don't store some codes, such as 304 unchanged?        return curi.isSuccess();    }    /**     * Whether the current CrawlURI's state should be loaded     *      * @param curi CrawlURI     * @return true if state should be loaded; false to skip loading     */    protected boolean shouldLoad(CrawlURI curi) {        // TODO: don't load some (prereqs?)        return true;    }    /**     * Copies entries from an existing environment db to a new one. If     * historyMap is not provided, only logs the entries that would have been      * copied.     *      * @param sourceDir existing environment database directory     * @param historyMap new environment db (or null for a dry run)     * @return number of records     * @throws DatabaseException     */    private static int copyPersistEnv(File sourceDir, StoredSortedMap<String,AList> historyMap)     throws DatabaseException {        int count = 0;        // open the source env history DB, copying entries to target env        EnhancedEnvironment sourceEnv = setupEnvironment(sourceDir);        StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();        Database sourceHistoryDB = sourceEnv.openDatabase(                null, URI_HISTORY_DBNAME, historyDatabaseConfig());        StoredSortedMap<String,AList> sourceHistoryMap = new StoredSortedMap<String,AList>(sourceHistoryDB,                new StringBinding(), new SerialBinding<AList>(sourceClassCatalog,                        AList.class), true);        Iterator<Entry<String,AList>> iter = sourceHistoryMap.entrySet().iterator();        while (iter.hasNext()) {            Entry<String,AList> item = iter.next();             logger.fine(item.getKey() + " " + item.getValue().toPrettyString());            if (historyMap != null) {                historyMap.put(item.getKey(), item.getValue());            }            count++;        }        StoredIterator.close(iter);        sourceHistoryDB.close();        sourceEnv.close();                return count;    }    /**     * Populates an environment db from a persist log. If historyMap is     * not provided, only logs the entries that would have been populated.     *      * @param persistLogReader     *            persist log     * @param historyMap     *            new environment db (or null for a dry run)     * @return number of records     * @throws UnsupportedEncodingException     * @throws DatabaseException     */    private static int populatePersistEnvFromLog(BufferedReader persistLogReader, StoredSortedMap<String,AList> historyMap)     throws UnsupportedEncodingException, DatabaseException {        int count = 0;                Iterator<String> iter = new LineReadingIterator(persistLogReader);        while (iter.hasNext()) {            String line = iter.next();             if (line.length() == 0) {                continue;            }            String[] splits = line.split(" ");            if (splits.length != 2) {                logger.severe("bad line: " + line);                continue;            }            try {                AList alist = (AList) IoUtils.deserializeFromByteArray(Base64.decodeBase64(splits[1].getBytes("UTF8")));                logger.fine(splits[0] + " " + alist.toPrettyString());                if (historyMap != null) {                    historyMap.put(splits[0], alist);                }            } catch (RuntimeException e) {                logger.log(Level.SEVERE, "problem with line: " + line, e);            }            count++;        }        IOUtils.closeQuietly(persistLogReader);                return count;    }        /**     * Populates a new environment db from an old environment db or a persist     * log. If path to new environment is not provided, only logs the entries      * that would have been populated.     *      * @param sourcePath     *            source of old entries: can be a path to an existing     *            environment db, or a URL or path to a persist log     * @param envFile     *            path to new environment db (or null for a dry run)     * @return number of records     * @throws DatabaseException     * @throws IOException     */    public static int populatePersistEnv(String sourcePath, File envFile)        throws DatabaseException, IOException {        int count = 0;        StoredSortedMap<String,AList> historyMap = null;        EnhancedEnvironment targetEnv = null;        StoredClassCatalog classCatalog = null;        Database historyDB = null;        if (envFile != null) {            // set up target environment            if (!envFile.exists()) {                envFile.mkdirs();            }            targetEnv = setupEnvironment(envFile);            classCatalog = targetEnv.getClassCatalog();            historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME,                     historyDatabaseConfig());            historyMap = new StoredSortedMap<String,AList>(historyDB,                     new StringBinding(), new SerialBinding<AList>(classCatalog,                        AList.class), true);        }        try {            count = copyPersistSourceToHistoryMap(null, sourcePath, historyMap);        } finally {            // in finally block so that we unlock the target env even if we            // failed to populate it            if (envFile != null) {                logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);                historyDB.sync();                historyDB.close();                targetEnv.close();            } else {                logger.info(count + " records found in " + sourcePath);            }        }        return count;    }    /**     * Populates a given StoredSortedMap (history map) from an old      * environment db or a persist log. If a map is not provided, only      * logs the entries that would have been populated.     *      * @param sourcePath     *            source of old entries: can be a path to an existing     *            environment db, or a URL or path to a persist log     * @param historyMap     *            map to populate (or null for a dry run)     * @return number of records     * @throws DatabaseException     * @throws IOException     */    public static int copyPersistSourceToHistoryMap(File context,            String sourcePath,            StoredSortedMap<String, AList> historyMap)            throws DatabaseException, IOException, MalformedURLException,            UnsupportedEncodingException {        int count;        // delegate depending on the source        File sourceFile = FileUtils.maybeRelative(context, sourcePath);        if (sourceFile.isDirectory()) {            count = copyPersistEnv(sourceFile, historyMap);        } else {            BufferedReader persistLogReader = null;            if (sourceFile.isFile()) {                persistLogReader = CrawlerJournal.getBufferedReader(sourceFile);            } else {                URL sourceUrl = new URL(sourcePath);                persistLogReader = CrawlerJournal.getBufferedReader(sourceUrl);            }            count = populatePersistEnvFromLog(persistLogReader, historyMap);        }        return count;    }    /**     * Utility main for importing a log into a BDB-JE environment or moving a     * database between environments (2 arguments), or simply dumping a log     * to stderr in a more readable format (1 argument).      *      * @param args command-line arguments     * @throws DatabaseException     * @throws IOException     */    public static void main(String[] args) throws DatabaseException, IOException {        Handler handler = new ConsoleHandler();        handler.setLevel(Level.ALL);        handler.setFormatter(new OneLineSimpleLogger());        logger.addHandler(handler);        logger.setUseParentHandlers(false);        if (args.length == 2) {            logger.setLevel(Level.INFO);            populatePersistEnv(args[0], new File(args[1]));        } else if (args.length == 1) {            logger.setLevel(Level.FINE);            populatePersistEnv(args[0], null);        } else {            System.out.println("Arguments: ");            System.out.println("    source [target]");            System.out.println(                "...where source is either a txtser log file or BDB env dir");            System.out.println(                "and target, if present, is a BDB env dir. ");            return;        }    }    private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {        EnvironmentConfig envConfig = new EnvironmentConfig();        envConfig.setAllowCreate(true);        return new EnhancedEnvironment(env, envConfig);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -