⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 persistprocessor.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
字号:
/* PersistProcessor.java *  * Created on Feb 17, 2005 * * Copyright (C) 2007 Internet Archive. *  * This file is part of the Heritrix web crawler (crawler.archive.org). *  * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. *  * Heritrix is distributed in the hope that it will be useful,  * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. *  * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.processor.recrawl;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.util.Iterator;import java.util.Map.Entry;import org.apache.commons.codec.binary.Base64;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.framework.Processor;import org.archive.crawler.io.CrawlerJournal;import org.archive.util.IoUtils;import org.archive.util.SURT;import org.archive.util.bdbje.EnhancedEnvironment;import org.archive.util.iterator.LineReadingIterator;import st.ata.util.AList;import com.sleepycat.bind.serial.SerialBinding;import com.sleepycat.bind.serial.StoredClassCatalog;import com.sleepycat.bind.tuple.StringBinding;import com.sleepycat.collections.StoredIterator;import com.sleepycat.collections.StoredSortedMap;import com.sleepycat.je.Database;import com.sleepycat.je.DatabaseConfig;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.EnvironmentConfig;/** * Superclass for Processors which utilize BDB-JE for URI state * (including most notably 
history) persistence. *  * @author gojomo */public abstract class PersistProcessor extends Processor {    /** name of history Database */    public static final String URI_HISTORY_DBNAME = "uri_history";        /**     * @return DatabaseConfig for history Database     */    protected static DatabaseConfig historyDatabaseConfig() {        DatabaseConfig dbConfig = new DatabaseConfig();        dbConfig.setTransactional(false);        dbConfig.setAllowCreate(true);        dbConfig.setDeferredWrite(true);        return dbConfig;    }    /**     * Usual constructor     *      * @param name     * @param string     */    public PersistProcessor(String name, String string) {        super(name,string);    }    /**     * Return a preferred String key for persisting the given CrawlURI's     * AList state.      *      * @param curi CrawlURI     * @return String key     */    public String persistKeyFor(CrawlURI curi) {        // use a case-sensitive SURT for uniqueness and sorting benefits        return SURT.fromURI(curi.getUURI().toString(),true);    }    /**     * Whether the current CrawlURI's state should be persisted (to log or     * direct to database)     *      * @param curi CrawlURI     * @return true if state should be stored; false to skip persistence     */    protected boolean shouldStore(CrawlURI curi) {        // TODO: don't store some codes, such as 304 unchanged?        return curi.isSuccess();    }    /**     * Whether the current CrawlURI's state should be loaded     *      * @param curi CrawlURI     * @return true if state should be loaded; false to skip loading     */    protected boolean shouldLoad(CrawlURI curi) {        // TODO: don't load some (prereqs?)        return true;    }    /**     * Utility main for importing a log into a BDB-JE environment or moving a     * database between environments (2 arguments), or simply dumping a log     * to stdout in a more readable format (1 argument).      
*      * @param args command-line arguments     * @throws DatabaseException     * @throws IOException     */    public static void main(String[] args) throws DatabaseException, IOException {        if(args.length==2) {            main2args(args);        } else if (args.length==1) {            main1arg(args);        } else {            System.out.println("Arguments: ");            System.out.println("    source [target]");            System.out.println(                "...where source is either a txtser log file or BDB env dir");            System.out.println(                "and target, if present, is a BDB env dir. ");            return;        }            }    /**     * Move the history information in the first argument (either the path      * to a log or to an environment containing a uri_history database) to      * the environment in the second environment (path; environment will      * be created if it dow not already exist).      *      * @param args command-line arguments     * @throws DatabaseException     * @throws FileNotFoundException     * @throws UnsupportedEncodingException     * @throws IOException     */    private static void main2args(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {        File source = new File(args[0]);        File env = new File(args[1]);        if(!env.exists()) {            env.mkdirs();        }                // setup target environment        EnhancedEnvironment targetEnv = setupEnvironment(env);        StoredClassCatalog classCatalog = targetEnv.getClassCatalog();        Database historyDB = targetEnv.openDatabase(                null,URI_HISTORY_DBNAME,historyDatabaseConfig());        StoredSortedMap historyMap = new StoredSortedMap(historyDB,                new StringBinding(), new SerialBinding(classCatalog,                        AList.class), true);                int count = 0;                if(source.isFile()) {            // scan log, writing to database 
           BufferedReader br = CrawlerJournal.getBufferedReader(source);            Iterator iter = new LineReadingIterator(br);            while(iter.hasNext()) {                String line = (String) iter.next();                 String[] splits = line.split(" ");                historyMap.put(                        splits[0],                         IoUtils.deserializeFromByteArray(                                Base64.decodeBase64(splits[1].getBytes("UTF8"))));                count++;            }            br.close();        } else {            // open the source env history DB, copying entries to target env            EnhancedEnvironment sourceEnv = setupEnvironment(source);            StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();            Database sourceHistoryDB = sourceEnv.openDatabase(                    null,URI_HISTORY_DBNAME,historyDatabaseConfig());            StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,                    new StringBinding(), new SerialBinding(sourceClassCatalog,                            AList.class), true);            Iterator iter = sourceHistoryMap.entrySet().iterator();            while(iter.hasNext()) {                Entry item = (Entry) iter.next();                 historyMap.put(item.getKey(), item.getValue());                count++;            }            StoredIterator.close(iter);            sourceHistoryDB.close();            sourceEnv.close();        }                // cleanup        historyDB.sync();        historyDB.close();        targetEnv.close();        System.out.println(count+" records imported from "+source+" to BDB env "+env);    }    /**     * Dump the contents of the argument (path to a persist log) to stdout     * in a slightly more readable format.      
*      * @param args command-line arguments     * @throws DatabaseException     * @throws FileNotFoundException     * @throws UnsupportedEncodingException     * @throws IOException     */    private static void main1arg(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {        File source = new File(args[0]);                int count = 0;                if(source.isFile()) {            // scan log, writing to database            BufferedReader br = CrawlerJournal.getBufferedReader(source);            Iterator iter = new LineReadingIterator(br);            while(iter.hasNext()) {                String line = (String) iter.next();                 String[] splits = line.split(" ");                AList alist = (AList)IoUtils.deserializeFromByteArray(                        Base64.decodeBase64(splits[1].getBytes("UTF8")));                System.out.println(                        splits[0] + " " + alist.toPrettyString());                count++;            }            br.close();        } else {            // open the source env history DB, copying entries to target env            EnhancedEnvironment sourceEnv = setupEnvironment(source);            StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();            Database sourceHistoryDB = sourceEnv.openDatabase(                    null,URI_HISTORY_DBNAME,historyDatabaseConfig());            StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,                    new StringBinding(), new SerialBinding(sourceClassCatalog,                            AList.class), true);            Iterator iter = sourceHistoryMap.entrySet().iterator();            while(iter.hasNext()) {                Entry item = (Entry) iter.next();                 AList alist = (AList)item.getValue();                System.out.println(item.getKey() + " " + alist.toPrettyString());                count++;            }            StoredIterator.close(iter);         
   sourceHistoryDB.close();            sourceEnv.close();        }                System.out.println(count+" records dumped from "+source);    }        private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {        EnvironmentConfig envConfig = new EnvironmentConfig();        envConfig.setAllowCreate(true);        return new EnhancedEnvironment(env, envConfig);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -