⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bdbworkqueue.java

📁 最强的爬虫工程
💻 JAVA
字号:
/* BdbWorkQueue *  * Created on Dec 24, 2004 * * Copyright (C) 2004 Internet Archive. *  * This file is part of the Heritrix web crawler (crawler.archive.org). *  * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. *  * Heritrix is distributed in the hope that it will be useful,  * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. *  * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.frontier;import java.io.IOException;import java.io.Serializable;import java.io.UnsupportedEncodingException;import java.util.logging.Level;import java.util.logging.Logger;import org.archive.crawler.datamodel.CrawlURI;import org.archive.util.ArchiveUtils;import org.archive.util.IoUtils;import com.sleepycat.je.DatabaseEntry;import com.sleepycat.je.DatabaseException;/** * One independent queue of items with the same 'classKey' (eg host). * @author gojomo */public class BdbWorkQueue extends WorkQueueimplements Comparable, Serializable {    private static Logger LOGGER =        Logger.getLogger(BdbWorkQueue.class.getName());        // be robust against trivial implementation changes    private static final long serialVersionUID = ArchiveUtils        .classnameBasedUID(BdbWorkQueue.class, 1);    /**     * All items in this queue have this same 'origin'     * prefix to their keys.     */    private byte[] origin;    /**     * Create a virtual queue inside the given BdbMultipleWorkQueues      *      * @param classKey     */    public BdbWorkQueue(String classKey, BdbFrontier frontier) {        super(classKey);        this.origin = BdbMultipleWorkQueues.calculateOriginKey(classKey);        if (LOGGER.isLoggable(Level.FINE)) {            LOGGER.fine(getPrefixClassKey(this.origin) + " " + classKey);        }        // add the queue-front 'cap' entry; see...        // http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102        frontier.getWorkQueues().addCap(origin);    }    protected long deleteMatchingFromQueue(final WorkQueueFrontier frontier,            final String match) throws IOException {        try {            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)                .getWorkQueues();            return queues.deleteMatchingFromQueue(match, classKey,                new DatabaseEntry(origin));        } catch (DatabaseException e) {            throw IoUtils.wrapAsIOException(e);        }    }    protected void deleteItem(final WorkQueueFrontier frontier,            final CrawlURI peekItem) throws IOException {        try {            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)                .getWorkQueues();             queues.delete(peekItem);        } catch (DatabaseException e) {            e.printStackTrace();            throw IoUtils.wrapAsIOException(e);        }    }    protected CrawlURI peekItem(final WorkQueueFrontier frontier)    throws IOException {        final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)            .getWorkQueues();        DatabaseEntry key = new DatabaseEntry(origin);        CrawlURI curi = null;        int tries = 1;        while(true) {            try {                curi = queues.get(key);            } catch (DatabaseException e) {                LOGGER.log(Level.SEVERE,"peekItem failure; retrying",e);            }                        // ensure CrawlURI, if any,  came from acceptable range:             if(!ArchiveUtils.startsWith(key.getData(),origin)) {                LOGGER.severe(                    "inconsistency: "+classKey+"("+                    getPrefixClassKey(origin)+") with " + getCount() + " items gave "                    + curi +"("+getPrefixClassKey(key.getData()));                // clear curi to allow retry                curi = null;                 // reset key to original origin for retry                key.setData(origin);            }                        if (curi!=null) {                // success                break;            }                        if (tries>3) {                LOGGER.severe("no item where expected in queue "+classKey);                break;            }            tries++;            LOGGER.severe("Trying get #" + Integer.toString(tries)                    + " in queue " + classKey + " with " + getCount()                    + " items using key "                    + getPrefixClassKey(key.getData()));        }         return curi;    }    protected void insertItem(final WorkQueueFrontier frontier,            final CrawlURI curi) throws IOException {        try {            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)                .getWorkQueues();            queues.put(curi);            if (LOGGER.isLoggable(Level.FINE)) {                LOGGER.fine("Inserted into " + getPrefixClassKey(this.origin) +                    " (count " + Long.toString(getCount())+ "): " +                        curi.toString());            }        } catch (DatabaseException e) {            throw IoUtils.wrapAsIOException(e);        }    }        /**     * @param byteArray Byte array to get hex string of.     * @return Hex string of passed in byte array (Used logging     * key-prefixes).     */    protected static String getPrefixClassKey(final byte [] byteArray) {        int zeroIndex = 0;        while(byteArray[zeroIndex]!=0) {            zeroIndex++;        }        try {            return new String(byteArray,0,zeroIndex,"UTF-8");        } catch (UnsupportedEncodingException e) {            // should be impossible; UTF-8 always available            e.printStackTrace();            return e.getMessage();        }    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -