
📄 StatisticsTracker.java

📁 A web crawler that works best when combined with Lucene
💻 JAVA
📖 Page 1 of 3
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Created on Jul 16, 2003
 *
 */
package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;

/**
 * This is an implementation of the AbstractTracker. It is designed to function
 * with the WUI as well as performing various logging activities.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage.  We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number is
 * the current and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as <b>KB/s(avg)</b> except it shows the
 * number of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally mem-use-KB is extracted from the run time environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 *   <li> Successfully downloaded documents per fetch status code
 *   <li> Successfully downloaded documents per document mime type
 *   <li> Amount of data per mime type
 *   <li> Successfully downloaded documents per host
 *   <li> Amount of data per host
 *   <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *        crawl)
 *   <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {

    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify file where the object will be
    // written once the CrawlEnded event occurs

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;
    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data
     */
    /** tally sizes novel, verified (same hash), vouched (not-modified) */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
        = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
        = new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes */
    protected Hashtable<String,LongWrapper> statusCodeDistribution
        = new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps are individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because usually bigmaps that get reconstituted
     * on recover from checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed */
    protected transient
        Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    // seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN
    private int seedsCrawled;
    private int seedsNotCrawled;
    // sExitMessage: only set at crawl-end
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super( name, "A statistics tracker thats integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesCrawled();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // Round to nearest long.
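            // Average KB/sec over the whole crawl so far:
            // total bytes crawled divided by total elapsed seconds.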
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                ((getCrawlerTotalElapsedTime()) / 1000)) + .5 );
        }

        busyThreads = activeThreadCount();

        if(shouldrun ||
            (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample.  We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // if we haven't done anything or there isn't a reasonable sample
            // size give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes/1024)/(sampleTime/1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }

    /**
     * Return one line of current progress-statistics
     *
     * @param now
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }

    /**
     * Return one line of current progress-statistics
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec(){
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec(){
        return currentDocsPerSecond;
    }

    public long processedKBPerSec(){
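As an aside on the class javadoc above: the "current vs. average" rate idea that progressStatisticsEvent implements can be restated as a small standalone sketch. The names below (RateSampler, sample) are illustrative only and not part of Heritrix; the average rate divides running totals by the total elapsed time, while the current rate divides the delta since the previous snapshot by the length of the sample interval.

// Illustrative sketch only, not Heritrix code.
public class RateSampler {
    private final long startTime = System.currentTimeMillis();
    private long lastSampleTime = startTime;
    private long lastDocs = 0;
    private long lastBytes = 0;

    /** Feed in running totals (e.g. from the frontier) once per snapshot. */
    public void sample(long totalDocs, long totalBytes) {
        long now = System.currentTimeMillis();
        long elapsedSec = Math.max(1, (now - startTime) / 1000);
        long sampleSec = Math.max(1, (now - lastSampleTime) / 1000);

        // Average over the whole run vs. rate during the last sample period.
        double avgDocsPerSec = (double) totalDocs / elapsedSec;
        double curDocsPerSec = (double) (totalDocs - lastDocs) / sampleSec;
        long avgKBPerSec = (totalBytes / 1024) / elapsedSec;
        long curKBPerSec = ((totalBytes - lastBytes) / 1024) / sampleSec;

        System.out.printf("%.2f(%.2f) doc/s  %d(%d) KB/s%n",
            curDocsPerSec, avgDocsPerSec, curKBPerSec, avgKBPerSec);

        lastSampleTime = now;
        lastDocs = totalDocs;
        lastBytes = totalBytes;
    }
}

Calling sample(...) once per snapshot period with the crawler's running totals would print lines in the same current(average) style used by the progress-statistics log.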
