StatisticsTracker.java
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 16, 2003
 */
package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;

/**
 * This is an implementation of the AbstractTracker. It is designed to function
 * with the WUI as well as performing various logging activity.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre>
 * [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]
 * </pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as KB/s(avg) except it shows the number
 * of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 *   <li> Successfully downloaded documents per fetch status code
 *   <li> Successfully downloaded documents per document mime type
 *   <li> Amount of data per mime type
 *   <li> Successfully downloaded documents per host
 *   <li> Amount of data per host
 *   <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *        crawl)
 *   <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {

    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify file where the object will be
    // written once the CrawlEnded event occurs

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;
    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data
     */
    /** tally sizes novel, verified (same hash), vouched (not-modified) */
    protected CrawledBytesHistotable crawledBytes =
        new CrawledBytesHistotable();

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution =
        new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes =
        new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes */
    protected Hashtable<String,LongWrapper> statusCodeDistribution =
        new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps is individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because they are usually bigmaps that get
     * reconstituted on recovery from checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed */
    protected transient Map<String,HashMap<String,LongWrapper>>
        sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    // seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN
    private int seedsCrawled;
    private int seedsNotCrawled;

    // sExitMessage: only set at crawl-end
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that's integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes",
                String.class, LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesCrawled();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // Round to nearest long.
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                (getCrawlerTotalElapsedTime() / 1000)) + .5);
        }

        busyThreads = activeThreadCount();

        if (shouldrun ||
                (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample. We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // If we haven't done anything or there isn't a reasonable sample
            // size, give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec = (int)
                    (((sampleProcessedBytes / 1024) / (sampleTime / 1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }

    /**
     * Return one line of current progress-statistics
     *
     * @param now
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }

    /**
     * Return one line of current progress-statistics
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec() {
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec() {
        return currentDocsPerSecond;
    }

    public long processedKBPerSec() {
        return totalKBPerSec; // body completed from the surrounding accessor pattern
    }
    // ... (remainder of class truncated in the original listing)
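The paired "current(average)" columns in progress-statistics.log come from the delta-over-interval logic in progressStatisticsEvent() above: cumulative page and byte totals are sampled at each snapshot, the difference since the previous sample gives the "current" rate, and the totals over the whole crawl give the average. The following is a minimal, self-contained sketch of that calculation; RateSnapshotSketch and its members are illustrative names for this note, not part of the Heritrix source.

public class RateSnapshotSketch {
    private final long startMillis = System.currentTimeMillis();
    private long lastSampleMillis = startMillis;
    private long lastPages = 0;
    private long lastBytes = 0;

    /**
     * Returns "current(average)" doc/s and KB/s strings, mirroring the
     * delta-over-interval logic of progressStatisticsEvent().
     */
    public String sample(long totalPages, long totalBytes) {
        long now = System.currentTimeMillis();
        long elapsed = now - startMillis;       // whole-crawl elapsed time
        long interval = now - lastSampleMillis; // time since last snapshot

        // Averages over the whole crawl (guard against a sub-second crawl).
        double avgDocs =
            elapsed >= 1000 ? (double) totalPages / (elapsed / 1000) : 0;
        long avgKB =
            elapsed >= 1000 ? (totalBytes / 1024) / (elapsed / 1000) : 0;

        // "Current" rates over the last snapshot period; as in the tracker,
        // skip the update if the interval is too small for a decent sample.
        double curDocs = 0;
        long curKB = 0;
        if (interval >= 1000) {
            curDocs = (double) (totalPages - lastPages) / (interval / 1000);
            curKB = ((totalBytes - lastBytes) / 1024) / (interval / 1000);
            lastPages = totalPages;
            lastBytes = totalBytes;
            lastSampleMillis = now;
        }
        return String.format("%.2f(%.2f) doc/s  %d(%d) KB/s",
            curDocs, avgDocs, curKB, avgKB);
    }
}

Calling sample() once per snapshot period with the tracker's cumulative fetched-page and processed-byte totals reproduces both rate columns of the log line assembled by getProgressStatisticsLine().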