📄 statisticssummary.java
字号:
/* StatisticsSummary * * $Id: StatisticsSummary.java,v 1.2 2006/08/15 00:25:02 paul_jack Exp $$ * * Created on July 27, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler.admin;import java.io.File;import java.io.FileReader;import java.io.BufferedReader;import java.io.IOException;import java.util.Comparator;import java.util.Hashtable;import java.util.Iterator;import java.util.Map;import java.util.SortedMap;import java.util.TreeMap;import java.util.TreeSet;import java.util.logging.Level;import java.util.logging.Logger;import org.archive.util.LongWrapper;/** * This class provides descriptive statistics of a finished crawl job by * using the crawl report files generated by StatisticsTracker. Any formatting * changes to the way StatisticsTracker writes to the summary crawl reports will * require changes to this class. * <p> * The following statistics are accessible from this class: * <ul> * <li> Successfully downloaded documents per fetch status code * <li> Successfully downloaded documents per document mime type * <li> Amount of data per mime type * <li> Successfully downloaded documents per host * <li> Amount of data per host * <li> Successfully downloaded documents per top-level domain name (TLD) * <li> Disposition of all seeds * <li> Successfully downloaded documents per host per source * </ul> * * <p>TODO: Make it so summarizing is not done all in RAM so we avoid * OOME. * * @author Frank McCown * * @see org.archive.crawler.admin.StatisticsTracker */public class StatisticsSummary { /** * Messages from the StatisticsSummary. */ private final static Logger logger = Logger.getLogger(StatisticsSummary.class.getName()); private boolean stats = true; /** Crawl job whose summary we want to view */ private CrawlJob cjob; protected long totalDnsStatusCodeDocuments = 0; protected long totalStatusCodeDocuments = 0; protected long totalFileTypeDocuments = 0; protected long totalMimeTypeDocuments = 0; protected long totalDnsMimeTypeDocuments = 0; protected long totalDnsHostDocuments = 0; protected long totalHostDocuments = 0; protected long totalMimeSize = 0; protected long totalDnsMimeSize = 0; protected long totalHostSize = 0; protected long totalDnsHostSize = 0; protected long totalTldDocuments = 0; protected long totalTldSize = 0; protected long totalHosts = 0; protected String durationTime; protected String processedDocsPerSec; protected String bandwidthKbytesPerSec; protected String totalDataWritten; /** Keep track of the file types we see (mime type -> count) */ protected Hashtable mimeTypeDistribution = new Hashtable(); protected Hashtable mimeTypeBytes = new Hashtable(); protected Hashtable mimeTypeDnsDistribution = new Hashtable(); protected Hashtable mimeTypeDnsBytes = new Hashtable(); /** Keep track of status codes */ protected Hashtable statusCodeDistribution = new Hashtable(); protected Hashtable dnsStatusCodeDistribution = new Hashtable(); /** Keep track of hosts */ protected Hashtable hostsDistribution = new Hashtable(); protected Hashtable hostsBytes = new Hashtable(); protected Hashtable hostsDnsDistribution = new Hashtable(); protected Hashtable hostsDnsBytes = new Hashtable(); /** Keep track of TLDs */ protected Hashtable tldDistribution = new Hashtable(); protected Hashtable tldBytes = new Hashtable(); protected Hashtable tldHostDistribution = new Hashtable(); /** Keep track of processed seeds */ protected transient Map processedSeedsRecords = new Hashtable(); /** * Constructor * * @param cjob * Completed crawl job */ public StatisticsSummary(CrawlJob cjob) { this.cjob = cjob; // Read all stats for this crawl job this.stats = calculateStatusCodeDistribution(); if (calculateMimeTypeDistribution()) { this.stats = true; } if (calculateHostsDistribution()) { this.stats = true; } if (readCrawlReport()) { this.stats = true; } if (readSeedReport()) { this.stats = true; } } /** * Increment a counter for a key in a given HashMap. Used for various * aggregate data. * * @param map The HashMap * @param key The key for the counter to be incremented, if it does not * exist it will be added (set to 1). If null it will * increment the counter "unknown". */ protected static void incrementMapCount(Map map, String key) { incrementMapCount(map,key,1); } /** * Increment a counter for a key in a given HashMap by an arbitrary amount. * Used for various aggregate data. The increment amount can be negative. * * @param map * The HashMap * @param key * The key for the counter to be incremented, if it does not * exist it will be added (set to equal to * <code>increment</code>). * If null it will increment the counter "unknown". * @param increment * The amount to increment counter related to the * <code>key</code>. */ protected static void incrementMapCount(Map map, String key, long increment) { if (key == null) { key = "unknown"; } LongWrapper lw = (LongWrapper)map.get(key); if(lw == null) { map.put(key, new LongWrapper(increment)); } else { lw.longValue += increment; } } /** Returns a HashMap that contains information about distributions of * encountered mime types. Key/value pairs represent * mime type -> count. * <p> * <b>Note:</b> All the values are wrapped with a * {@link LongWrapper LongWrapper} * @return mimeTypeDistribution */ public Hashtable getMimeDistribution() { return mimeTypeDistribution; } public long getTotalMimeTypeDocuments() { return totalMimeTypeDocuments; } public long getTotalDnsMimeTypeDocuments() { return totalDnsMimeTypeDocuments; } public long getTotalMimeSize() { return totalMimeSize; } public long getTotalDnsMimeSize() { return totalDnsMimeSize; } /** * Return a HashMap representing the distribution of HTTP status codes for * successfully fetched curis, as represented by a hashmap where key -> * val represents (string)code -> (integer)count. * * <b>Note: </b> All the values are wrapped with a * {@link LongWrapper LongWrapper} * * @return statusCodeDistribution */ public Hashtable getStatusCodeDistribution() { return statusCodeDistribution; } /** * Return a HashMap representing the distribution of DNS status codes for * successfully fetched curis, as represented by a hashmap where key -> * val represents (string)code -> (integer)count. * * <b>Note: </b> All the values are wrapped with a * {@link LongWrapper LongWrapper} * * @return dnsStatusCodeDistribution */ public Hashtable getDnsStatusCodeDistribution() { return dnsStatusCodeDistribution; } public Hashtable getDnsMimeDistribution() { return mimeTypeDnsDistribution; } public long getTotalDnsStatusCodeDocuments() { return totalDnsStatusCodeDocuments; } public long getTotalStatusCodeDocuments() { return totalStatusCodeDocuments; } public long getTotalHostDocuments() { return totalHostDocuments; } public long getTotalDnsHostDocuments() { return totalDnsHostDocuments; } public Hashtable getHostsDnsDistribution() { return hostsDnsDistribution; } public long getTotalHostDnsDocuments() { return totalDnsHostDocuments; } public long getTotalHostSize() { return totalHostSize; } public long getTotalDnsHostSize() { return totalDnsHostSize; } public Hashtable getTldDistribution() { return tldDistribution; } public Hashtable getTldBytes() { return tldBytes; } public long getTotalTldDocuments() { return totalTldDocuments; } public long getTotalTldSize() { return totalTldSize; } public Hashtable getTldHostDistribution() { return tldHostDistribution; } public long getTotalHosts() { return totalHosts; } public String getDurationTime() { return durationTime; } public String getProcessedDocsPerSec() { return processedDocsPerSec; } public String getBandwidthKbytesPerSec() { return bandwidthKbytesPerSec; } public String getTotalDataWritten() { return totalDataWritten; } /** * Sort the entries of the given HashMap in descending order by their * values, which must be longs wrapped with <code>LongWrapper</code>. * <p> * Elements are sorted by value from largest to smallest. Equal values are * sorted in an arbitrary, but consistent manner by their keys. Only items * with identical value and key are considered equal. * * If the passed-in map requires access to be synchronized, the caller * should ensure this synchronization. * * @param mapOfLongWrapperValues * Assumes values are wrapped with LongWrapper. * @return a sorted set containing the same elements as the map. */ public TreeMap getReverseSortedCopy(final Map mapOfLongWrapperValues) { TreeMap sortedMap = new TreeMap(new Comparator() { public int compare(Object e1, Object e2) { long firstVal = ((LongWrapper)mapOfLongWrapperValues.get(e1)). longValue; long secondVal = ((LongWrapper)mapOfLongWrapperValues.get(e2)). longValue; if (firstVal < secondVal) { return 1; } if (secondVal < firstVal) { return -1; } // If the values are the same, sort by keys. return ((String)e1).compareTo((String)e2); } }); try { sortedMap.putAll(mapOfLongWrapperValues); } catch (UnsupportedOperationException e) { Iterator i = mapOfLongWrapperValues.keySet().iterator(); for (;i.hasNext();) { // Ok. Try doing it the slow way then. Object key = i.next(); sortedMap.put(key, mapOfLongWrapperValues.get(key)); } } return sortedMap; } /** * Get the number of hosts with a particular TLD. * @param tld * top-level domain name * @return Total crawled hosts */ public long getHostsPerTld(String tld) { LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld); return (lw == null ? 0 : lw.longValue); } /** * Read status code distribution from responsecode-report.txt. * DNS and HTTP status codes are separated when read. * @return True if we found some stats. */ private boolean calculateStatusCodeDistribution() { // Read from responsecode-report.txt File f = new File(cjob.getDirectory(), "responsecode-report.txt"); if (!f.exists()) { return false; } BufferedReader br = null; try {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -