⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 statisticssummary.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* StatisticsSummary
 *
 * $Id: StatisticsSummary.java,v 1.2 2006/08/15 00:25:02 paul_jack Exp $$
 *
 * Created on July 27, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.admin;

import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.util.LongWrapper;

/**
 * This class provides descriptive statistics of a finished crawl job by
 * using the crawl report files generated by StatisticsTracker.  Any formatting
 * changes to the way StatisticsTracker writes to the summary crawl reports will
 * require changes to this class.
 * <p>
 * The following statistics are accessible from this class:
 * <ul>
 *   <li> Successfully downloaded documents per fetch status code
 *   <li> Successfully downloaded documents per document mime type
 *   <li> Amount of data per mime type
 *   <li> Successfully downloaded documents per host
 *   <li> Amount of data per host
 *   <li> Successfully downloaded documents per top-level domain name (TLD)
 *   <li> Disposition of all seeds
 *   <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
 * OOME.
 *
 * @author Frank McCown
 *
 * @see org.archive.crawler.admin.StatisticsTracker
 */
public class StatisticsSummary {
    /**
     * Messages from the StatisticsSummary.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsSummary.class.getName());

    /**
     * Whether any statistics were found.  Overwritten in the constructor:
     * it ends up true if at least one of the calculate/read calls made
     * there returned true.
     */
    private boolean stats = true;

    /** Crawl job whose summary we want to view */
    private CrawlJob cjob;

    protected long totalDnsStatusCodeDocuments = 0;
    protected long totalStatusCodeDocuments = 0;
    protected long totalFileTypeDocuments = 0;
    protected long totalMimeTypeDocuments = 0;
    protected long totalDnsMimeTypeDocuments = 0;
    protected long totalDnsHostDocuments = 0;
    protected long totalHostDocuments = 0;
    protected long totalMimeSize = 0;
    protected long totalDnsMimeSize = 0;
    protected long totalHostSize = 0;
    protected long totalDnsHostSize = 0;
    protected long totalTldDocuments = 0;
    protected long totalTldSize = 0;
    protected long totalHosts = 0;

    protected String durationTime;
    protected String processedDocsPerSec;
    protected String bandwidthKbytesPerSec;
    protected String totalDataWritten;

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable mimeTypeDistribution = new Hashtable();
    protected Hashtable mimeTypeBytes = new Hashtable();
    protected Hashtable mimeTypeDnsDistribution = new Hashtable();
protected Hashtable mimeTypeDnsBytes = new Hashtable();        /** Keep track of status codes */    protected Hashtable statusCodeDistribution = new Hashtable();    protected Hashtable dnsStatusCodeDistribution = new Hashtable();        /** Keep track of hosts */    protected Hashtable hostsDistribution = new Hashtable();     protected Hashtable hostsBytes = new Hashtable();     protected Hashtable hostsDnsDistribution = new Hashtable();    protected Hashtable hostsDnsBytes = new Hashtable();     /** Keep track of TLDs */    protected Hashtable tldDistribution = new Hashtable();    protected Hashtable tldBytes = new Hashtable();    protected Hashtable tldHostDistribution = new Hashtable();    /** Keep track of processed seeds */    protected transient Map processedSeedsRecords = new Hashtable();    /**     * Constructor     *      * @param cjob     * 				Completed crawl job     */    public StatisticsSummary(CrawlJob cjob) {    	this.cjob = cjob;    	    	// Read all stats for this crawl job    	this.stats = calculateStatusCodeDistribution();    	if (calculateMimeTypeDistribution()) {    		this.stats = true;    	}    	if (calculateHostsDistribution()) {    		this.stats = true;    	}    	if (readCrawlReport()) {    		this.stats = true;    	}    	if (readSeedReport()) {    		this.stats = true;    	}    }            /**     * Increment a counter for a key in a given HashMap. Used for various     * aggregate data.     *     * @param map The HashMap     * @param key The key for the counter to be incremented, if it does not     *               exist it will be added (set to 1).  If null it will     *            increment the counter "unknown".     */    protected static void incrementMapCount(Map map, String key) {    	incrementMapCount(map,key,1);    }    /**     * Increment a counter for a key in a given HashMap by an arbitrary amount.     * Used for various aggregate data. The increment amount can be negative.     
*     * @param map     *            The HashMap     * @param key     *            The key for the counter to be incremented, if it does not     *            exist it will be added (set to equal to     *            <code>increment</code>).     *            If null it will increment the counter "unknown".     * @param increment     *            The amount to increment counter related to the     *            <code>key</code>.     */    protected static void incrementMapCount(Map map, String key,            long increment) {        if (key == null) {            key = "unknown";        }        LongWrapper lw = (LongWrapper)map.get(key);        if(lw == null) {            map.put(key, new LongWrapper(increment));        } else {            lw.longValue += increment;        }    }      /** Returns a HashMap that contains information about distributions of     *  encountered mime types.  Key/value pairs represent     *  mime type -> count.     * <p>     * <b>Note:</b> All the values are wrapped with a     * {@link LongWrapper LongWrapper}     * @return mimeTypeDistribution     */    public Hashtable getMimeDistribution() {        return mimeTypeDistribution;    }        public long getTotalMimeTypeDocuments() {       	return totalMimeTypeDocuments;    }        public long getTotalDnsMimeTypeDocuments() {       	return totalDnsMimeTypeDocuments;    }        public long getTotalMimeSize() {    	return totalMimeSize;    }        public long getTotalDnsMimeSize() {    	return totalDnsMimeSize;    }       /**     * Return a HashMap representing the distribution of HTTP status codes for     * successfully fetched curis, as represented by a hashmap where key -&gt;     * val represents (string)code -&gt; (integer)count.     
*      * <b>Note: </b> All the values are wrapped with a     * {@link LongWrapper LongWrapper}     *      * @return statusCodeDistribution     */    public Hashtable getStatusCodeDistribution() {    	        return statusCodeDistribution;    }       /**     * Return a HashMap representing the distribution of DNS status codes for     * successfully fetched curis, as represented by a hashmap where key -&gt;     * val represents (string)code -&gt; (integer)count.     *      * <b>Note: </b> All the values are wrapped with a     * {@link LongWrapper LongWrapper}     *      * @return dnsStatusCodeDistribution     */    public Hashtable getDnsStatusCodeDistribution() {    	return dnsStatusCodeDistribution;    }        public Hashtable getDnsMimeDistribution() {        return mimeTypeDnsDistribution;    }    public long getTotalDnsStatusCodeDocuments() {    	return totalDnsStatusCodeDocuments;    }        public long getTotalStatusCodeDocuments() {    	return totalStatusCodeDocuments;    }          public long getTotalHostDocuments() {       	return totalHostDocuments;    }        public long getTotalDnsHostDocuments() {       	return totalDnsHostDocuments;    }        public Hashtable getHostsDnsDistribution() {    	return hostsDnsDistribution;    }        public long getTotalHostDnsDocuments() {    	return totalDnsHostDocuments;    }        public long getTotalHostSize() {    	return totalHostSize;    }        public long getTotalDnsHostSize() {    	return totalDnsHostSize;    }        public Hashtable getTldDistribution() {    	return tldDistribution;    }        public Hashtable getTldBytes() {    	return tldBytes;    }        public long getTotalTldDocuments() {    	return totalTldDocuments;    }        public long getTotalTldSize() {    	return totalTldSize;    }        public Hashtable getTldHostDistribution() {    	return tldHostDistribution;    }        public long getTotalHosts() {    	return totalHosts;    }        public String getDurationTime() {    	return 
durationTime;    }        public String getProcessedDocsPerSec() {    	return processedDocsPerSec;    }        public String getBandwidthKbytesPerSec() {    	return bandwidthKbytesPerSec;    }        public String getTotalDataWritten() {    	return totalDataWritten;    }    /**     * Sort the entries of the given HashMap in descending order by their     * values, which must be longs wrapped with <code>LongWrapper</code>.     * <p>     * Elements are sorted by value from largest to smallest. Equal values are     * sorted in an arbitrary, but consistent manner by their keys. Only items     * with identical value and key are considered equal.     *     * If the passed-in map requires access to be synchronized, the caller     * should ensure this synchronization.      *      * @param mapOfLongWrapperValues     *            Assumes values are wrapped with LongWrapper.     * @return a sorted set containing the same elements as the map.     */    public TreeMap getReverseSortedCopy(final Map mapOfLongWrapperValues) {        TreeMap sortedMap = new TreeMap(new Comparator() {            public int compare(Object e1, Object e2) {                long firstVal = ((LongWrapper)mapOfLongWrapperValues.get(e1)).                    longValue;                long secondVal = ((LongWrapper)mapOfLongWrapperValues.get(e2)).                    longValue;                if (firstVal < secondVal) {                    return 1;                }                if (secondVal < firstVal) {                    return -1;                }                // If the values are the same, sort by keys.                return ((String)e1).compareTo((String)e2);            }        });        try {            sortedMap.putAll(mapOfLongWrapperValues);        } catch (UnsupportedOperationException e) {            Iterator i = mapOfLongWrapperValues.keySet().iterator();            for (;i.hasNext();) {                // Ok. Try doing it the slow way then.                
Object key = i.next();                sortedMap.put(key, mapOfLongWrapperValues.get(key));            }        }        return sortedMap;    }         /**     * Get the number of hosts with a particular TLD.     * @param tld     * 				top-level domain name     * @return		Total crawled hosts     */    public long getHostsPerTld(String tld) {    	LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld);    	return (lw == null ? 0 : lw.longValue);    }        /**     * Read status code distribution from responsecode-report.txt.     * DNS and HTTP status codes are separated when read.     * @return True if we found some stats.     */    private boolean calculateStatusCodeDistribution() {    	// Read from responsecode-report.txt    	File f = new File(cjob.getDirectory(), "responsecode-report.txt");    	if (!f.exists()) {    		return false;    	}    	BufferedReader br = null;    	try {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -