📄 statisticstracker.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
        return totalKBPerSec;    }    public int currentProcessedKBPerSec(){        return currentKBPerSec;    }    /** Returns a HashMap that contains information about distributions of     *  encountered mime types.  Key/value pairs represent     *  mime type -> count.     * <p>     * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper}     * @return mimeTypeDistribution     */    public Hashtable<String,LongWrapper> getFileDistribution() {        return mimeTypeDistribution;    }    /**     * Increment a counter for a key in a given HashMap. Used for various     * aggregate data.     *      * As this is used to change Maps which depend on StatisticsTracker     * for their synchronization, this method should only be invoked     * from a a block synchronized on 'this'.      *     * @param map The HashMap     * @param key The key for the counter to be incremented, if it does not     *               exist it will be added (set to 1).  If null it will     *            increment the counter "unknown".     */    protected static void incrementMapCount(Map<String,LongWrapper> map,             String key) {    	incrementMapCount(map,key,1);    }    /**     * Increment a counter for a key in a given HashMap by an arbitrary amount.     * Used for various aggregate data. The increment amount can be negative.     *     * As this is used to change Maps which depend on StatisticsTracker     * for their synchronization, this method should only be invoked     * from a a block synchronized on 'this'.      *     * @param map     *            The HashMap     * @param key     *            The key for the counter to be incremented, if it does not exist     *            it will be added (set to equal to <code>increment</code>).     *            If null it will increment the counter "unknown".     * @param increment     *            The amount to increment counter related to the <code>key</code>.     
*/    protected static void incrementMapCount(Map<String,LongWrapper> map,             String key, long increment) {        if (key == null) {            key = "unknown";        }        LongWrapper lw = (LongWrapper)map.get(key);        if(lw == null) {            map.put(key, new LongWrapper(increment));        } else {            lw.longValue += increment;        }    }    /**     * Sort the entries of the given HashMap in descending order by their     * values, which must be longs wrapped with <code>LongWrapper</code>.     * <p>     * Elements are sorted by value from largest to smallest. Equal values are     * sorted in an arbitrary, but consistent manner by their keys. Only items     * with identical value and key are considered equal.     *     * If the passed-in map requires access to be synchronized, the caller     * should ensure this synchronization.      *      * @param mapOfLongWrapperValues     *            Assumes values are wrapped with LongWrapper.     * @return a sorted set containing the same elements as the map.     */    public TreeMap<String,LongWrapper> getReverseSortedCopy(            final Map<String,LongWrapper> mapOfLongWrapperValues) {        TreeMap<String,LongWrapper> sortedMap =           new TreeMap<String,LongWrapper>(new Comparator<String>() {            public int compare(String e1, String e2) {                long firstVal = mapOfLongWrapperValues.get(e1).                    longValue;                long secondVal = mapOfLongWrapperValues.get(e2).                    longValue;                if (firstVal < secondVal) {                    return 1;                }                if (secondVal < firstVal) {                    return -1;                }                // If the values are the same, sort by keys.                
return e1.compareTo(e2);            }        });        try {            sortedMap.putAll(mapOfLongWrapperValues);        } catch (UnsupportedOperationException e) {            Iterator<String> i = mapOfLongWrapperValues.keySet().iterator();            for (;i.hasNext();) {                // Ok. Try doing it the slow way then.                String key = i.next();                sortedMap.put(key, mapOfLongWrapperValues.get(key));            }        }        return sortedMap;    }    /**     * Return a HashMap representing the distribution of status codes for     * successfully fetched curis, as represented by a hashmap where key -&gt;     * val represents (string)code -&gt; (integer)count.     *      * <b>Note: </b> All the values are wrapped with a     * {@link LongWrapper LongWrapper}     *      * @return statusCodeDistribution     */    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {        return statusCodeDistribution;    }        /**     * Returns the time (in millisec) when a URI belonging to a given host was     * last finished processing.      *      * @param host The host to look up time of last completed URI.     * @return Returns the time (in millisec) when a URI belonging to a given      * host was last finished processing. If no URI has been completed for host     * -1 will be returned.      */    public long getHostLastFinished(String host){        Long l = null;        synchronized(hostsLastFinished){            l = (Long)hostsLastFinished.get(host);        }        return (l != null)? l.longValue(): -1;    }    /**     * Returns the accumulated number of bytes downloaded from a given host.     
* @param host name of the host     * @return the accumulated number of bytes downloaded from a given host     */    public long getBytesPerHost(String host){        synchronized(hostsBytes){            return ((LongWrapper)hostsBytes.get(host)).longValue;        }    }    /**     * Returns the accumulated number of bytes from files of a given file type.     * @param filetype Filetype to check.     * @return the accumulated number of bytes from files of a given mime type     */    public long getBytesPerFileType(String filetype){        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;    }    /**     * Get the total number of ToeThreads (sleeping and active)     *     * @return The total number of ToeThreads     */    public int threadCount() {        return this.controller != null? controller.getToeCount(): 0;    }    /**     * @return Current thread count (or zero if can't figure it out).     */     public int activeThreadCount() {        return this.controller != null? controller.getActiveToeCount(): 0;        // note: reuse of old busy value seemed misleading: anyone asking        // for thread count when paused or stopped still wants accurate reading    }    /**     * This returns the number of completed URIs as a percentage of the total     * number of URIs encountered (should be inverse to the discovery curve)     *     * @return The number of completed URIs as a percentage of the total     * number of URIs encountered     */    public int percentOfDiscoveredUrisCompleted() {        long completed = finishedUriCount();        long total = discoveredUriCount();        if (total == 0) {            return 0;        }        return (int) (100 * completed / total);    }    /**     * Number of <i>discovered</i> URIs.     *     * <p>If crawl not running (paused or stopped) this will return the value of     * the last snapshot.     
*     * @return A count of all uris encountered     *     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()     */    public long discoveredUriCount() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null?            controller.getFrontier().discoveredUriCount() : discoveredUriCount;    }    /**     * Number of URIs that have <i>finished</i> processing.     *     * @return Number of URIs that have finished processing     *     * @see org.archive.crawler.framework.Frontier#finishedUriCount()     */    public long finishedUriCount() {        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null ?            controller.getFrontier().finishedUriCount() : finishedUriCount;    }    /**     * Get the total number of failed fetch attempts (connection failures -> give up, etc)     *     * @return The total number of failed fetch attempts     */    public long failedFetchAttempts() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null ?            controller.getFrontier().failedFetchCount() : downloadFailures;    }    /**     * Get the total number of failed fetch attempts (connection failures -> give up, etc)     *     * @return The total number of failed fetch attempts     */    public long disregardedFetchAttempts() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null?            
controller.getFrontier().disregardedUriCount() : downloadDisregards;    }    public long successfullyFetchedCount() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null?            controller.getFrontier().succeededFetchCount() : downloadedUriCount;    }        public long totalCount() {        return queuedUriCount() + activeThreadCount() +            successfullyFetchedCount();    }    /**     * Ratio of number of threads that would theoretically allow     * maximum crawl progress (if each was as productive as current     * threads), to current number of threads.     *      * @return float congestion ratio      */    public float congestionRatio() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null ?            controller.getFrontier().congestionRatio() : congestionRatio;    }        /**     * Ordinal position of the 'deepest' URI eligible      * for crawling. Essentially, the length of the longest     * frontier internal queue.      *      * @return long URI count to deepest URI     */    public long deepestUri() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null ?            controller.getFrontier().deepestUri() : deepestUri;    }        /**     * Average depth of the last URI in all eligible queues.     * That is, the average length of all eligible queues.     
*      * @return long average depth of last URIs in queues      */    public long averageDepth() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null ?            controller.getFrontier().averageDepth() : averageDepth;    }        /**     * Number of URIs <i>queued</i> up and waiting for processing.     *     * <p>If crawl not running (paused or stopped) this will return the value     * of the last snapshot.     *     * @return Number of URIs queued up and waiting for processing.     *     * @see org.archive.crawler.framework.Frontier#queuedUriCount()     */    public long queuedUriCount() {        // While shouldrun is true we can use info direct from the crawler.        // After that our last snapshot will have to do.        return shouldrun && this.controller != null &&                this.controller.getFrontier() != null?            controller.getFrontier().queuedUriCount() : queuedUriCount;    }    /** @deprecated use totalBytesCrawled */     public long totalBytesWritten() {        // return totalBytesCrawled();         return shouldrun && this.controller != null &&                this.controller.getFrontier() != null?            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;    }        public long totalBytesCrawled() {        return shouldrun ?            crawledBytes.getTotal() : totalProcessedBytes;    }        public String crawledBytesSummary() {        return crawledBytes.summary();    }    /**     * If the curi is a seed, we update the processedSeeds table.     *     * @param curi The CrawlURI that may be a seed.     * @param disposition The dispositino of the CrawlURI.     
*/
    private void handleSeed(CrawlURI curi, String disposition) {
        if (curi.isSeed()) {
            // Record (or overwrite) the seed's entry, keyed by its URI.
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_SUCCESS);
        // save crawled bytes tally
        crawledBytes.accumulate(curi);

        // Save status codes
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -