📄 statisticstracker.java
字号:
return totalKBPerSec; } public int currentProcessedKBPerSec(){ return currentKBPerSec; } /** Returns a HashMap that contains information about distributions of * encountered mime types. Key/value pairs represent * mime type -> count. * <p> * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper} * @return mimeTypeDistribution */ public Hashtable<String,LongWrapper> getFileDistribution() { return mimeTypeDistribution; } /** * Increment a counter for a key in a given HashMap. Used for various * aggregate data. * * As this is used to change Maps which depend on StatisticsTracker * for their synchronization, this method should only be invoked * from a a block synchronized on 'this'. * * @param map The HashMap * @param key The key for the counter to be incremented, if it does not * exist it will be added (set to 1). If null it will * increment the counter "unknown". */ protected static void incrementMapCount(Map<String,LongWrapper> map, String key) { incrementMapCount(map,key,1); } /** * Increment a counter for a key in a given HashMap by an arbitrary amount. * Used for various aggregate data. The increment amount can be negative. * * As this is used to change Maps which depend on StatisticsTracker * for their synchronization, this method should only be invoked * from a a block synchronized on 'this'. * * @param map * The HashMap * @param key * The key for the counter to be incremented, if it does not exist * it will be added (set to equal to <code>increment</code>). * If null it will increment the counter "unknown". * @param increment * The amount to increment counter related to the <code>key</code>. */ protected static void incrementMapCount(Map<String,LongWrapper> map, String key, long increment) { if (key == null) { key = "unknown"; } LongWrapper lw = (LongWrapper)map.get(key); if(lw == null) { map.put(key, new LongWrapper(increment)); } else { lw.longValue += increment; } } /** * Sort the entries of the given HashMap in descending order by their * values, which must be longs wrapped with <code>LongWrapper</code>. * <p> * Elements are sorted by value from largest to smallest. Equal values are * sorted in an arbitrary, but consistent manner by their keys. Only items * with identical value and key are considered equal. * * If the passed-in map requires access to be synchronized, the caller * should ensure this synchronization. * * @param mapOfLongWrapperValues * Assumes values are wrapped with LongWrapper. * @return a sorted set containing the same elements as the map. */ public TreeMap<String,LongWrapper> getReverseSortedCopy( final Map<String,LongWrapper> mapOfLongWrapperValues) { TreeMap<String,LongWrapper> sortedMap = new TreeMap<String,LongWrapper>(new Comparator<String>() { public int compare(String e1, String e2) { long firstVal = mapOfLongWrapperValues.get(e1). longValue; long secondVal = mapOfLongWrapperValues.get(e2). longValue; if (firstVal < secondVal) { return 1; } if (secondVal < firstVal) { return -1; } // If the values are the same, sort by keys. return e1.compareTo(e2); } }); try { sortedMap.putAll(mapOfLongWrapperValues); } catch (UnsupportedOperationException e) { Iterator<String> i = mapOfLongWrapperValues.keySet().iterator(); for (;i.hasNext();) { // Ok. Try doing it the slow way then. String key = i.next(); sortedMap.put(key, mapOfLongWrapperValues.get(key)); } } return sortedMap; } /** * Return a HashMap representing the distribution of status codes for * successfully fetched curis, as represented by a hashmap where key -> * val represents (string)code -> (integer)count. * * <b>Note: </b> All the values are wrapped with a * {@link LongWrapper LongWrapper} * * @return statusCodeDistribution */ public Hashtable<String,LongWrapper> getStatusCodeDistribution() { return statusCodeDistribution; } /** * Returns the time (in millisec) when a URI belonging to a given host was * last finished processing. * * @param host The host to look up time of last completed URI. * @return Returns the time (in millisec) when a URI belonging to a given * host was last finished processing. If no URI has been completed for host * -1 will be returned. */ public long getHostLastFinished(String host){ Long l = null; synchronized(hostsLastFinished){ l = (Long)hostsLastFinished.get(host); } return (l != null)? l.longValue(): -1; } /** * Returns the accumulated number of bytes downloaded from a given host. * @param host name of the host * @return the accumulated number of bytes downloaded from a given host */ public long getBytesPerHost(String host){ synchronized(hostsBytes){ return ((LongWrapper)hostsBytes.get(host)).longValue; } } /** * Returns the accumulated number of bytes from files of a given file type. * @param filetype Filetype to check. * @return the accumulated number of bytes from files of a given mime type */ public long getBytesPerFileType(String filetype){ return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue; } /** * Get the total number of ToeThreads (sleeping and active) * * @return The total number of ToeThreads */ public int threadCount() { return this.controller != null? controller.getToeCount(): 0; } /** * @return Current thread count (or zero if can't figure it out). */ public int activeThreadCount() { return this.controller != null? controller.getActiveToeCount(): 0; // note: reuse of old busy value seemed misleading: anyone asking // for thread count when paused or stopped still wants accurate reading } /** * This returns the number of completed URIs as a percentage of the total * number of URIs encountered (should be inverse to the discovery curve) * * @return The number of completed URIs as a percentage of the total * number of URIs encountered */ public int percentOfDiscoveredUrisCompleted() { long completed = finishedUriCount(); long total = discoveredUriCount(); if (total == 0) { return 0; } return (int) (100 * completed / total); } /** * Number of <i>discovered</i> URIs. * * <p>If crawl not running (paused or stopped) this will return the value of * the last snapshot. * * @return A count of all uris encountered * * @see org.archive.crawler.framework.Frontier#discoveredUriCount() */ public long discoveredUriCount() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null? controller.getFrontier().discoveredUriCount() : discoveredUriCount; } /** * Number of URIs that have <i>finished</i> processing. * * @return Number of URIs that have finished processing * * @see org.archive.crawler.framework.Frontier#finishedUriCount() */ public long finishedUriCount() { return shouldrun && this.controller != null && this.controller.getFrontier() != null ? controller.getFrontier().finishedUriCount() : finishedUriCount; } /** * Get the total number of failed fetch attempts (connection failures -> give up, etc) * * @return The total number of failed fetch attempts */ public long failedFetchAttempts() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null ? controller.getFrontier().failedFetchCount() : downloadFailures; } /** * Get the total number of failed fetch attempts (connection failures -> give up, etc) * * @return The total number of failed fetch attempts */ public long disregardedFetchAttempts() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null? controller.getFrontier().disregardedUriCount() : downloadDisregards; } public long successfullyFetchedCount() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null? controller.getFrontier().succeededFetchCount() : downloadedUriCount; } public long totalCount() { return queuedUriCount() + activeThreadCount() + successfullyFetchedCount(); } /** * Ratio of number of threads that would theoretically allow * maximum crawl progress (if each was as productive as current * threads), to current number of threads. * * @return float congestion ratio */ public float congestionRatio() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null ? controller.getFrontier().congestionRatio() : congestionRatio; } /** * Ordinal position of the 'deepest' URI eligible * for crawling. Essentially, the length of the longest * frontier internal queue. * * @return long URI count to deepest URI */ public long deepestUri() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null ? controller.getFrontier().deepestUri() : deepestUri; } /** * Average depth of the last URI in all eligible queues. * That is, the average length of all eligible queues. * * @return long average depth of last URIs in queues */ public long averageDepth() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null ? controller.getFrontier().averageDepth() : averageDepth; } /** * Number of URIs <i>queued</i> up and waiting for processing. * * <p>If crawl not running (paused or stopped) this will return the value * of the last snapshot. * * @return Number of URIs queued up and waiting for processing. * * @see org.archive.crawler.framework.Frontier#queuedUriCount() */ public long queuedUriCount() { // While shouldrun is true we can use info direct from the crawler. // After that our last snapshot will have to do. return shouldrun && this.controller != null && this.controller.getFrontier() != null? controller.getFrontier().queuedUriCount() : queuedUriCount; } /** @deprecated use totalBytesCrawled */ public long totalBytesWritten() { // return totalBytesCrawled(); return shouldrun && this.controller != null && this.controller.getFrontier() != null? controller.getFrontier().totalBytesWritten() : totalProcessedBytes; } public long totalBytesCrawled() { return shouldrun ? crawledBytes.getTotal() : totalProcessedBytes; } public String crawledBytesSummary() { return crawledBytes.summary(); } /** * If the curi is a seed, we update the processedSeeds table. * * @param curi The CrawlURI that may be a seed. * @param disposition The dispositino of the CrawlURI. */ private void handleSeed(CrawlURI curi, String disposition) { if(curi.isSeed()){ SeedRecord sr = new SeedRecord(curi, disposition); processedSeedsRecords.put(sr.getUri(), sr); } } public void crawledURISuccessful(CrawlURI curi) { handleSeed(curi,SEED_DISPOSITION_SUCCESS); // save crawled bytes tally crawledBytes.accumulate(curi); // Save status codes incrementMapCount(statusCodeDistribution, Integer.toString(curi.getFetchStatus())); // Save mime types String mime = MimetypeUtils.truncate(curi.getContentType()); incrementMapCount(mimeTypeDistribution, mime); incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -