📄 statisticssummary.java
字号:
FileReader reader = new FileReader(f); br = new BufferedReader(reader); String line = br.readLine(); // Ignore heading line = br.readLine(); while (line != null) { // Get status code and # urls which are seperated by a space String[] items = line.split(" "); if (items.length < 2) { logger.log(Level.WARNING, "Unexpected formatting on line [" + line + "]"); } else { // See if DNS or HTTP status code if (items[0].length() < 3) { // DNS status code long total = Long.parseLong(items[1]); dnsStatusCodeDistribution.put(items[0], new LongWrapper(total)); totalDnsStatusCodeDocuments += total; } else { // HTTP status code long total = Long.parseLong(items[1]); statusCodeDistribution.put(items[0], new LongWrapper(total)); totalStatusCodeDocuments += total; } } line = br.readLine(); } } catch (IOException e) { logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(), e); } finally { if (br != null) { try { br.close(); } catch (IOException e) { logger.log(Level.SEVERE, "Closing " + f.getAbsolutePath(), e); } } } return true; } /** * Read MIME type data from mimetype-report.txt. * MIME type of text/dns is separated from other MIME types. * @return True if we found some stats. 
*/ private boolean calculateMimeTypeDistribution() { File f = new File(cjob.getDirectory(), "mimetype-report.txt"); if (!f.exists()) { return false; } BufferedReader br = null; try { FileReader reader = new FileReader(f); br = new BufferedReader(reader); String line = br.readLine(); // Ignore heading line = br.readLine(); while (line != null) { // Get num urls, num bytes, and MIME type (seperated by a space) // Example: 12 134279 text/html String[] items = line.split(" "); if (items.length < 3) { logger.log(Level.WARNING, "Unexpected formatting on line [" + line + "]"); } else { long total = Long.parseLong(items[0]); long bytes = Long.parseLong(items[1]); String mime = items[2]; // Seperate DNS reconrds from HTTP if (mime.equalsIgnoreCase("text/dns")) { mimeTypeDnsDistribution.put(mime, new LongWrapper(total)); mimeTypeDnsBytes.put(mime, new LongWrapper(bytes)); totalDnsMimeTypeDocuments += total; totalDnsMimeSize += bytes; } else { mimeTypeDistribution.put(mime, new LongWrapper(total)); mimeTypeBytes.put(mime, new LongWrapper(bytes)); totalMimeTypeDocuments += total; totalMimeSize += bytes; } } line = br.readLine(); } } catch (IOException e) { logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); } finally { if (br != null) { try { br.close(); } catch (IOException e) { logger.log(Level.SEVERE, "Closing " + f.getAbsolutePath(), e); } } } return true; } /** * Read number of URLs and total bytes for each host name from * hosts-report.txt. * Host name of "dns:" is separated from others. * @return true if stats found. 
*/ private boolean calculateHostsDistribution() { File f = new File(cjob.getDirectory(), "hosts-report.txt"); if (!f.exists()) { return false; } BufferedReader br = null; try { FileReader reader = new FileReader(f); br = new BufferedReader(reader); String line = br.readLine(); // Ignore heading line = br.readLine(); while (line != null) { // Get num urls, num bytes, and host name (seperated by a space) // Example: 9 7468 www.blogger.com String[] items = line.split(" "); if (items.length < 3) { logger.log(Level.WARNING, "Unexpected formatting on line [" + line + "]"); } else { long total = Long.parseLong(items[0]); long bytes = Long.parseLong(items[1]); String host = items[2]; // Seperate DNS reconrds from HTTP if (host.startsWith("dns:", 0)) { hostsDnsDistribution.put(host, new LongWrapper(total)); hostsDnsBytes.put(host, new LongWrapper(bytes)); totalDnsHostDocuments += total; totalDnsHostSize += bytes; } else { hostsDistribution.put(host, new LongWrapper(total)); hostsBytes.put(host, new LongWrapper(bytes)); totalHostDocuments += total; totalHostSize += bytes; // Count top level domain (TLD) String tld = host.substring(host.lastIndexOf('.')+1); incrementMapCount(tldDistribution, tld, total); incrementMapCount(tldBytes, tld, bytes); incrementMapCount(tldHostDistribution, tld); totalTldDocuments += total; totalTldSize += bytes; totalHosts++; } } line = br.readLine(); } } catch (IOException e) { logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); } finally { if (br != null) { try { br.close(); } catch (IOException e) { logger.log(Level.SEVERE, "Closing " + f.getAbsolutePath(), e); } } } return true; } /** * Returns the accumulated number of bytes downloaded from a given host. * @param host name of the host * @return the accumulated number of bytes downloaded from a given host */ public long getBytesPerHost(String host) { long bytes = -1; bytes = host != null && host.startsWith("dns:", 0) ? 
((LongWrapper)hostsDnsBytes.get(host)).longValue : ((LongWrapper)hostsBytes.get(host)).longValue; return bytes; } /** * Returns the total number of bytes downloaded for a given TLD. * @param tld TLD * @return the total number of bytes downloaded for a given TLD */ public long getBytesPerTld(String tld) { LongWrapper lw = (LongWrapper)tldBytes.get(tld); return (lw == null ? 0 : lw.longValue); } /** * Returns the accumulated number of bytes from files of a given file type. * @param filetype Filetype to check. * @return the accumulated number of bytes from files of a given mime type */ public long getBytesPerMimeType(String filetype) { long bytes = -1; if (filetype != null) { if (filetype.equals("text/dns")) { bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 : ((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue; } else { bytes = mimeTypeBytes.get(filetype) == null ? 0 : ((LongWrapper)mimeTypeBytes.get(filetype)).longValue; } } return bytes; } /** * Reads duration time, processed docs/sec, bandwidth, and total size * of crawl from crawl-report.txt. * @return true if stats found. 
*/
    public boolean readCrawlReport() {
        File f = new File(cjob.getDirectory(), "crawl-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);
            String line = br.readLine();
            while (line != null) {
                // Each value of interest is whatever follows the first ':'
                // on its line, kept verbatim (no trimming).
                if (line.startsWith("Duration Time")) {
                    durationTime = line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Processed docs/sec")) {
                    processedDocsPerSec =
                        line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Bandwidth in Kbytes/sec")) {
                    bandwidthKbytesPerSec =
                        line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Total Raw Data Size in Bytes")) {
                    totalDataWritten = line.substring(line.indexOf(':') + 1);
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Failed close of " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Returns sorted Iterator of seeds records based on status code.
     * Records with status code zero come first, then negative codes in
     * order of ascending absolute value, then positive codes in descending
     * order. Records with equal status codes are ordered by URI.
     * @return sorted Iterator of seeds records
     */
    public Iterator getSeedRecordsSortedByStatusCode() {
        TreeSet sortedSet = new TreeSet(new Comparator() {
            public int compare(Object e1, Object e2) {
                SeedRecord sr1 = (SeedRecord) e1;
                SeedRecord sr2 = (SeedRecord) e2;
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Compare explicitly instead of relying on integer
                // wrap-around arithmetic: that trick fails to wrap for a
                // status code of 1 and would sort it ahead of zero.
                int group1 = group(code1);
                int group2 = group(code2);
                if (group1 != group2) {
                    return group1 < group2 ? -1 : 1;
                }
                // Same sign and different values: the larger value goes
                // first. For negatives that is ascending absolute value,
                // for positives it is descending order.
                return code1 > code2 ? -1 : 1;
            }

            /** Orders the groups: zero, then negatives, then positives. */
            private int group(int code) {
                if (code == 0) {
                    return 0;
                }
                return code < 0 ? 1 : 2;
            }
        });
        for (Iterator iterator = processedSeedsRecords.entrySet().iterator();
                iterator.hasNext();) {
            Map.Entry entry = (Map.Entry) iterator.next();
            SeedRecord sr = (SeedRecord) entry.getValue();
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    /**
     * Reads seed data from seeds-report.txt.
     * Lines with too few fields or a non-numeric status code are logged
     * and skipped.
     * @return True if stats found.
     */
    private boolean readSeedReport() {
        File f = new File(cjob.getDirectory(), "seeds-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);

            // Ignore heading: [code] [status] [seed] [redirect]
            String line = br.readLine();
            line = br.readLine();
            while (line != null) {
                // Example lines:
                // 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
                // 200 CRAWLED http://noleeo.com/
                String[] items = line.split(" ");
                if (items.length < 3) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                } else {
                    try {
                        int statusCode = Integer.parseInt(items[0]);
                        String crawlStatus = items[1];
                        String seed = items[2];
                        // The redirect column is optional.
                        String redirect = items.length > 3 ? items[3] : null;
                        // All values should be CRAWLED or NOTCRAWLED
                        if (crawlStatus.equals("CRAWLED")) {
                            crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
                        } else {
                            crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
                        }
                        SeedRecord sr = new SeedRecord(seed, crawlStatus,
                            statusCode, redirect);
                        processedSeedsRecords.put(seed, sr);
                    } catch (NumberFormatException e) {
                        // A non-numeric status code is a formatting problem
                        // with this one line only; report it and keep
                        // reading instead of aborting the whole summary.
                        logger.log(Level.WARNING,
                            "Unexpected formatting on line [" + line + "]", e);
                    }
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     *
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        return getReverseSortedCopy(hostsDistribution);
    }

    /**
     * @return True if we compiled stats, false if none to compile (e.g.
     * there are no reports files on disk).
     */
    public boolean isStats() {
        return this.stats;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -