📄 statisticstracker.java
字号:
// Save hosts stats.
// NOTE(review): tail of a method whose start is above this chunk — records
// per-host (and, when tagged, per-source) stats for a just-finished CrawlURI.
// Fetch status 1 is presumably a DNS lookup (matches the "dns:" label) —
// TODO confirm against the status-code constants.
saveHostStats((curi.getFetchStatus() == 1) ? "dns:" :
        this.controller.getServerCache().getHostFor(curi).getHostName(),
    curi.getContentSize());
if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
    saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
        this.controller.getServerCache().getHostFor(curi).getHostName());
}
}

/**
 * Record that a URI discovered via the given source (seed) tag landed on
 * the given host, bumping the per-source, per-host URI count.
 *
 * @param source source (seed) tag the URI was discovered from
 * @param hostname host the URI resolved to
 */
protected void saveSourceStats(String source, String hostname) {
    synchronized (sourceHostDistribution) {
        HashMap<String,LongWrapper> hostUriCount =
            sourceHostDistribution.get(source);
        if (hostUriCount == null) {
            hostUriCount = new HashMap<String,LongWrapper>();
        }
        // TODO: Dan suggests we don't need a hashtable value. Might
        // be faster if we went without. Could just have keys of:
        //   seed | host (concatenated as string)
        // and values of:
        //   #urls
        incrementMapCount(hostUriCount, hostname);
        sourceHostDistribution.put(source, hostUriCount);
    }
}

/**
 * Accumulate per-host statistics for one finished URI: URI count, byte
 * count, and the wall-clock time this host last finished a URI.
 * Each map is locked independently; the three updates are not atomic as
 * a group (same as the original behavior).
 *
 * @param hostname host name (or the "dns:" pseudo-host) to credit
 * @param size content size in bytes to add to the host's byte total
 */
protected void saveHostStats(String hostname, long size) {
    synchronized (hostsDistribution) {
        incrementMapCount(hostsDistribution, hostname);
    }
    synchronized (hostsBytes) {
        incrementMapCount(hostsBytes, hostname, size);
    }
    synchronized (hostsLastFinished) {
        // Long.valueOf over the deprecated new Long(...) constructor.
        hostsLastFinished.put(hostname,
            Long.valueOf(System.currentTimeMillis()));
    }
}

public void crawledURINeedRetry(CrawlURI curi) {
    handleSeed(curi, SEED_DISPOSITION_RETRY);
}

public void crawledURIDisregard(CrawlURI curi) {
    handleSeed(curi, SEED_DISPOSITION_DISREGARD);
}

public void crawledURIFailure(CrawlURI curi) {
    handleSeed(curi, SEED_DISPOSITION_FAILURE);
}

/**
 * Get a seed iterator for the job being monitored.
 *
 * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
 * UURIs like the Scope seed iterator. The strings are equal to the URIs'
 * getURIString() values.
 *
 * @return the seed iterator
 * FIXME: Consider using TransformingIterator here
 */
public Iterator<String> getSeeds() {
    // Snapshot into a plain local list (no need for Vector's locking) so
    // callers iterate independently of the live scope.
    List<String> seedsCopy = new ArrayList<String>();
    Iterator<UURI> i = controller.getScope().seedsIterator();
    while (i.hasNext()) {
        seedsCopy.add(i.next().toString());
    }
    return seedsCopy.iterator();
}

public Iterator getSeedRecordsSortedByStatusCode() {
    return getSeedRecordsSortedByStatusCode(getSeeds());
}

/**
 * Return seed records sorted by status code: zero first, then negative
 * codes by ascending absolute value, then positive codes descending;
 * ties broken by URI. Seeds with no record yet get a synthesized
 * "not processed" record.
 *
 * @param i iterator of seed URI strings
 * @return iterator of SeedRecords in the order described above
 */
protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
        Iterator<String> i) {
    TreeSet<SeedRecord> sortedSet =
        new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Mirror and shift the number line so as to place zero at
                // the beginning, then all negatives in order of ascending
                // absolute value, then all positives descending.
                // NOTE: deliberately relies on two's-complement wrap-around
                // for positive codes >= 2 — do not "fix" with long math.
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;
                // Plain int comparison instead of boxing two Integers.
                return (code1 < code2) ? -1 : ((code1 > code2) ? 1 : 0);
            }
        });
    while (i.hasNext()) {
        String seed = i.next();
        SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
        if (sr == null) {
            // Seed never made it through processing; record that fact.
            sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
            processedSeedsRecords.put(seed, sr);
        }
        sortedSet.add(sr);
    }
    return sortedSet.iterator();
}

public void crawlEnded(String message) {
    logger.info("Entered crawlEnded");
    this.sExitMessage = message; // held for reference by reports
    super.crawlEnded(message);
    logger.info("Leaving crawlEnded");
}

/**
 * Write the seeds report: one line per seed with status code,
 * crawled/not-crawled disposition, URI, and optional redirect target.
 * Also refreshes the seedsCrawled / seedsNotCrawled tallies.
 *
 * @param writer Where to write.
 */
protected void writeSeedsReportTo(PrintWriter writer) {
    // Build header.
    writer.print("[code] [status] [seed] [redirect]\n");
    seedsCrawled = 0;
    seedsNotCrawled = 0;
    for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
            i.hasNext();) {
        SeedRecord sr = (SeedRecord) i.next();
        writer.print(sr.getStatusCode());
        writer.print(" ");
        if ((sr.getStatusCode() > 0)) {
            seedsCrawled++;
            writer.print("CRAWLED");
        } else {
            seedsNotCrawled++;
            writer.print("NOTCRAWLED");
        }
        writer.print(" ");
        writer.print(sr.getUri());
        if (sr.getRedirectUri() != null) {
            writer.print(" ");
            writer.print(sr.getRedirectUri());
        }
        writer.print("\n");
    }
}

/**
 * Write the source report: for each source tag, the hosts reached from it
 * (largest URI count first) and the URI count per host.
 *
 * @param writer Where to write.
 */
protected void writeSourceReportTo(PrintWriter writer) {
    writer.print("[source] [host] [#urls]\n");
    // For each source...
    for (Iterator i = sourceHostDistribution.keySet().iterator();
            i.hasNext();) {
        Object sourceKey = i.next();
        Map<String,LongWrapper> hostCounts =
            (Map<String,LongWrapper>) sourceHostDistribution.get(sourceKey);
        // Sort hosts by #urls, descending.
        SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
        // ...emit one line per host.
        for (Iterator j = sortedHostCounts.keySet().iterator();
                j.hasNext();) {
            Object hostKey = j.next();
            LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
            writer.print(sourceKey.toString());
            writer.print(" ");
            writer.print(hostKey.toString());
            writer.print(" ");
            writer.print(hostCount.longValue);
            writer.print("\n");
        }
    }
}

/**
 * Return a copy of the hosts distribution in reverse-sorted (largest first)
 * order.
 *
 * @return SortedMap of hosts distribution
 */
public SortedMap getReverseSortedHostCounts(
        Map<String,LongWrapper> hostCounts) {
    synchronized (hostCounts) {
        return getReverseSortedCopy(hostCounts);
    }
}

/**
 * Write the hosts report: per host, URI count, byte count, and host name,
 * largest URI count first.
 *
 * @param writer Where to write.
 */
protected void writeHostsReportTo(PrintWriter writer) {
    SortedMap hd = getReverseSortedHostsDistribution();
    // Header.
    writer.print("[#urls] [#bytes] [host]\n");
    for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
        // Key is 'host'.
        Object key = i.next();
        if (hd.get(key) != null) {
            writer.print(((LongWrapper) hd.get(key)).longValue);
        } else {
            writer.print("-");
        }
        writer.print(" ");
        writer.print(getBytesPerHost((String) key));
        writer.print(" ");
        writer.print((String) key);
        writer.print("\n");
    }
}

/**
 * Return a copy of the hosts distribution in reverse-sorted
 * (largest first) order.
 * @return SortedMap of hosts distribution
 */
public SortedMap getReverseSortedHostsDistribution() {
    synchronized (hostsDistribution) {
        return getReverseSortedCopy(hostsDistribution);
    }
}

/**
 * Write the mime-types report: per mime type, URI count and byte count,
 * largest URI count first.
 *
 * @param writer Where to write.
 */
protected void writeMimetypesReportTo(PrintWriter writer) {
    // Header.
    writer.print("[#urls] [#bytes] [mime-types]\n");
    TreeMap fd = getReverseSortedCopy(getFileDistribution());
    for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
        Object key = i.next(); // Key is mime type.
        writer.print(Long.toString(((LongWrapper) fd.get(key)).longValue));
        writer.print(" ");
        writer.print(Long.toString(getBytesPerFileType((String) key)));
        writer.print(" ");
        writer.print((String) key);
        writer.print("\n");
    }
}

/**
 * Write the response-code report: per fetch status code, URI count,
 * largest count first.
 *
 * @param writer Where to write.
 */
protected void writeResponseCodeReportTo(PrintWriter writer) {
    // Build header.
    writer.print("[rescode] [#urls]\n");
    TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
    for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
        Object key = i.next();
        writer.print((String) key);
        writer.print(" ");
        writer.print(Long.toString(((LongWrapper) scd.get(key)).longValue));
        writer.print("\n");
    }
}

/**
 * Write the crawl summary report: name, exit status, duration, seed and
 * document totals, throughput, and byte totals broken down by novelty.
 *
 * @param writer Where to write.
 */
protected void writeCrawlReportTo(PrintWriter writer) {
    writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
    writer.print("\nCrawl Status: " + sExitMessage);
    writer.print("\nDuration Time: " +
        ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
    writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
    writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
    // hostsDistribution contains all hosts crawled plus an entry for dns,
    // hence the -1.
    writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
    writer.print("\nTotal Documents Crawled: " + finishedUriCount);
    writer.print("\nProcessed docs/sec: " +
        ArchiveUtils.doubleToString(docsPerSecond, 2));
    writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
    writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
        " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
        ") \n");
    writer.print("Novel Bytes: " +
        crawledBytes.get(CrawledBytesHistotable.NOVEL) + " (" +
        ArchiveUtils.formatBytesForDisplay(
            crawledBytes.get(CrawledBytesHistotable.NOVEL)) + ") \n");
    if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
        writer.print("Duplicate-by-hash Bytes: " +
            crawledBytes.get(CrawledBytesHistotable.DUPLICATE) + " (" +
            ArchiveUtils.formatBytesForDisplay(
                crawledBytes.get(CrawledBytesHistotable.DUPLICATE)) +
            ") \n");
    }
    if (crawledBytes.containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
        writer.print("Not-modified Bytes: " +
            crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED) + " (" +
            ArchiveUtils.formatBytesForDisplay(
                crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED)) +
            ") \n");
    }
}

protected void writeProcessorsReportTo(PrintWriter writer) {
    controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
}

/**
 * Write the named report to a file in the crawl's disk directory and add
 * the file to the crawl manifest. Failures are logged, not thrown.
 *
 * @param reportName name understood by writeReportTo
 * @param filename file name (relative to the crawl disk path)
 */
protected void writeReportFile(String reportName, String filename) {
    File f = new File(controller.getDisk().getPath(), filename);
    try {
        PrintWriter bw = new PrintWriter(new FileWriter(f));
        try {
            writeReportTo(reportName, bw);
        } finally {
            // FIX: was leaked if writeReportTo threw before close().
            bw.close();
        }
        controller.addToManifest(f.getAbsolutePath(),
            CrawlController.MANIFEST_REPORT_FILE, true);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
            " at the end of crawl.", e);
    }
    logger.info("wrote report: " + f.getAbsolutePath());
}

/**
 * @param writer Where to write.
 */
protected void writeManifestReportTo(PrintWriter writer) {
    controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
}

/**
 * Dispatch a report name to the matching writer.
 *
 * @param reportName Name of report.
 * @param w Where to write.
 */
private void writeReportTo(String reportName, PrintWriter w) {
    if ("hosts".equals(reportName)) {
        writeHostsReportTo(w);
    } else if ("mime types".equals(reportName)) {
        writeMimetypesReportTo(w);
    } else if ("response codes".equals(reportName)) {
        writeResponseCodeReportTo(w);
    } else if ("seeds".equals(reportName)) {
        writeSeedsReportTo(w);
    } else if ("crawl".equals(reportName)) {
        writeCrawlReportTo(w);
    } else if ("processors".equals(reportName)) {
        writeProcessorsReportTo(w);
    } else if ("manifest".equals(reportName)) {
        writeManifestReportTo(w);
    } else if ("frontier".equals(reportName)) {
        writeFrontierReportTo(w);
    } else if ("source".equals(reportName)) {
        writeSourceReportTo(w);
    }
    // TODO else default/error
}

/**
 * Write the Frontier's 'nonempty' report (if available)
 * @param writer to report to
 */
protected void writeFrontierReportTo(PrintWriter writer) {
    if (controller.getFrontier().isEmpty()) {
        writer.println("frontier empty");
    } else {
        controller.getFrontier().reportTo("nonempty", writer);
    }
}

/**
 * Run the reports.
 */
public void dumpReports() {
    // Add all files mentioned in the crawl order to the manifest set.
    controller.addOrderToManifest();
    writeReportFile("hosts", "hosts-report.txt");
    writeReportFile("mime types", "mimetype-report.txt");
    writeReportFile("response codes", "responsecode-report.txt");
    writeReportFile("seeds", "seeds-report.txt");
    writeReportFile("crawl", "crawl-report.txt");
    writeReportFile("processors", "processors-report.txt");
    writeReportFile("manifest", "crawl-manifest.txt");
    writeReportFile("frontier", "frontier-report.txt");
    if (!sourceHostDistribution.isEmpty()) {
        // Source report is only meaningful when source tagging was on.
        writeReportFile("source", "source-report.txt");
    }
    // TODO: Save object to disk?
}

public void crawlCheckpoint(File cpDir) throws Exception {
    // CrawlController is managing the checkpointing of this object.
    logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -