⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 statisticssummary.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
	    	FileReader reader = new FileReader(f);	    	br = new BufferedReader(reader);	    	String line = br.readLine();  // Ignore heading	    	line = br.readLine();	    	while (line != null) {  	  	    	  // Get status code and # urls which are seperated by a space	    	  	    	  String[] items = line.split(" ");	    	  if (items.length < 2) {	    		  logger.log(Level.WARNING,                          "Unexpected formatting on line [" + line + "]");	    	  }	    	  else {	    		  // See if DNS or HTTP status code	    		  if (items[0].length() < 3) {	    			  // DNS status code	    			  long total = Long.parseLong(items[1]);	    			  dnsStatusCodeDistribution.put(items[0], 	    					  new LongWrapper(total));	    			  totalDnsStatusCodeDocuments += total;	    		  }	    		  else {	    			  // HTTP status code	    			  long total = Long.parseLong(items[1]);	    			  statusCodeDistribution.put(items[0], 	    					  new LongWrapper(total));	    			  totalStatusCodeDocuments += total;	    		  }	    	  }	    	  line = br.readLine();	    	}    	} catch (IOException e) {    		logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),    			e);    	} finally {    		if (br != null) {    			try {					br.close();				} catch (IOException e) {					logger.log(Level.SEVERE,						"Closing " + f.getAbsolutePath(), e);				}    		}    	}    	return true;    }        /**     * Read MIME type data from mimetype-report.txt.     * MIME type of text/dns is separated from other MIME types.     * @return True if we found some stats.     
*/    private boolean calculateMimeTypeDistribution() {    	    	File f = new File(cjob.getDirectory(), "mimetype-report.txt");    	if (!f.exists()) {    		return false;    	}    	BufferedReader br = null;    	try {	    	FileReader reader = new FileReader(f);	    	br = new BufferedReader(reader);	    	String line = br.readLine();  // Ignore heading	    	line = br.readLine();	    	while (line != null) {	    			    	  	    		// Get num urls, num bytes, and MIME type (seperated by a space)	    		// Example: 12 134279 text/html  	    		String[] items = line.split(" ");	    		if (items.length < 3) {	    			logger.log(Level.WARNING,                            "Unexpected formatting on line [" + line + "]");	    		}	    		else {	    			long total = Long.parseLong(items[0]);	    			long bytes = Long.parseLong(items[1]);	    			String mime = items[2];	    			// Seperate DNS reconrds from HTTP	    			if (mime.equalsIgnoreCase("text/dns")) {	    				mimeTypeDnsDistribution.put(mime,                                new LongWrapper(total));	    				mimeTypeDnsBytes.put(mime, new LongWrapper(bytes));	    				totalDnsMimeTypeDocuments += total;	    				totalDnsMimeSize += bytes;	    			}	    			else {	    				mimeTypeDistribution.put(mime, new LongWrapper(total));	    				mimeTypeBytes.put(mime, new LongWrapper(bytes));	    				totalMimeTypeDocuments += total;	    				totalMimeSize += bytes;	    			}	    		}	    		line = br.readLine();	    	}    	} catch (IOException e) {    		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);    	} finally {    		if (br != null) {    			try {    				br.close();    			} catch (IOException e) {    				logger.log(Level.SEVERE,    					"Closing " + f.getAbsolutePath(), e);    			}    		}    	}    	return true;    }        /**     * Read number of URLs and total bytes for each host name from     * hosts-report.txt.     * Host name of "dns:" is separated from others.     * @return true if stats found.     
*/    private boolean calculateHostsDistribution() {    	File f = new File(cjob.getDirectory(), "hosts-report.txt");    	if (!f.exists()) {    		return false;    	}    	BufferedReader br = null;    	try {	    	FileReader reader = new FileReader(f);	    	br = new BufferedReader(reader);	    	String line = br.readLine();  // Ignore heading	    	line = br.readLine();	    	while (line != null) {    	  	    		// Get num urls, num bytes, and host name (seperated by a space)	    		// Example: 9 7468 www.blogger.com	    		String[] items = line.split(" ");	    		if (items.length < 3) {	    			logger.log(Level.WARNING,                            "Unexpected formatting on line [" + line + "]");	    		}	    		else {	    			long total = Long.parseLong(items[0]);	    			long bytes = Long.parseLong(items[1]);	    			String host = items[2];	    			// Seperate DNS reconrds from HTTP	    			if (host.startsWith("dns:", 0)) {	    				hostsDnsDistribution.put(host, new LongWrapper(total));	    				hostsDnsBytes.put(host, new LongWrapper(bytes));	    				totalDnsHostDocuments += total;	    				totalDnsHostSize += bytes;	    			}	    			else {	    				hostsDistribution.put(host, new LongWrapper(total));	    				hostsBytes.put(host, new LongWrapper(bytes));	    				totalHostDocuments += total;	    				totalHostSize += bytes;	    				// Count top level domain (TLD)	    				String tld = host.substring(host.lastIndexOf('.')+1);	    				incrementMapCount(tldDistribution, tld, total);   	    				incrementMapCount(tldBytes, tld, bytes);	    				incrementMapCount(tldHostDistribution, tld);	    				totalTldDocuments += total;	    				totalTldSize += bytes;	    				totalHosts++;	    			}	    		}	    		line = br.readLine();	    	}    	} catch (IOException e) {    		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);    	} finally {    		if (br != null) {    			try {    				br.close();    			} catch (IOException e) {    				logger.log(Level.SEVERE,    					"Closing " + 
f.getAbsolutePath(), e);    			}    		}    	}    	return true;    }    /**     * Returns the accumulated number of bytes downloaded from a given host.     * @param host name of the host     * @return the accumulated number of bytes downloaded from a given host     */    public long getBytesPerHost(String host) {     	long bytes = -1;    	    	bytes = host != null && host.startsWith("dns:", 0) ? 	    	((LongWrapper)hostsDnsBytes.get(host)).longValue :	    	((LongWrapper)hostsBytes.get(host)).longValue;	        	    	return bytes;    }        /**     * Returns the total number of bytes downloaded for a given TLD.     * @param tld TLD     * @return the total number of bytes downloaded for a given TLD     */    public long getBytesPerTld(String tld) {    	LongWrapper lw = (LongWrapper)tldBytes.get(tld);    	return (lw == null ? 0 : lw.longValue);    }    /**     * Returns the accumulated number of bytes from files of a given file type.     * @param filetype Filetype to check.     * @return the accumulated number of bytes from files of a given mime type     */    public long getBytesPerMimeType(String filetype) {    	long bytes = -1;    	    	if (filetype != null) {    		    	if (filetype.equals("text/dns")) {	    			    		bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :	    			((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue;	    	}	    	else {	    		bytes = mimeTypeBytes.get(filetype) == null ? 0 :	    			((LongWrapper)mimeTypeBytes.get(filetype)).longValue;	    	}    	}    	return bytes;    }        /**     * Reads duration time, processed docs/sec, bandwidth, and total size     * of crawl from crawl-report.txt.     * @return true if stats found.     
*/
    public boolean readCrawlReport() {
        File f = new File(cjob.getDirectory(), "crawl-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(f));
            for (String line = br.readLine(); line != null;
                    line = br.readLine()) {
                // Everything after the first ':' is the value; when the line
                // has no ':' this is the whole line.
                String value = line.substring(line.indexOf(':') + 1);
                if (line.startsWith("Duration Time")) {
                    durationTime = value;
                } else if (line.startsWith("Processed docs/sec")) {
                    processedDocsPerSec = value;
                } else if (line.startsWith("Bandwidth in Kbytes/sec")) {
                    bandwidthKbytesPerSec = value;
                } else if (line.startsWith("Total Raw Data Size in Bytes")) {
                    totalDataWritten = value;
                }
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Failed close of " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Returns sorted Iterator of seeds records based on status code.
     * Records are ordered with status code zero first, then negative codes
     * in order of ascending absolute value, then positive codes in
     * descending order. Records with equal codes are ordered by URI.
     * @return sorted Iterator of seeds records
     */
    public Iterator getSeedRecordsSortedByStatusCode() {
        TreeSet sortedSet = new TreeSet(new Comparator() {
            public int compare(Object e1, Object e2) {
                SeedRecord sr1 = (SeedRecord)e1;
                SeedRecord sr2 = (SeedRecord)e2;
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Rank the sign groups explicitly: zero, then negatives,
                // then positives. The earlier "-code - Integer.MAX_VALUE"
                // trick depended on int wrap-around and mapped status code 1
                // to Integer.MIN_VALUE, which sorted it ahead of zero.
                int group1 = code1 == 0 ? 0 : (code1 < 0 ? 1 : 2);
                int group2 = code2 == 0 ? 0 : (code2 < 0 ? 1 : 2);
                if (group1 != group2) {
                    return group1 < group2 ? -1 : 1;
                }
                // Same sign, different values: the numerically larger code
                // comes first. For negatives that is ascending absolute
                // value, for positives it is descending order.
                return code1 > code2 ? -1 : 1;
            }
        });
        for (Iterator iterator = processedSeedsRecords.values().iterator();
                iterator.hasNext();) {
            SeedRecord sr = (SeedRecord)iterator.next();
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    /**
     * Reads seed data from seeds-report.txt.
     * Lines with fewer than three fields or with a non-numeric status code
     * are logged and skipped.
     * @return True if stats found.
     */
    private boolean readSeedReport() {
        File f = new File(cjob.getDirectory(), "seeds-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(f));
            // Ignore heading: [code] [status] [seed] [redirect]
            br.readLine();
            for (String line = br.readLine(); line != null;
                    line = br.readLine()) {
                // Example lines:
                // 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
                // 200 CRAWLED http://noleeo.com/
                String[] items = line.split(" ");
                if (items.length < 3) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                    continue;
                }

                // A non-numeric status code used to escape as an unchecked
                // exception; skip just the offending line instead.
                int statusCode;
                try {
                    statusCode = Integer.parseInt(items[0]);
                } catch (NumberFormatException e) {
                    logger.log(Level.WARNING,
                        "Unexpected number format on line [" + line + "]", e);
                    continue;
                }
                String seed = items[2];
                // The redirect target is optional.
                String redirect = items.length > 3 ? items[3] : null;

                // All values should be CRAWLED or NOTCRAWLED; anything other
                // than CRAWLED is recorded as a failure.
                String crawlStatus;
                if (items[1].equals("CRAWLED")) {
                    crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
                } else {
                    crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
                }
                processedSeedsRecords.put(seed,
                    new SeedRecord(seed, crawlStatus, statusCode, redirect));
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     *
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        return getReverseSortedCopy(hostsDistribution);
    }

    /**
     * @return True if we compiled stats, false if none to compile (e.g.
     * there are no reports files on disk).
     */
    public boolean isStats() {
        return this.stats;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -