⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 statisticstracker.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
        // Save hosts stats. A fetch status of 1 is filed under the synthetic
        // host name "dns:"; anything else under the host name obtained from
        // the server cache.
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
                this.controller.getServerCache().
                getHostFor(curi).getHostName(),
                curi.getContentSize());

        // URIs that carry a source tag also feed the per-source host counts.
        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)){
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                    this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    /**
     * Count one more URI for <code>hostname</code> under <code>source</code>.
     * The whole update runs while holding the monitor of
     * <code>sourceHostDistribution</code>.
     *
     * @param source key into <code>sourceHostDistribution</code>
     * @param hostname host whose counter is incremented
     */
    protected void saveSourceStats(String source, String hostname) {
        synchronized(sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }
            // TODO: Dan suggests we don't need a hashtable value.  Might
            // be faster if we went without. Could just have keys of:
            //  seed | host (concatenated as string)
            // and values of:
            //  #urls
            incrementMapCount(hostUriCount, hostname);
            // Stored back on every call; this also registers a freshly
            // created per-source map.
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    /**
     * Record one finished URI for a host: bump its URI count, add
     * <code>size</code> to its byte count and remember the current time as
     * the moment the host last finished a URI. Each of the three maps is
     * updated under its own monitor.
     *
     * @param hostname host the statistics are filed under
     * @param size amount added to the host's byte count
     */
    protected void saveHostStats(String hostname, long size) {
        synchronized(hostsDistribution){
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized(hostsBytes){
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized(hostsLastFinished){
            // Long.valueOf instead of the deprecated boxing constructor.
            hostsLastFinished.put(hostname,
                Long.valueOf(System.currentTimeMillis()));
        }
    }

    /**
     * Pass the URI to <code>handleSeed</code> with the retry disposition.
     *
     * @param curi the URI that needs a retry
     */
    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_RETRY);
    }

    /**
     * Pass the URI to <code>handleSeed</code> with the disregard disposition.
     *
     * @param curi the URI that was disregarded
     */
    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_DISREGARD);
    }

    /**
     * Pass the URI to <code>handleSeed</code> with the failure disposition.
     *
     * @param curi the URI that failed
     */
    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values. The iterator runs over a private copy of the
     * seed list taken when this method is called.
     *
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    /**
     * Get the seed records of all current seeds, ordered by status code.
     *
     * @return iterator over the sorted <code>SeedRecord</code>s
     * @see #getSeedRecordsSortedByStatusCode(Iterator)
     */
    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    /**
     * Look up (or create) the seed record of every given seed and return the
     * records ordered by status code: zero first, then negative codes by
     * ascending absolute value, then positive codes descending. Records with
     * equal codes are ordered by URI.
     *
     * Seeds without a record yet get a new one with disposition
     * <code>SEED_DISPOSITION_NOT_PROCESSED</code>, which is also stored in
     * <code>processedSeedsRecords</code>.
     *
     * @param i iterator over seed URI strings
     * @return iterator over the sorted records
     */
    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
          new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // mirror and shift the number line so as to
                // place zero at the beginning, then all negatives
                // in order of ascending absolute value, then all
                // positives descending. The int wrap-around of the
                // subtraction is what moves the positives to the end.
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;

                // Plain comparison; no boxing needed.
                return (code1 < code2) ? -1 : ((code1 == code2) ? 0 : 1);
            }
        });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if(sr==null) {
                sr = new SeedRecord(seed,SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed,sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    /**
     * Remember the exit message for the reports, then delegate to the
     * superclass implementation.
     *
     * @param message exit message of the crawl
     */
    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message; // held for reference by reports
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * Write the seeds report: one line per seed with status code,
     * CRAWLED/NOTCRAWLED, the seed URI and, if set, the redirect URI.
     *
     * Side effect: resets and recomputes <code>seedsCrawled</code> and
     * <code>seedsNotCrawled</code>, which the crawl report prints. A seed
     * counts as crawled when its status code is greater than zero.
     *
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator<SeedRecord> i =
                getSeedRecordsSortedByStatusCode(getSeeds()); i.hasNext();) {
            SeedRecord sr = i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if((sr.getStatusCode() > 0)) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if(sr.getRedirectUri()!=null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }

    /**
     * Write the source report: for every source, one line per host with the
     * number of URIs counted for that host, hosts ordered as returned by
     * {@link #getReverseSortedHostCounts(Map)}.
     *
     * @param writer Where to write.
     */
    protected void writeSourceReportTo(PrintWriter writer) {

        writer.print("[source] [host] [#urls]\n");
        // for each source
        for (Iterator i = sourceHostDistribution.keySet().iterator();
                i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts
             = (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);
            // sort hosts by #urls
            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
            // for each host
            for (Iterator j = sortedHostCounts.keySet().iterator();
                    j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the given host counts in reverse-sorted (largest first)
     * order. The copy is taken while holding the monitor of
     * <code>hostCounts</code>.
     *
     * @param hostCounts map of host name to URI count
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized(hostCounts){
            return getReverseSortedCopy(hostCounts);
        }
    }

    /**
     * Write the hosts report: URI count, byte count and host name per line.
     * A host whose count is missing from the sorted copy is written with "-"
     * in the count column.
     *
     * @param writer Where to write.
     */
    protected void writeHostsReportTo(PrintWriter writer) {
        SortedMap hd = getReverseSortedHostsDistribution();
        // header
        writer.print("[#urls] [#bytes] [host]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            // Key is 'host'.
            Object key = i.next();
            if (hd.get(key)!=null) {
                writer.print(((LongWrapper)hd.get(key)).longValue);
            } else {
                writer.print("-");
            }
            writer.print(" ");
            writer.print(getBytesPerHost((String)key));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order. The copy is taken while holding the monitor of
     * <code>hostsDistribution</code>.
     *
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized(hostsDistribution){
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    /**
     * Write the mime-type report: URI count, byte count and mime type per
     * line, taken from a reverse-sorted copy of the file distribution.
     *
     * @param writer Where to write.
     */
    protected void writeMimetypesReportTo(PrintWriter writer) {
        // header
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            // Key is mime type.
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * Write the response-code report: status code and URI count per line,
     * taken from a reverse-sorted copy of the status code distribution.
     *
     * @param writer Where to write.
     */
    protected void writeResponseCodeReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    /**
     * Write the crawl summary report (name, status, duration, seed and host
     * totals, throughput and byte totals).
     *
     * The seed totals printed here are the counters computed by
     * {@link #writeSeedsReportTo(PrintWriter)}, so that report has to be
     * written first for them to be current.
     *
     * @param writer Where to write.
     */
    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
                ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        // hostsDistribution contains all hosts crawled plus, once a DNS
        // fetch has been recorded, an entry for "dns:". Discount that entry
        // only when it is really present, so a crawl without it does not
        // under-report (or print -1 for an empty map).
        long hostsCrawled;
        synchronized (hostsDistribution) {
            hostsCrawled = hostsDistribution.size();
            if (hostsDistribution.containsKey("dns:")) {
                hostsCrawled--;
            }
        }
        writer.print("\nTotal Hosts Crawled: " + hostsCrawled);
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
                ArchiveUtils.doubleToString(docsPerSecond,2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
                " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
                ") \n");
        writer.print("Novel Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.NOVEL)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                        crawledBytes.get(CrawledBytesHistotable.NOVEL))
                +  ") \n");
        // The duplicate and not-modified lines only appear when the
        // histotable has an entry for them.
        if(crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
            writer.print("Duplicate-by-hash Bytes: "
                    + crawledBytes.get(CrawledBytesHistotable.DUPLICATE)
                    + " (" + ArchiveUtils.formatBytesForDisplay(
                            crawledBytes.get(CrawledBytesHistotable.DUPLICATE))
                    +  ") \n");
        }
        if(crawledBytes.containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
            writer.print("Not-modified Bytes: "
                    + crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED)
                    + " (" + ArchiveUtils.formatBytesForDisplay(
                            crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED))
                    +  ") \n");
        }
    }

    /**
     * Ask the controller to write its processors report.
     *
     * @param writer Where to write.
     */
    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT,writer);
    }

    /**
     * Write the named report to <code>filename</code> inside the
     * controller's disk directory and, on success, add the file to the crawl
     * manifest.
     *
     * The writer is closed even if producing the report throws, and the
     * "wrote report" message is only logged when no <code>IOException</code>
     * occurred. An <code>IOException</code> is logged at SEVERE level and
     * not rethrown.
     *
     * @param reportName Name of report, as understood by
     * <code>writeReportTo</code>.
     * @param filename Name of the file to create.
     */
    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            try {
                writeReportTo(reportName, bw);
            } finally {
                // Always release the file handle, also on runtime failures.
                bw.close();
            }
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
            logger.info("wrote report: " + f.getAbsolutePath());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
    }

    /**
     * Ask the controller to write its manifest report.
     *
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * Dispatch to the writer method for the named report. An unknown name
     * writes nothing and is logged as a warning.
     *
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        } else {
            // Nothing is written; make the silent no-op visible in the log.
            logger.warning("Unknown report name, nothing written: "
                + reportName);
        }
    }

    /**
     * Write the Frontier's 'nonempty' report (if available)
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if(controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     *
     * The seeds report is written before the crawl report because the crawl
     * report prints the seed counters that the seeds report computes. The
     * source report is only written when source statistics were collected.
     */
    public void dumpReports() {
        // Add all files mentioned in the crawl order to the
        // manifest set.
        controller.addOrderToManifest();
        writeReportFile("hosts","hosts-report.txt");
        writeReportFile("mime types","mimetype-report.txt");
        writeReportFile("response codes","responsecode-report.txt");
        writeReportFile("seeds","seeds-report.txt");
        writeReportFile("crawl","crawl-report.txt");
        writeReportFile("processors","processors-report.txt");
        writeReportFile("manifest","crawl-manifest.txt");
        writeReportFile("frontier","frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source","source-report.txt");
        }
        // TODO: Save object to disk?
    }

    /**
     * Log a note that a checkpoint is being taken; nothing else is done here.
     *
     * @param cpDir checkpoint directory, used only in the logged note
     * @throws Exception declared by the signature; not thrown by this body
     */
    public void crawlCheckpoint(File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -