📄 adaptiverevisitfrontier.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
                    curi.getUURI().toString(), array);            }            // once logged, discard            curi.remove(A_LOCALIZED_ERRORS);        }    }        /**     * The CrawlURI has been successfully crawled.      *     * @param curi The CrawlURI     */    protected void successDisposition(CrawlURI curi) {        curi.aboutToLog();        long waitInterval = 0;                if(curi.containsKey(A_WAIT_INTERVAL)){            waitInterval = curi.getLong(A_WAIT_INTERVAL);            curi.addAnnotation("wt:" +                     ArchiveUtils.formatMillisecondsToConventional(                            waitInterval));        } else {            logger.severe("Missing wait interval for " + curi.toString() +                    " WaitEvaluator may be missing.");        }        if(curi.containsKey(A_NUMBER_OF_VISITS)){            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");        }        if(curi.containsKey(A_NUMBER_OF_VERSIONS)){            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");        }        if(curi.containsKey(A_FETCH_OVERDUE)){            curi.addAnnotation("ov:" +                    ArchiveUtils.formatMillisecondsToConventional(                    (curi.getLong(A_FETCH_OVERDUE))));        }                Object array[] = { curi };        controller.uriProcessing.log(            Level.INFO,            curi.getUURI().toString(),            array);        succeededFetchCount++;        totalProcessedBytes += curi.getContentSize();        // Let everyone know in case they want to do something before we strip        // the curi.        
controller.fireCrawledURISuccessfulEvent(curi);                curi.setSchedulingDirective(CandidateURI.NORMAL);        // Set time of next processing        curi.putLong(A_TIME_OF_NEXT_PROCESSING,                System.currentTimeMillis()+waitInterval);                        /* Update HQ */        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());                // Wake up time is based on the time when a fetch was completed + the        // calculated snooze time for politeness. If the fetch completion time        // is missing, we'll use current time.        long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?                curi.getLong(A_FETCH_COMPLETED_TIME):                    (new Date()).getTime()) + calculateSnoozeTime(curi);                // Ready the URI for reserialization.        curi.processingCleanup();         curi.resetDeferrals();           curi.resetFetchAttempts();        try {            hq.update(curi, true, wakeupTime);        } catch (IOException e) {            logger.severe("An IOException occured when updating " +                     curi.toString() + "\n" + e.getMessage());            e.printStackTrace();        }    }    /**     * Put near top of relevant hostQueue (but behind anything recently     * scheduled 'high')..     *     * @param curi CrawlURI to reschedule. Its time of next processing is not     *             modified.     * @param errorWait signals if there should be a wait before retrying.     
* @throws AttributeNotFoundException     */    protected void reschedule(CrawlURI curi, boolean errorWait)            throws AttributeNotFoundException {        long delay = 0;        if(errorWait){            if(curi.containsKey(A_RETRY_DELAY)) {                delay = curi.getLong(A_RETRY_DELAY);            } else {                // use ARFrontier default                delay = ((Long)getAttribute(ATTR_RETRY_DELAY,curi)).longValue();            }        }                long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?                curi.getLong(A_FETCH_COMPLETED_TIME):                    (new Date()).getTime()) + delay;                AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());        // Ready the URI for reserialization.        curi.processingCleanup();         if(errorWait){            curi.resetDeferrals(); //Defferals only refer to immediate retries.        }        try {            hq.update(curi, errorWait, retryTime);        } catch (IOException e) {            // TODO Handle IOException            e.printStackTrace();        }    }    /**     * The CrawlURI has encountered a problem, and will not     * be retried.     *     * @param curi The CrawlURI     */    protected void failureDisposition(CrawlURI curi) {        //Let interested listeners know of failed disposition.        this.controller.fireCrawledURIFailureEvent(curi);        // send to basic log        curi.aboutToLog();        Object array[] = { curi };        this.controller.uriProcessing.log(            Level.INFO,            curi.getUURI().toString(),            array);        // if exception, also send to crawlErrors        if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {            this.controller.runtimeErrors.log(                Level.WARNING,                curi.getUURI().toString(),                array);        }        failedFetchCount++;                // Put the failed URI at the very back of the queue.        
curi.setSchedulingDirective(CandidateURI.NORMAL);        // TODO: reconsider this        curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE);        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());        // Ready the URI for serialization.        curi.processingCleanup();        curi.resetDeferrals();        curi.resetFetchAttempts();        try {            // No wait on failure. No contact was made with the server.            boolean shouldForget = shouldBeForgotten(curi);            if(shouldForget && alreadyIncluded != null){                alreadyIncluded.forget(canonicalize(curi.getUURI()),curi);            }            hq.update(curi,false, 0, shouldForget);         } catch (IOException e) {            // TODO Handle IOException            e.printStackTrace();        }    }    protected void disregardDisposition(CrawlURI curi) {        //Let interested listeners know of disregard disposition.        controller.fireCrawledURIDisregardEvent(curi);        // send to basic log        curi.aboutToLog();        Object array[] = { curi };        controller.uriProcessing.log(            Level.INFO,            curi.getUURI().toString(),            array);        disregardedUriCount++;                // Todo: consider timout before retrying disregarded elements.        //       Possibly add a setting to the WaitEvaluators?        curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE);         curi.setSchedulingDirective(CandidateURI.NORMAL);        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());        // Ready the URI for reserialization.        curi.processingCleanup();         curi.resetDeferrals();        curi.resetFetchAttempts();        try {            // No politness wait on disregard. 
No contact was made with server            hq.update(curi, false, 0, shouldBeForgotten(curi));        } catch (IOException e) {            // TODO Handle IOException            e.printStackTrace();        }    }    /**     * Some URIs, if they recur,  deserve another     * chance at consideration: they might not be too     * many hops away via another path, or the scope     * may have been updated to allow them passage.     *     * @param curi     * @return True if curi should be forgotten.     */    protected boolean shouldBeForgotten(CrawlURI curi) {        switch(curi.getFetchStatus()) {            case S_OUT_OF_SCOPE:            case S_TOO_MANY_EMBED_HOPS:            case S_TOO_MANY_LINK_HOPS:                return true;            default:                return false;        }    }    /**     * Checks if a recently completed CrawlURI that did not finish successfully     * needs to be retried immediately (processed again as soon as politeness     * allows.)     *     * @param curi The CrawlURI to check     * @return True if we need to retry promptly.     * @throws AttributeNotFoundException If problems occur trying to read the     *            maximum number of retries from the settings framework.     */    protected boolean needsPromptRetry(CrawlURI curi)            throws AttributeNotFoundException {        if (curi.getFetchAttempts() >=                ((Integer)getAttribute(ATTR_MAX_RETRIES, curi)).intValue() ) {            return false;        }        switch (curi.getFetchStatus()) {            case S_DEFERRED:                return true;            case HttpStatus.SC_UNAUTHORIZED:                // We can get here though usually a positive status code is                // a success.  We get here if there is rfc2617 credential data                // loaded and we're supposed to go around again.  
See if any                // rfc2617 credential present and if there, assume it got                // loaded in FetchHTTP on expectation that we're to go around                // again.  If no rfc2617 loaded, we should not be here.                boolean loaded = curi.hasRfc2617CredentialAvatar();                if (!loaded) {                    logger.severe("Have 401 but no creds loaded " + curi);                }                return loaded;            default:                return false;        }    }    /**     * Checks if a recently completed CrawlURI that did not finish successfully     * needs to be retried (processed again after some time elapses)     *     * @param curi The CrawlURI to check     * @return True if we need to retry.     * @throws AttributeNotFoundException If problems occur trying to read the     *            maximum number of retries from the settings framework.     */    protected boolean needsRetrying(CrawlURI curi)            throws AttributeNotFoundException {        // Check to see if maximum number of retries has been exceeded.        if (curi.getFetchAttempts() >=             ((Integer)getAttribute(ATTR_MAX_RETRIES,curi)).intValue() ) {            return false;        } else {            // Check if FetchStatus indicates that a delayed retry is needed.            switch (curi.getFetchStatus()) {                case S_CONNECT_FAILED:                case S_CONNECT_LOST:                case S_DOMAIN_UNRESOLVABLE:                    // these are all worth a retry                    // TODO: consider if any others (S_TIMEOUT in some cases?)                     
//       deserve retry                    return true;                default:                    return false;            }        }    }        protected boolean isDisregarded(CrawlURI curi) {        switch (curi.getFetchStatus()) {            case S_ROBOTS_PRECLUDED :     // they don't want us to have it            case S_OUT_OF_SCOPE :         // filtered out by scope            case S_BLOCKED_BY_CUSTOM_PROCESSOR:            case S_BLOCKED_BY_USER :      // filtered out by user            case S_TOO_MANY_EMBED_HOPS :  // too far from last true link            case S_TOO_MANY_LINK_HOPS :   // too far from seeds            case S_DELETED_BY_USER :      // user deleted                return true;            default:                return false;        }    }        /**     * Calculates how long a host queue needs to be snoozed following the     * crawling of a URI.     *     * @param curi The CrawlURI     * @return How long to snooze.     */    protected long calculateSnoozeTime(CrawlURI curi) {        long durationToWait = 0;        if (curi.containsKey(A_FETCH_BEGAN_TIME)            && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -