⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 preconditionenforcer.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS preconditions OK
        return false;
    }

    /**
     * Get the operator-configured validity duration of a dns-record.
     *
     * <p>{@link #isIpExpired(CrawlURI)} uses this value as a minimum: a
     * larger record TTL takes precedence, zero means "never expire" and a
     * negative value is replaced by the default.
     *
     * @param curi the uri this time is valid for.
     * @return the configured duration in seconds, or the default duration
     * if the attribute cannot be found.
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer)getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }
        return d.longValue();
    }

    /**
     * Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if the ip has never been looked up or if the lookup is
     * older than the larger of the configured duration and the record's
     * TTL; false if the ip never expires or is still fresh.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Never expire ip if duration is zero (set by user or more
            // likely, set to zero in case where we tried in FetchDNS but
            // failed).
            return false;
        }

        // Catch old "default" -1 settings that are now problematic and
        // convert them to the new minimum. Zero was handled above, so only
        // negative values can reach this test.
        if (duration < 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.longValue();
        }

        // Use the larger of the operator-set minimum duration
        // or the DNS record TTL.
        duration = Math.max(duration, host.getIpTTL());

        // Duration and ttl are in seconds.  Convert to millis.
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /**
     * Get the maximum time a robots.txt is valid.
     *
     * @param curi the uri whose settings supply the validity duration.
     * @return the time a robots.txt is valid in milliseconds (the
     * configured value is in seconds and is converted here).
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            // This should never happen, but if it does, return default
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // convert from seconds to milliseconds
        return d.longValue() * 1000;
    }

    /**
     * Is the robots policy expired.
     *
     * This method will also return true if we haven't tried to get the
     * robots.txt for this server.
     *
     * @param curi the uri whose server's robots policy is checked.
     * @return true if the robots policy is expired.
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
            // Have not attempted to fetch robots
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
            // When zero, robots should be valid forever
            return false;
        }
        // Expired once the validity window, counted from the time of the
        // last fetch, lies in the past; otherwise robots is still valid.
        return robotsFetched + duration < System.currentTimeMillis();
    }

   /**
    * Consider credential preconditions.
    *
    * Looks to see if any credential preconditions (e.g. html form login
    * credentials) for this <code>CrawlServer</code>. If there are, have they
    * been run already? If not, make the running of these logins a precondition
    * of accessing any other url on this <code>CrawlServer</code>.
    *
    * <p>
    * One day, do optimization and avoid running the bulk of the code below.
    * Argument for running the code everytime is that overrides and refinements
    * may change what comes back from credential store.
    *
    * @param curi CrawlURI we're checking for any required preconditions.
    * @return True, if this <code>curi</code> has a precondition that needs to
    *         be met before we can proceed. False if we can proceed to process
    *         this url.
    */
    private boolean credentialPrecondition(final CrawlURI curi) {
        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return false;
        }

        Iterator i = cs.iterator(curi);
        if (i == null) {
            return false;
        }

        while (i.hasNext()) {
            Credential c = (Credential)i.next();

            if (c.isPrerequisite(curi)) {
                // This credential has a prereq. and this curi is it.  Let it
                // through.  Add its avatar to the curi as a mark.  Also, does
                // this curi need to be posted?  Note, we do this test for
                // is it a prereq BEFORE we do the check that curi is of the
                // credential domain because such as yahoo have you go to
                // another domain altogether to login.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                return false;
            }

            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (authenticated(c, curi)) {
                continue;
            }

            // Hasn't been authenticated.  Queue it and move on (Assumption
            // is that we can do one authentication at a time -- usually one
            // html form).
            String prereq = c.getPrerequisite(curi);
            if (prereq == null || prereq.length() == 0) {
                // Misconfigured credential: report it and keep looking at
                // any remaining credentials.
                CrawlServer server =
                    getController().getServerCache().getServerFor(curi);
                logger.severe(server.getName() + " has "
                    + " credential(s) of type " + c + " but prereq"
                    + " is null.");
                continue;
            }

            try {
                curi.markPrerequisite(prereq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                logger.severe("unable to set credentials prerequisite "+prereq);
                getController().logUriError(e,curi.getUURI(),prereq);
                return false;
            }
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Queueing prereq " + prereq + " of type " +
                    c + " for " + curi);
            }
            return true;
        }
        return false;
    }

    /**
     * Has passed credential already been authenticated.
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run, i.e. the server of <code>curi</code>
     * holds a credential avatar matching the credential's class and key.
     * False if there are no avatars, none matches, or the credential's key
     * cannot be obtained.
     */
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return false;
        }
        Set avatars = server.getCredentialAvatars();

        // The key depends only on the credential and the curi, so resolve
        // it once rather than once (with one log entry) per avatar.
        String key;
        try {
            key = credential.getKey(curi);
        } catch (AttributeNotFoundException e) {
            logger.severe("Failed getting key for " + credential +
                " for " + curi);
            return false;
        }

        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar)i.next();
            if (ca.match(credential.getClass(), key)) {
                // One matching avatar is enough.
                return true;
            }
        }
        return false;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -