frontier.java
     * trying to fetch, most likely due to operator changing scope definition).
     *
     * @return Number of finished URIs.
     */
    public long finishedUriCount();

    /**
     * Number of <i>successfully</i> processed URIs.
     *
     * <p>Any URI that was processed successfully. This includes URIs that
     * returned 404s and other error codes that do not originate within the
     * crawler.
     *
     * @return Number of <i>successfully</i> processed URIs.
     */
    public long succeededFetchCount();

    /**
     * Number of URIs that <i>failed</i> to process.
     *
     * <p>URIs that could not be processed because of some error or failure in
     * the processing chain. Can include failure to acquire prerequisites, to
     * establish a connection with the host and any number of other problems.
     * Does not count those that will be retried, only those that have
     * permanently failed.
     *
     * @return Number of URIs that failed to process.
     */
    public long failedFetchCount();

    /**
     * Number of URIs that were scheduled at one point but have been
     * <i>disregarded</i>.
     *
     * <p>Counts any URI that is scheduled only to be disregarded because it
     * is determined to lie outside the scope of the crawl. Most commonly
     * this will be due to robots.txt exclusions.
     *
     * @return The number of URIs that have been disregarded.
     */
    public long disregardedUriCount();

    /**
     * Total number of bytes contained in all URIs that have been processed.
     *
     * @return The total amount of bytes in all processed URIs.
     */
    public long totalBytesWritten();

    /**
     * Recover earlier state by reading a recovery log.
     *
     * <p>Some Frontiers are able to write detailed logs that can be loaded
     * after a system crash to recover the state of the Frontier prior to the
     * crash. This method is the one used to achieve this.
     *
     * @param pathToLog The name (with full path) of the recovery log.
     * @param retainFailures If true, failures in the log should count as
     *            having been included. (If false, failures will be ignored,
     *            meaning the corresponding URIs will be retried in the
     *            recovered crawl.)
     * @throws IOException If problems occur reading the recovery log.
     */
    public void importRecoverLog(String pathToLog, boolean retainFailures)
        throws IOException;

    /**
     * Get a <code>URIFrontierMarker</code> initialized with the given
     * regular expression at the 'start' of the Frontier.
     *
     * @param regexpr The regular expression that URIs within the frontier
     *            must match to be considered within the scope of this marker
     * @param inCacheOnly If set to true, only those URIs within the frontier
     *            that are stored in cache (usually this means in memory
     *            rather than on disk, but that is an implementation detail)
     *            will be considered. Others will be entirely ignored, as if
     *            they don't exist. This is useful for quick peeks at the top
     *            of the URI list.
     * @return A URIFrontierMarker that is set for the 'start' of the
     *         frontier's URI list.
     */
    public FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly);

    /**
     * Returns a list of all uncrawled URIs starting from a specified marker
     * until <code>numberOfMatches</code> is reached.
     *
     * <p>Any encountered URI that has not been successfully crawled,
     * terminally failed, disregarded or is currently being processed is
     * included. As there may be duplicates in the frontier, there may also
     * be duplicates in the report. Thus this includes both discovered and
     * pending URIs.
     *
     * <p>The list is a set of strings containing the URI strings. If verbose
     * is true the string will include some additional information (path to
     * URI and parent).
     *
     * <p>The <code>URIFrontierMarker</code> will be advanced to the position
     * at which its maximum number of matches found is reached. Reusing it
     * for subsequent calls will thus effectively get the 'next' batch.
     * Making any changes to the frontier can invalidate the marker.
     *
     * <p>While the order returned is consistent, it does <i>not</i> have any
     * explicit relation to the likely order in which they may be processed.
     *
     * <p><b>Warning:</b> It is unsafe to make changes to the frontier while
     * this method is executing. The crawler should be in a paused state
     * before invoking it.
     *
     * @param marker A marker specifying from what position in the Frontier
     *            the list should begin.
     * @param numberOfMatches how many URIs to add at most to the list before
     *            returning it
     * @param verbose if set to true the strings returned will contain
     *            additional information about each URI beyond their names.
     * @return a list of all pending URIs falling within the specification of
     *         the marker
     * @throws InvalidFrontierMarkerException when the
     *            <code>URIFrontierMarker</code> does not match the internal
     *            state of the frontier. Tolerance for this can vary
     *            considerably from one URIFrontier implementation to the
     *            next.
     * @see FrontierMarker
     * @see #getInitialMarker(String, boolean)
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
        boolean verbose) throws InvalidFrontierMarkerException;

    /**
     * Delete any URI that matches the given regular expression from the list
     * of discovered and pending URIs. This does not prevent them from being
     * rediscovered.
     *
     * <p>Any encountered URI that has not been successfully crawled,
     * terminally failed, disregarded or is currently being processed is
     * considered to be a pending URI.
     *
     * <p><b>Warning:</b> It is unsafe to make changes to the frontier while
     * this method is executing. The crawler should be in a paused state
     * before invoking it.
     *
     * @param match A regular expression; any URIs that match it will be
     *            deleted.
     * @return The number of URIs deleted
     */
    public long deleteURIs(String match);

    /**
     * Notify Frontier that a CrawlURI has been deleted outside of the
     * normal next()/finished() lifecycle.
     *
     * @param curi Deleted CrawlURI.
     */
    public void deleted(CrawlURI curi);

    /**
     * Notify Frontier that it should consider the given UURI as if
     * already scheduled.
     *
     * @param u UURI instance to add to the Already Included set.
     */
    public void considerIncluded(UURI u);

    /**
     * Notify Frontier that it should consider updating configuration
     * info that may have changed in external files.
     */
    public void kickUpdate();

    /**
     * Notify Frontier that it should not release any URIs, instead
     * holding all threads, until instructed otherwise.
     */
    public void pause();

    /**
     * Resumes the release of URIs to crawl, allowing worker
     * ToeThreads to proceed.
     */
    public void unpause();

    /**
     * Notify Frontier that it should end the crawl, giving
     * any worker ToeThread that asks for a next() an
     * EndedException.
     */
    public void terminate();

    /**
     * @return Return the instance of {@link FrontierJournal} that
     *         this Frontier is using. May be null if no journaling.
     */
    public FrontierJournal getFrontierJournal();

    /**
     * @param cauri CandidateURI for which we're to calculate and set the
     *            class key.
     * @return Classkey for <code>cauri</code>.
     */
    public String getClassKey(CandidateURI cauri);

    /**
     * Request that the Frontier load (or reload) crawl seeds,
     * typically by contacting the Scope.
     */
    public void loadSeeds();

    /**
     * Request that Frontier allow crawling to begin.
     * Usually just unpauses Frontier, if paused.
     */
    public void start();

    /**
     * Get the 'frontier group' (usually queue) for the given CrawlURI.
     *
     * @param curi CrawlURI to find matching group
     * @return FrontierGroup for the CrawlURI
     */
    public FrontierGroup getGroup(CrawlURI curi);

    /**
     * Generic interface representing the internal groupings of a Frontier's
     * URIs -- usually queues. Currently only offers the HasCrawlSubstats
     * interface.
     */
    public interface FrontierGroup extends CrawlSubstats.HasCrawlSubstats {
    }
}
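
To make the marker-based listing and the pause/unpause contract above concrete, here is a minimal usage sketch. It is not part of the Heritrix source: the class and method names are invented, and it assumes it is compiled in the same package as Frontier, FrontierMarker and InvalidFrontierMarkerException so that no Heritrix imports are needed. It pauses the crawl first (as the getURIsList() warning requires), pages through pending URIs by reusing one marker, prints the progress counters, and then unpauses.

import java.util.ArrayList;

/**
 * Usage sketch (not part of Heritrix): pages through the frontier's pending
 * URIs with the marker API declared above. Class and method names are
 * invented for illustration.
 */
public class FrontierPendingUriDump {

    /**
     * Print up to <code>limit</code> pending URIs whose string form matches
     * <code>regexpr</code>, followed by a one-line progress summary.
     */
    public static void dumpPendingUris(Frontier frontier, String regexpr,
            int limit) throws InvalidFrontierMarkerException {
        // getURIsList() is unsafe while the frontier is changing, so hold
        // the worker ToeThreads first.
        frontier.pause();
        try {
            // Marker positioned at the 'start' of the frontier's URI list;
            // false = consider all URIs, not only those held in cache.
            FrontierMarker marker = frontier.getInitialMarker(regexpr, false);
            int printed = 0;
            while (printed < limit) {
                // Reusing the same marker advances it, so each call returns
                // the next batch of matches.
                ArrayList batch = frontier.getURIsList(marker,
                    Math.min(1000, limit - printed), false);
                if (batch.isEmpty()) {
                    break; // no more pending URIs match the expression
                }
                for (Object uri : batch) {
                    System.out.println(uri);
                    printed++;
                }
            }
            // The counters declared above give a quick progress summary.
            System.out.println("finished=" + frontier.finishedUriCount()
                + " succeeded=" + frontier.succeededFetchCount()
                + " failed=" + frontier.failedFetchCount()
                + " disregarded=" + frontier.disregardedUriCount());
        } finally {
            // Let worker ToeThreads resume fetching.
            frontier.unpause();
        }
    }
}

Under the same paused state, deleteURIs(regexpr) could be called at the same point to drop the matched pending URIs instead of listing them.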