DownloadParameters.java
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

/**
 * Download parameters.  These parameters are limits on
 * how Page can download a Link.  A Crawler has a
 * default set of download parameters, but the defaults
 * can be overridden on individual links by calling
 * Link.setDownloadParameters().
 * <P>
 * DownloadParameters is an immutable class (like String).
 * "Changing" a parameter actually returns a new instance
 * of the class with only the specified parameter changed.
 */
public class DownloadParameters implements Cloneable
//#ifdef JDK1.1
    , java.io.Serializable
//#endif JDK1.1
{
    private int maxThreads = 4;
        // number of background threads used by the crawler
    private int maxPageSize = 100;
        // maximum page size in kilobytes (-1 for no maximum)
    private int downloadTimeout = 60;
        // timeout for a single page, in seconds (-1 for no timeout)
    private int crawlTimeout = -1;
        // timeout for entire crawl in seconds (-1 for no timeout)
    private boolean obeyRobotExclusion = false;
        // obey crawling rules in robots.txt
    private int maxRequestsPerServer = 2;
        // maximum number of simultaneous requests to a server (-1 for no maximum)
    private int delay = 500;
        // delay (in milliseconds) between starts of requests to same server (0 for no delay)
    private boolean interactive = true;
        // user is available to answer dialog boxes, e.g. for authentication
    private boolean useCaches = true;
        // use cached pages to satisfy requests wherever possible
    private String acceptedMIMETypes = null;
        // Accept header for HTTP request, or null to use default
    private String userAgent = null;
        // User-Agent header for HTTP request, or null to use default

    /**
     * Make a DownloadParameters object with default settings.
     */
    public DownloadParameters () {
    }

    /**
     * Clone a DownloadParameters object.
     */
    public Object clone () {
        try {
            return super.clone ();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException ("Internal error: " + e);
        }
    }

    /**
     * Get maximum threads.
     * @return maximum number of background threads used by crawler.
     * Default is 4.
     */
    public int getMaxThreads () {
        return maxThreads;
    }

    /**
     * Set maximum threads.
     * @param maxthreads maximum number of background threads used by crawler
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxThreads (int maxthreads) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.maxThreads = maxthreads;
        return dp;
    }

    /**
     * Get maximum page size.  Pages larger than this limit are neither
     * downloaded nor parsed.  Default value is 100 (KB).
     * @return maximum page size in kilobytes
     */
    public int getMaxPageSize () {
        return maxPageSize;
    }

    /**
     * Change maximum page size.  Pages larger than this limit are treated as
     * leaves in the crawl graph -- neither downloaded nor parsed.
     * @param maxPageSize maximum page size in kilobytes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxPageSize (int maxPageSize) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.maxPageSize = maxPageSize;
        return dp;
    }

    /**
     * Get download timeout value.
     * @return length of time (in seconds) that crawler will wait for a page
     * to download before aborting it.  Default is 60 seconds.
     */
    public int getDownloadTimeout () {
        return downloadTimeout;
    }

    /**
     * Change download timeout value.
     * @param timeout length of time (in seconds) to wait for a page to download.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeDownloadTimeout (int timeout) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.downloadTimeout = timeout;
        return dp;
    }

    /**
     * Get timeout on entire crawl.
     * @return maximum length of time (in seconds) that crawler will run
     * before aborting.  Default is -1 (no limit).
     */
    public int getCrawlTimeout () {
        return crawlTimeout;
    }

    /**
     * Change crawl timeout value.
     * @param timeout maximum length of time (in seconds) that crawler will run.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeCrawlTimeout (int timeout) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.crawlTimeout = timeout;
        return dp;
    }

    /**
     * Get obey-robot-exclusion flag.
     * @return true iff the crawler checks robots.txt on the remote Web site
     * before downloading a page.  Default is false.
     */
    public boolean getObeyRobotExclusion () {
        return obeyRobotExclusion;
    }

    /**
     * Change obey-robot-exclusion flag.
     * @param f If true, then the crawler checks robots.txt on the remote
     * Web site before downloading a page.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeObeyRobotExclusion (boolean f) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.obeyRobotExclusion = f;
        return dp;
    }

    /**
     * Get interactive flag.
     * @return true if a user is available to respond to dialog boxes
     * (for instance, to enter passwords for authentication).
     * Default is true.
     */
    public boolean getInteractive () {
        return interactive;
    }

    /**
     * Change interactive flag.
     * @param f true if a user is available to respond to dialog boxes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeInteractive (boolean f) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.interactive = f;
        return dp;
    }

    /**
     * Get use-caches flag.
     * @return true if cached pages should be used whenever possible
     */
    public boolean getUseCaches () {
        return useCaches;
    }

    /**
     * Change use-caches flag.
     * @param f true if cached pages should be used whenever possible
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUseCaches (boolean f) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.useCaches = f;
        return dp;
    }

    /**
     * Get accepted MIME types.
     * @return list of MIME types that can be handled by the crawler
     * (which are passed as the Accept header in the HTTP request).
     * Default is null.
     */
    public String getAcceptedMIMETypes () {
        return acceptedMIMETypes;
    }

    /**
     * Change accepted MIME types.
     * @param types list of MIME types that can be handled by the crawler.
     * Use null if the crawler can handle anything.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeAcceptedMIMETypes (String types) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.acceptedMIMETypes = types;
        return dp;
    }

    /**
     * Get User-Agent header used in HTTP requests.
     * @return user-agent field used in HTTP requests, or null if the
     * Java library's default user-agent is used.  Default value is null
     * (but for a Crawler, the default DownloadParameters has the Crawler's
     * name as its default user-agent).
     */
    public String getUserAgent () {
        return userAgent;
    }

    /**
     * Change User-Agent field used in HTTP requests.
     * @param userAgent user-agent field used in HTTP requests.
     * Pass null to use the Java library's default user-agent field.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUserAgent (String userAgent) {
        DownloadParameters dp = (DownloadParameters) clone ();
        dp.userAgent = userAgent;
        return dp;
    }
}
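
Because DownloadParameters is immutable, each change* method clones the object and alters a single field, so building a configuration is a chain of calls and the original defaults are never modified. A minimal usage sketch follows; the class name PoliteCrawlerSetup, the "MyCrawler/1.0" User-Agent string, and the specific parameter values are illustrative choices, not part of the library.

import websphinx.DownloadParameters;

public class PoliteCrawlerSetup {
    public static void main (String[] args) {
        DownloadParameters defaults = new DownloadParameters ();

        // Each change* call returns a fresh copy with one field changed,
        // so the chain builds up a configuration step by step.
        DownloadParameters polite = defaults
            .changeObeyRobotExclusion (true)    // check robots.txt before downloading
            .changeMaxThreads (2)               // fewer background threads
            .changeDownloadTimeout (30)         // give up on a slow page after 30 s
            .changeUserAgent ("MyCrawler/1.0"); // illustrative User-Agent value

        // The starting object is untouched: "changing" produced copies.
        System.out.println (defaults.getObeyRobotExclusion ()); // false
        System.out.println (polite.getObeyRobotExclusion ());   // true

        // Per the class Javadoc above, the result can then be applied to an
        // individual link via Link.setDownloadParameters(polite).
    }
}

The clone-and-mutate pattern is a lightweight way to get value semantics in pre-generics Java: because no method mutates this, a single DownloadParameters instance can be shared freely, including across the crawler's background threads, without synchronization.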