⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
                logger.severe(e.getMessage());                e.printStackTrace();            }        }    }    /**     * @param curi Current CrawlURI.  Used to get context.     * @return Socket timeout value.     */    private int getSoTimeout(CrawlURI curi) {        Integer res = null;        try {            res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);        } catch (Exception e) {            res = DEFAULT_SOTIMEOUT_MS;        }        return res.intValue();    }    /**     * @param curi Current CrawlURI.  Used to get context.     * @return Timeout value for total request.     */    private int getTimeout(CrawlURI curi) {        Integer res;        try {            res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);        } catch (Exception e) {            res = DEFAULT_TIMEOUT_SECONDS;        }        return res.intValue();    }    private int getMaxFetchRate(CrawlURI curi) {        Integer res;        try {            res = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);        }        catch (Exception e) {            res = DEFAULT_FETCH_BANDWIDTH_MAX;        }        return res.intValue();    }    private long getMaxLength(CrawlURI curi) {        Long res;        try {            res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);            if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {                res = DEFAULT_MAX_LENGTH_BYTES;            }        } catch (Exception e) {            res = DEFAULT_MAX_LENGTH_BYTES;        }        return res.longValue();    }    /**     * Load cookies from a file before the first fetch.     * <p>     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>     * Example entry of cookies.txt file:<br>     * <br>     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>     * <br>     * Each line has 7 tab-separated fields:<br>     * <li>1. DOMAIN: The domain that created and have access to the cookie     * value.     * <li>2. 
FLAG: A TRUE or FALSE value indicating if hosts within the given     * domain can access the cookie value.     * <li>3. PATH: The path within the domain that the cookie value is valid     * for.     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure     * connection to access the cookie value.     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)     * <li>6. NAME: The name of the cookie value     * <li>7. VALUE: The cookie value     *     * @param cookiesFile file in the Netscape's 'cookies.txt' format.     */    public void loadCookies(String cookiesFile) {        // Do nothing if cookiesFile is not specified.        if (cookiesFile == null || cookiesFile.length() <= 0) {            return;        }        RandomAccessFile raf = null;        try {            raf = new RandomAccessFile(cookiesFile, "r");            String[] cookieParts;            String line;            Cookie cookie = null;            while ((line = raf.readLine()) != null) {                // Line that starts with # is commented line, therefore skip it.                if (!line.startsWith("#")) {                    cookieParts = line.split("\\t");                    if (cookieParts.length == 7) {                        // Create cookie with not expiration date (-1 value).                        // TODO: add this as an option.                        
cookie =                            new Cookie(cookieParts[0], cookieParts[5],                                cookieParts[6], cookieParts[2], -1,                                Boolean.valueOf(cookieParts[3]).booleanValue());                        if (cookieParts[1].toLowerCase().equals("true")) {                            cookie.setDomainAttributeSpecified(true);                        } else {                            cookie.setDomainAttributeSpecified(false);                        }                        this.http.getState().addCookie(cookie);                        logger.fine(                            "Adding cookie: " + cookie.toExternalForm());                    }                }            }        } catch (FileNotFoundException e) {            // We should probably throw FatalConfigurationException.            System.out.println("Could not find file: " + cookiesFile                    + " (Element: " + ATTR_LOAD_COOKIES + ")");        } catch (IOException e) {            // We should probably throw FatalConfigurationException.            e.printStackTrace();        } finally {            try {                if (raf != null) {                    raf.close();                }            } catch (IOException e) {                e.printStackTrace();            }        }    }    /* (non-Javadoc)     * @see org.archive.crawler.framework.Processor#report()     */    public String report() {        StringBuffer ret = new StringBuffer();        ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");        ret.append("  Function:          Fetch HTTP URIs\n");        ret.append("  CrawlURIs handled: " + this.curisHandled + "\n");        ret.append("  Recovery retries:   " + this.recoveryRetries + "\n\n");        return ret.toString();    }    /**     * Load cookies from the file specified in the order file.     
*     * <p>     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>     * Example entry of cookies.txt file:<br>     * <br>     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>     * <br>     * Each line has 7 tab-separated fields:<br>     * <li>1. DOMAIN: The domain that created and have access to the cookie     * value.     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given     * domain can access the cookie value.     * <li>3. PATH: The path within the domain that the cookie value is valid     * for.     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure     * connection to access the cookie value.     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)     * <li>6. NAME: The name of the cookie value     * <li>7. VALUE: The cookie value     */    public void loadCookies() {        try {            loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));        } catch (MBeanException e) {            logger.warning(e.getLocalizedMessage());        } catch (ReflectionException e) {            logger.warning(e.getLocalizedMessage());        } catch (AttributeNotFoundException e) {            logger.warning(e.getLocalizedMessage());        }    }    /**     * Saves cookies to the file specified in the order file.     *     * Output file is in the Netscape 'cookies.txt' format.     *     */    public void saveCookies() {        try {            saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));        } catch (MBeanException e) {            logger.warning(e.getLocalizedMessage());        } catch (ReflectionException e) {            logger.warning(e.getLocalizedMessage());        } catch (AttributeNotFoundException e) {            logger.warning(e.getLocalizedMessage());        }    }    /**     * Saves cookies to a file.     *     * Output file is in the Netscape 'cookies.txt' format.     *     * @param saveCookiesFile output file.     
*/    public void saveCookies(String saveCookiesFile) {        // Do nothing if cookiesFile is not specified.        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {            return;        }        FileOutputStream out = null;        try {            out = new FileOutputStream(new File(saveCookiesFile));            @SuppressWarnings("unchecked")            Map<String,Cookie> cookies = http.getState().getCookiesMap();            String tab ="\t";            out.write("# Heritrix Cookie File\n".getBytes());            out.write(                "# This file is the Netscape cookies.txt format\n\n".getBytes());            for (Cookie cookie: cookies.values()) {                MutableString line =                    new MutableString(1024 * 2 /*Guess an initial size*/);                line.append(cookie.getDomain());                line.append(tab);                line.append(                    cookie.isDomainAttributeSpecified() == true                        ? "TRUE"                        : "FALSE");                line.append(tab);                line.append(cookie.getPath());                line.append(tab);                line.append(                    cookie.getSecure() == true ? "TRUE" : "FALSE");                line.append(tab);                line.append(cookie.getName());                line.append(tab);                line.append((null==cookie.getValue())?"":cookie.getValue());                line.append("\n");                out.write(line.toString().getBytes());            }        } catch (FileNotFoundException e) {            // We should probably throw FatalConfigurationException.            
System.out.println("Could not find file: " + saveCookiesFile                    + " (Element: " + ATTR_SAVE_COOKIES + ")");        } catch (IOException e) {            e.printStackTrace();        } finally {            try {                if (out != null) {                    out.close();                }            } catch (IOException e) {                e.printStackTrace();            }        }    }    /* (non-Javadoc)     * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)     */    protected void listUsedFiles(List<String> list) {        // List the cookies files        // Add seed file        try {            String tmp = (String)getAttribute(ATTR_LOAD_COOKIES);            if(tmp != null && tmp.length() > 0){                File file = getSettingsHandler().                        getPathRelativeToWorkingDirectory(tmp);                list.add(file.getAbsolutePath());            }            tmp = (String)getAttribute(ATTR_SAVE_COOKIES);            if(tmp != null && tmp.length() > 0){                File file = getSettingsHandler().                        
getPathRelativeToWorkingDirectory(tmp);                list.add(file.getAbsolutePath());            }        } catch (AttributeNotFoundException e) {            // TODO Auto-generated catch block            e.printStackTrace();        } catch (MBeanException e) {            // TODO Auto-generated catch block            e.printStackTrace();        } catch (ReflectionException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }    }        private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {        try {            StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);            if (!accept_headers.isEmpty()) {                for (ListIterator i = accept_headers.listIterator(); i.hasNext();) {                    String hdr = (String) i.next();                    String[] nvp = hdr.split(": +");                    if (nvp.length == 2) {                        get.setRequestHeader(nvp[0], nvp[1]);                    }                    else {                        logger.warning("Invalid accept header: " + hdr);                    }                }            }        }        catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());        }    }    // custom serialization    private void writeObject(ObjectOutputStream stream) throws IOException {        stream.defaultWriteObject();        // save cookies        @SuppressWarnings("un

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -