📄 fetchhttp.java
字号:
logger.severe(e.getMessage()); e.printStackTrace(); } } } /** * @param curi Current CrawlURI. Used to get context. * @return Socket timeout value. */ private int getSoTimeout(CrawlURI curi) { Integer res = null; try { res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi); } catch (Exception e) { res = DEFAULT_SOTIMEOUT_MS; } return res.intValue(); } /** * @param curi Current CrawlURI. Used to get context. * @return Timeout value for total request. */ private int getTimeout(CrawlURI curi) { Integer res; try { res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi); } catch (Exception e) { res = DEFAULT_TIMEOUT_SECONDS; } return res.intValue(); } private int getMaxFetchRate(CrawlURI curi) { Integer res; try { res = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi); } catch (Exception e) { res = DEFAULT_FETCH_BANDWIDTH_MAX; } return res.intValue(); } private long getMaxLength(CrawlURI curi) { Long res; try { res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi); if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) { res = DEFAULT_MAX_LENGTH_BYTES; } } catch (Exception e) { res = DEFAULT_MAX_LENGTH_BYTES; } return res.longValue(); } /** * Load cookies from a file before the first fetch. * <p> * The file is a text file in the Netscape's 'cookies.txt' file format.<br> * Example entry of cookies.txt file:<br> * <br> * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br> * <br> * Each line has 7 tab-separated fields:<br> * <li>1. DOMAIN: The domain that created and have access to the cookie * value. * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given * domain can access the cookie value. * <li>3. PATH: The path within the domain that the cookie value is valid * for. * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure * connection to access the cookie value. * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.) * <li>6. NAME: The name of the cookie value * <li>7. VALUE: The cookie value * * @param cookiesFile file in the Netscape's 'cookies.txt' format. */ public void loadCookies(String cookiesFile) { // Do nothing if cookiesFile is not specified. if (cookiesFile == null || cookiesFile.length() <= 0) { return; } RandomAccessFile raf = null; try { raf = new RandomAccessFile(cookiesFile, "r"); String[] cookieParts; String line; Cookie cookie = null; while ((line = raf.readLine()) != null) { // Line that starts with # is commented line, therefore skip it. if (!line.startsWith("#")) { cookieParts = line.split("\\t"); if (cookieParts.length == 7) { // Create cookie with not expiration date (-1 value). // TODO: add this as an option. cookie = new Cookie(cookieParts[0], cookieParts[5], cookieParts[6], cookieParts[2], -1, Boolean.valueOf(cookieParts[3]).booleanValue()); if (cookieParts[1].toLowerCase().equals("true")) { cookie.setDomainAttributeSpecified(true); } else { cookie.setDomainAttributeSpecified(false); } this.http.getState().addCookie(cookie); logger.fine( "Adding cookie: " + cookie.toExternalForm()); } } } } catch (FileNotFoundException e) { // We should probably throw FatalConfigurationException. System.out.println("Could not find file: " + cookiesFile + " (Element: " + ATTR_LOAD_COOKIES + ")"); } catch (IOException e) { // We should probably throw FatalConfigurationException. e.printStackTrace(); } finally { try { if (raf != null) { raf.close(); } } catch (IOException e) { e.printStackTrace(); } } } /* (non-Javadoc) * @see org.archive.crawler.framework.Processor#report() */ public String report() { StringBuffer ret = new StringBuffer(); ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n"); ret.append(" Function: Fetch HTTP URIs\n"); ret.append(" CrawlURIs handled: " + this.curisHandled + "\n"); ret.append(" Recovery retries: " + this.recoveryRetries + "\n\n"); return ret.toString(); } /** * Load cookies from the file specified in the order file. * * <p> * The file is a text file in the Netscape's 'cookies.txt' file format.<br> * Example entry of cookies.txt file:<br> * <br> * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br> * <br> * Each line has 7 tab-separated fields:<br> * <li>1. DOMAIN: The domain that created and have access to the cookie * value. * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given * domain can access the cookie value. * <li>3. PATH: The path within the domain that the cookie value is valid * for. * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure * connection to access the cookie value. * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.) * <li>6. NAME: The name of the cookie value * <li>7. VALUE: The cookie value */ public void loadCookies() { try { loadCookies((String) getAttribute(ATTR_LOAD_COOKIES)); } catch (MBeanException e) { logger.warning(e.getLocalizedMessage()); } catch (ReflectionException e) { logger.warning(e.getLocalizedMessage()); } catch (AttributeNotFoundException e) { logger.warning(e.getLocalizedMessage()); } } /** * Saves cookies to the file specified in the order file. * * Output file is in the Netscape 'cookies.txt' format. * */ public void saveCookies() { try { saveCookies((String) getAttribute(ATTR_SAVE_COOKIES)); } catch (MBeanException e) { logger.warning(e.getLocalizedMessage()); } catch (ReflectionException e) { logger.warning(e.getLocalizedMessage()); } catch (AttributeNotFoundException e) { logger.warning(e.getLocalizedMessage()); } } /** * Saves cookies to a file. * * Output file is in the Netscape 'cookies.txt' format. * * @param saveCookiesFile output file. */ public void saveCookies(String saveCookiesFile) { // Do nothing if cookiesFile is not specified. if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { return; } FileOutputStream out = null; try { out = new FileOutputStream(new File(saveCookiesFile)); @SuppressWarnings("unchecked") Map<String,Cookie> cookies = http.getState().getCookiesMap(); String tab ="\t"; out.write("# Heritrix Cookie File\n".getBytes()); out.write( "# This file is the Netscape cookies.txt format\n\n".getBytes()); for (Cookie cookie: cookies.values()) { MutableString line = new MutableString(1024 * 2 /*Guess an initial size*/); line.append(cookie.getDomain()); line.append(tab); line.append( cookie.isDomainAttributeSpecified() == true ? "TRUE" : "FALSE"); line.append(tab); line.append(cookie.getPath()); line.append(tab); line.append( cookie.getSecure() == true ? "TRUE" : "FALSE"); line.append(tab); line.append(cookie.getName()); line.append(tab); line.append((null==cookie.getValue())?"":cookie.getValue()); line.append("\n"); out.write(line.toString().getBytes()); } } catch (FileNotFoundException e) { // We should probably throw FatalConfigurationException. System.out.println("Could not find file: " + saveCookiesFile + " (Element: " + ATTR_SAVE_COOKIES + ")"); } catch (IOException e) { e.printStackTrace(); } finally { try { if (out != null) { out.close(); } } catch (IOException e) { e.printStackTrace(); } } } /* (non-Javadoc) * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List) */ protected void listUsedFiles(List<String> list) { // List the cookies files // Add seed file try { String tmp = (String)getAttribute(ATTR_LOAD_COOKIES); if(tmp != null && tmp.length() > 0){ File file = getSettingsHandler(). getPathRelativeToWorkingDirectory(tmp); list.add(file.getAbsolutePath()); } tmp = (String)getAttribute(ATTR_SAVE_COOKIES); if(tmp != null && tmp.length() > 0){ File file = getSettingsHandler(). getPathRelativeToWorkingDirectory(tmp); list.add(file.getAbsolutePath()); } } catch (AttributeNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (MBeanException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ReflectionException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private void setAcceptHeaders(CrawlURI curi, HttpMethod get) { try { StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi); if (!accept_headers.isEmpty()) { for (ListIterator i = accept_headers.listIterator(); i.hasNext();) { String hdr = (String) i.next(); String[] nvp = hdr.split(": +"); if (nvp.length == 2) { get.setRequestHeader(nvp[0], nvp[1]); } else { logger.warning("Invalid accept header: " + hdr); } } } } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } } // custom serialization private void writeObject(ObjectOutputStream stream) throws IOException { stream.defaultWriteObject(); // save cookies @SuppressWarnings("un
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -