⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httptool.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package net.matuschek.http;

/*************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.Properties;

import net.matuschek.http.connection.HttpConnection;
import net.matuschek.http.connection.HttpsHelper;
import net.matuschek.http.cookie.Cookie;
import net.matuschek.http.cookie.CookieException;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.http.cookie.MemoryCookieManager;
import net.matuschek.util.Base64;
import net.matuschek.util.ByteBuffer;
import net.matuschek.util.ChunkedInputStream;
import net.matuschek.util.LimitedBandwidthStream;

import org.apache.log4j.Category;

/**
 * Class for retrieving documents from HTTP servers.
 *
 * <p>The main purpose of this class is to retrieve a document
 * from an HTTP server. </p>
 *
 * <p>For many purposes the Java URLInputStream is good for this,
 * but if you want to have full control over the HTTP headers 
 * (both request and response headers), HttpTool is the answer. </p>
 *
 * <p>Also it defines a callback interface to inform a client about
 * the state of the current download operation. </p>
 * 
 * <p>It is possible to abort a download after getting the 
 * HTTP response headers from the server (e.g. if a document of
 * this Content-Type is useless for your application or the document
 * is too big or whatever you like) </p>
 * 
 * <p>HttpTool is reusable. You should initialize it once and use
 * it for every download operation.</p>
 * 
 * @author Daniel Matuschek 
 * @version $Id: HttpTool.java,v 1.28 2004/03/26 20:28:44 matuschd Exp $
 */
public class HttpTool {

  /** Carriage return (byte value 13) */
  final static byte CR = 13;

  /** Line feed (byte value 10) */
  final static byte LF = 10;

  /** HTTP version string used by this class */
  final static String HTTP_VERSION="HTTP/1.1";

  /* Status constants */ 

  /** HTTP connection will be established */
  public final static int STATUS_CONNECTING=0;
  /** HTTP connection was established, but no data was retrieved yet */
  public final static int STATUS_CONNECTED=1;
  /** data will be retrieved now */
  public final static int STATUS_RETRIEVING=2;
  /** download finished */
  public final static int STATUS_DONE=3;
  /** download could not be finished because a DownloadRule denied it */
  public final static int STATUS_DENIEDBYRULE=4;

  /** default HTTP port */
  private final static int DEFAULT_HTTPPORT = 80;

  /** default HTTPS port */
  private final static int DEFAULT_HTTPSPORT = 443;

  /** default agent name; initial value of the agentName field */
  private final static String AGENTNAME = 
    "JoBo/1.4beta "
    +"(http://www.matuschek.net/jobo.html)";

  /** 
   * default update interval for calls of the callback interfaces 
   * (in bytes); initial value of the updateInterval field
   */
  private final static int DEFAULT_UPDATEINTERVAL =1024;

  /** default socket timeout in seconds; initial value of socketTimeout */
  private final static int DEFAULT_SOCKETTIMEOUT=20;

  /** Value of the HTTP User-Agent header (defaults to AGENTNAME) */
  private String agentName = AGENTNAME;
  
  /** Value of the HTTP Referer header (null if not set) */
  private String referer = null;

  /** Value of the HTTP From header (null if not set) */
  private String fromAddress = null;

  /** Date of the HTTP If-Modified-Since header (null if not set) */
  private Date modifyDate = null;
  
  /** 
   * maximal used bandwidth in bytes per second 
   * 0 disables bandwidth limitations
   */
  private int bandwidth = 0;

  /** proxy address (set to null by setProxy when no proxy is configured) */
  private InetAddress proxyAddr = null;

  /** proxy port number (set to 0 by setProxy when no proxy is configured) */
  private int proxyPort = 0;

  /** textual description of the proxy (format host:port) */
  private String proxyDescr="";

  /** timeout for getting data in seconds */
  private int socketTimeout = DEFAULT_SOCKETTIMEOUT;

  /** HttpTool should accept and use cookies */
  private boolean cookiesEnabled = true;

  /** Log4J Category object for logging; assigned in the constructor */
  private Category log = null;

  /** Authentication infos */
  private Properties userInfos = new Properties();
  
  /** @link dependency */
  /*#HttpDoc lnkHttpDoc;*/


  /**
   * defines after how many bytes read from the web 
   * server the Callback interface will be called 
   * (default updates after one kilobyte)
   */
  private int updateInterval = DEFAULT_UPDATEINTERVAL;
  
  /**
   * callback interface that will be used after n bytes are
   * read from the web server to update the state of the current
   * retrieve operation to the application
   */
  private HttpToolCallback callback=null;

  /**
   * DownloadRuleSet tells the HttpTool whether it should download
   * the whole file after getting the headers
   */
  private DownloadRuleSet downloadRules = null;

  /**
   * The cookie manager will be used to store cookies.
   * The constructor installs a MemoryCookieManager; setCookieManager
   * can replace it.
   */
  private CookieManager cookieManager = null;

  /**
   * The DateFormat instance will be used to format If-Modified-Since requests.
   * It is created once in the static initializer of this class.
   * NOTE(review): SimpleDateFormat is not synchronized and this instance is
   * shared by all HttpTool objects - confirm that callers serialize access.
   */
  static SimpleDateFormat df;

  /**
   * NTLM authorization data; retrieveDocument clones this object when it
   * answers an NTLM/Negotiate challenge. Initially null - the code that
   * assigns it is not visible in this part of the file.
   */
  private NTLMAuthorization ntlmAuthorization = null;
    
  /*
   * Initialize df to a formatter for timezone "GMT" and locale Locale.US.
   * If-Modified-Since requests need to be in that format.
   *
   * The time zone is set on the formatter itself. Temporarily replacing the
   * JVM-wide default via TimeZone.setDefault would be visible to every other
   * thread while class initialization runs and would leave the default
   * stuck on GMT if constructing the formatter failed.
   */
  static {
    df = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.US);
    df.setTimeZone(TimeZone.getTimeZone("GMT"));
  }


  /**
   * Creates an HttpTool that logs through a Log4J category named after
   * the runtime class and stores cookies in a fresh, empty
   * MemoryCookieManager.
   */
  public HttpTool() {
    log = Category.getInstance(getClass().getName());
    cookieManager = new MemoryCookieManager();
  }


  /**
   * Stores the value to use for the HTTP Referer header.
   * @param referer the Referer header value
   */
  public void setReferer(final String referer) {
    this.referer = referer;
  }
  
  /**
   * Stores the value to use for the HTTP User-Agent header.
   * @param name the user agent name (may contain spaces)
   */
  public void setAgentName(final String name) {
    agentName = name;
  }
  
  /**
   * Returns the value currently configured for the HTTP User-Agent header.
   * @return the user agent name
   */
  public String getAgentName() {
    return this.agentName;
  }

  /**
   * <b>Insiders BugFix</b>
   * Calls <code>finish()</code> on the cookie manager, if one is set.
   * Does nothing when no cookie manager is configured.
   */
  public void finish() {
    if (cookieManager == null) {
      return;
    }
    cookieManager.finish();
  }

  /**
   * Stores the DownloadRuleSet for this object. <br />
   * A download rule uses the HTTP return headers to decide if the
   * download should be finished.
   * @param rules the DownloadRuleSet to use
   */
  public void setDownloadRuleSet(final DownloadRuleSet rules) {
    downloadRules = rules;
  }


  /**
   * Returns the DownloadRuleSet currently stored in this object.
   * @return the DownloadRuleSet, or null if none was set
   */
  public DownloadRuleSet getDownloadRuleSet() {
    return downloadRules;
  }


  /**
   * Returns the timeout for getting data, in seconds.
   * @return the value of socketTimeout
   * @see #setTimeout(int)
   */
  public int getTimeout() {
    return socketTimeout;
  }


  /**
   * Stores the timeout for getting data. If HttpTool can't read
   * data from a remote web server after this number of seconds
   * it will stop the download of the current file.
   * @param timeout timeout in seconds
   * @see #getTimeout()
   */
  public void setTimeout(final int timeout) {
    socketTimeout = timeout;
  }


  /**
   * Switches cookie handling on or off.
   * @param enable true to enable HTTP cookies, false to make
   * HttpTool not use cookies
   */
  public void setEnableCookies(final boolean enable) {
    cookiesEnabled = enable;
  }

  /**
   * Reports whether cookie handling is switched on.
   * @return true if HTTP cookies are enabled, false otherwise
   */
  public boolean getEnableCookies() {
    return cookiesEnabled;
  }


  /**
   * Sets the proxy to use, or disables proxy usage.
   *
   * <p>Passing <code>null</code> or an empty string clears the proxy
   * address and port. Otherwise the argument must have the form
   * <code>host:port</code>; the host is resolved immediately and the
   * port must be a number between 1 and 65535.</p>
   *
   * <p>The new settings are only stored when the whole definition is
   * valid. If this method throws, the proxy is left disabled (no
   * address, port 0, empty description), so that {@link #getProxy()}
   * never reports a proxy that is not actually in use.</p>
   *
   * @param proxyDescr the proxy definition in the format host:port,
   * or null / empty string for "no proxy"
   * @throws HttpException if the definition has no host:port form, the
   * port is not numeric or out of range, or the host cannot be resolved
   */
  public void setProxy(String proxyDescr) 
    throws HttpException
  {
    // Start from a consistent "no proxy" state; it is what remains
    // if any of the checks below fails.
    this.proxyAddr = null;
    this.proxyPort = 0;
    this.proxyDescr = "";

    if ((proxyDescr == null) || proxyDescr.equals("")) {
      // keep the caller's value (null or "") as the description
      this.proxyDescr = proxyDescr;
      return;
    }

    int pos = proxyDescr.indexOf(":");
    if (pos <= 0) {
      throw new HttpException("Proxy definition incorrect, "+
                              "fomat must be host:port: "+
                              proxyDescr);
    }

    String proxyHost = proxyDescr.substring(0, pos);

    // Parse and validate into locals first, so that a failure cannot
    // leave a half-updated configuration behind.
    int port;
    try {
      port = Integer.parseInt(proxyDescr.substring(pos+1));
    } catch (NumberFormatException e) {
      throw new HttpException("Proxy definition incorrect, "+
                              "port not numeric: "+
                              proxyDescr);
    }
    if ((port < 1) || (port > 65535)) {
      throw new HttpException("Proxy definition incorrect, "+
                              "port out of range: "+
                              proxyDescr);
    }

    InetAddress addr;
    try {
      addr = InetAddress.getByName(proxyHost);
    } catch (UnknownHostException e) {
      throw new HttpException("Host not found: "+proxyHost);
    }

    // everything is valid - commit the new settings together
    this.proxyAddr = addr;
    this.proxyPort = port;
    this.proxyDescr = proxyDescr;
  }


  /**
   * Returns the textual proxy description that was stored by
   * {@link #setProxy(String)}.
   * @return the proxy settings in the format host:port
   */
  public String getProxy() {
    return this.proxyDescr;
  }


  /**
   * Stores the date for the "If-Modified-Since" header.
   * Usually this is null and HttpTool will retrieve every
   * document. Setting it to a date will retrieve only
   * documents that were modified since that time.
   * @param modifyDate the date to use, or null for no restriction
   */
  public void setIfModifiedSince(final Date modifyDate) {
    this.modifyDate = modifyDate;
  }


  /**
   * Returns the date stored for the "If-Modified-Since" header.
   * @return a Date object if the "If-Modified-Since" date is set,
   * null otherwise
   */
  public Date getIfModifiedSince() {
    return modifyDate;
  }


  /**
   * Stores the value to use for the HTTP From header.
   * @param fromAddress an email address (e.g. some@where.com)
   */
  public void setFromAddress(final String fromAddress) {
    this.fromAddress = fromAddress;
  }

  
  /**
   * Returns the callback object currently stored in this HttpTool.
   * @return the HttpToolCallback object, or null if none was set
   */
  public HttpToolCallback getCallback() {
    return this.callback;
  }


  /**
   * Returns the configured bandwidth value.
   * @return value of bandwidth
   */
  public int getBandwidth() {
    return this.bandwidth;
  }
  

  /**
   * Stores the bandwidth value.
   * @param bandwidth value to assign to bandwidth
   */
  public void setBandwidth(final int bandwidth) {
    this.bandwidth = bandwidth;
  }
  

  /**
   * Stores a callback object.
   *
   * If set, this object is meant to be informed about the current
   * status of the download while HttpTool retrieves a document.
   *
   * @param callback a callback object
   * @see HttpToolCallback
   */
  public void setCallback(final HttpToolCallback callback) {
    this.callback = callback;
  }


  /**
   * Returns the callback update interval.
   * @return the update interval in bytes
   * @see #setUpdateInterval(int)
   */
  public int getUpdateInterval() {
    return this.updateInterval;
  }


  /**
   * Stores the callback update interval.
   *
   * This setting is used if a callback object is defined: after
   * reading this number of bytes the callback is meant to be notified
   * (the original documentation names the method
   * <code>setHttpToolDocCurrentSize</code>).
   * You should not set this to a value smaller than 1000 unless your
   * bandwidth is very small, because it will slow down downloads.
   *
   * @param updateInterval update interval in bytes, must be positive
   * @throws IllegalArgumentException if updateInterval is zero or negative
   *
   * @see HttpToolCallback
   */
  public void setUpdateInterval(final int updateInterval) {
    // reject non-positive values up front, store everything else
    if (updateInterval <= 0) {
      throw new IllegalArgumentException("updateInterval must be > 0 (was "+
                                         updateInterval+")");
    }
    this.updateInterval = updateInterval;
  }

  /**
   * Replaces the CookieManager of this HttpTool.
   * The constructor installs a MemoryCookieManager; use this method
   * to plug in your own CookieManager implementation.
   *
   * @param cm an object that implements the CookieManager interface
   */
  public void setCookieManager(final CookieManager cm) {
    cookieManager = cm;
  }


  /**
   * Returns the CookieManager currently stored in this HttpTool.
   *
   * @return the CookieManager of this HttpTool
   */
  public CookieManager getCookieManager() {
    return cookieManager;
  }


  /**
   * Calls <code>clear()</code> on the cookie manager, if one is set.
   * Does nothing when no cookie manager is configured.
   */
  public void clearCookies() {
    if (cookieManager == null) {
      return;
    }
    cookieManager.clear();
  }


  /**
   * Retrieves a document from the given URL. 
   * If Cookies are enabled it will use the CookieManager to set Cookies 
   * it got from former retrieveDocument operations.
   *
   * @param u the URL to retrieve (only http:// supported yet)
   * @param method HttpConstants.GET for a GET request, HttpConstants.POST
   * for a POST request
   * @param parameters additional parameters. Will be added to the URL if
   * this is a GET request, posted if it is a POST request
   * @return a HttpDoc if a document was retrieved, null otherwise
   *
   * @see HttpConstants
   */
  public HttpDoc retrieveDocument(URL u, int method, String parameters) throws HttpException {
  	DocAndConnection docAndConnection = retrieveDocumentInternal(u, method, parameters, null, null);
  	HttpDoc doc = docAndConnection != null ? docAndConnection.httpDoc : null;
  	if (doc != null && doc.getHttpCode() == 401) {
  		String authProtName = NTLMAuthorization.WWW_AUTHENTICATE_HEADER;
  		String authProtValue = doc.getHeaderValue(authProtName);
  		if (authProtValue == null) {
  			authProtName = NTLMAuthorization.PROXY_AUTHENTICATE_HEADER; 
			authProtValue = doc.getHeaderValue(authProtName);
  		}
  		if (authProtValue.indexOf(NTLMAuthorization.NTLM_TAG)>=0 ||
  			authProtValue.indexOf("Negotiate")>=0) {
  			
			try {
				// STEP 1 - send NTLM-Request
				NTLMAuthorization authorization = (NTLMAuthorization) ntlmAuthorization.clone();
				authorization.setHost(u.getHost());
				// log.info("NTLM-Authentication: " + authorization);
				String auth = authorization.getRequest();
				docAndConnection = retrieveDocumentInternal(u, method, parameters, null, auth);
				
				// STEP 2 - receive NTLM-Nonce
				doc = docAndConnection.httpDoc;
				authProtValue = doc.getHeaderValue(authProtName);
				authorization.extractNonce(authProtValue);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -