⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httptool.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
				
				// STEP 3 - send NTLM-Response
				auth = authorization.getResponse();
				docAndConnection = retrieveDocumentInternal(u, method, parameters, docAndConnection.httpConnection, auth);
				if (docAndConnection != null) {
					doc = docAndConnection.httpDoc;
					if (docAndConnection.httpConnection != null) {
						docAndConnection.httpConnection.close();
					}
				} else {
					doc = null; // BUGFIX (Not modified files return null)
				}
				
			} catch (Exception e) {
				log.error("NTLM-Authentication Error: " + e.getMessage());
				throw new HttpException(e.getMessage());
			}
  		}
  	} 
  	return doc;
  }
   
   /**
    * Internal structure to keep connection after retrieval of doc.
    * retrieveDocumentInternal returns an instance of this class so that a
    * caller can reuse or close the connection the document was read from.
    */
   protected class DocAndConnection {
   		// the retrieved document
   		HttpDoc httpDoc;
   		// the connection the document was read from; retrieveDocumentInternal
   		// stores null here when it has already closed the connection itself
   		HttpConnection httpConnection;
   }
   
  /**
   * Sends a single HTTP request for the given URL and reads the response.
   * Works like the variant without a connection argument, but reuses the
   * passed connection if one is given. Otherwise a new connection is opened
   * (directly, via the configured proxy, or via HttpsHelper for https URLs).
   *
   * <p>The connection is closed before returning unless NTLM authorization
   * data was supplied; in that case "Connection: keep-alive" is requested and
   * the still open connection is handed back in the result. If the request
   * fails with an exception, the connection is closed as well.</p>
   *
   * @param u URL of the document; only http and https are supported
   * @param method HttpConstants.GET or HttpConstants.POST
   * @param parameters query string (GET) or form body (POST); may be null,
   *        which is treated like an empty parameter list
   * @param httpConn connection to use, or null to open a new one
   * @param ntlmAuthorizationInfo value for an "Authorization: NTLM" header,
   *        or null if no NTLM header should be sent
   * @return the document together with the connection (null if it was
   *         closed). Returns null if download rules are configured and
   *         either they deny the download or the Last-Modified header is
   *         not newer than modifyDate.
   * @throws HttpException if the URL has no host, the host does not resolve
   *         (without proxy), the protocol or method is not supported, the
   *         status line or the headers can not be read, or an I/O error
   *         occurs
   */
  protected DocAndConnection retrieveDocumentInternal(URL u, int method, String parameters,
                                                      HttpConnection httpConn,
                                                      String ntlmAuthorizationInfo)
    throws HttpException
  {
    String host = null;
    InetAddress addr = null;
    String path = null;
    String requestPath = null;
    String protocol = null;
    String userinfo = null;
    boolean chunkedEncoding = false;
    boolean secureConnection = false;
    ChunkedInputStream chunkStream = null;

    // Content-Length announced by the server (-1 = unknown)
    int docSize = -1;

    int port = 0;
    HttpDoc doc = new HttpDoc();
    int i = 0;

    // set document URL
    doc.setURL(u);

    // buffer used for the status line, every header line and the body
    ByteBuffer buff = new ByteBuffer();

    InputStream is = null;
    BufferedWriter bwrite = null;

    // get host
    host = u.getHost();
    if (host == null) {
      throw new HttpException("no host part in URL found");
    }

    // get address, if not using a proxy
    // if the client runs behind a proxy it is possible, that name
    // resolution for the internet is not possible
    if (! useProxy()) {
      try {
        addr = InetAddress.getByName(host);
      } catch (UnknownHostException e) {
        addr = null;
      }
      if (addr == null) {
        throw new HttpException("host part (" + host + ") does not resolve");
      }
    }

    // get path
    path = u.getFile();
    if (path.equals("")) {
      path = "/";
    }
    // replace spaces
    path = path.replaceAll(" ", "%20");

    // get protocol and port
    port = u.getPort();
    protocol = u.getProtocol().toLowerCase();
    int defaultPort;
    if (protocol.equals("http")) {
      defaultPort = DEFAULT_HTTPPORT;
    } else if (protocol.equals("https")) {
      defaultPort = DEFAULT_HTTPSPORT;
      secureConnection = true;
    } else {
      throw new HttpException("protocol " + protocol + " not supported");
    }
    if (port == -1) {
      port = defaultPort;
    }

    // host[:port] as it has to appear in the Host header and in the
    // absolute URL sent to a proxy; the port may only be left out if it
    // is the default port of the protocol
    String authority = host;
    if (port != defaultPort) {
      authority = host + ":" + port;
    }

    // if using the proxy, request path is the whole URL, otherwise only
    // the path part of the URL
    if (useProxy() && (! secureConnection)) {
      requestPath = "http://" + authority + path;
    } else {
      requestPath = path;
    }

    // get user info
    userinfo = u.getUserInfo();
    if (userinfo != null) {
      if (userinfo.equals("")) {
        userinfo = null;
      } else {
        // Store user info for this host
        userInfos.setProperty(host, userinfo);
      }
    } else {
      // do we have a stored user info?
      userinfo = userInfos.getProperty(host);
    }

    // a POST without parameters is sent with an empty body instead of
    // failing with a NullPointerException
    String postBody = (parameters == null) ? "" : parameters;

    if (callback != null) {
      callback.setHttpToolDocUrl(u.toString());
      callback.setHttpToolStatus(STATUS_CONNECTING);
    }

    // set to true once the response was processed completely; used to
    // decide whether the connection has to be cleaned up in "finally"
    boolean success = false;

    // okay, we got all needed information, try to connect to the host
    try {
      if (httpConn == null) {
        // connect and initialize streams
        // timeout is stored in seconds in HttpTool, but
        // HttpConnection uses milliseconds
        if (secureConnection) {
          HttpsHelper helper = new HttpsHelper(proxyAddr, proxyPort, useProxy());
          httpConn = helper.createHttpsConnection(host, port);
        } else if (useProxy()) {
          httpConn = HttpConnection.createConnection(proxyAddr,
                                                     proxyPort,
                                                     socketTimeout*1000);
        } else {
          httpConn = HttpConnection.createConnection(addr,
                                                     port,
                                                     socketTimeout*1000);
        }
      }

      is = new LimitedBandwidthStream(
             new BufferedInputStream(httpConn.getInputStream(), 256),
             bandwidth);
      bwrite = new BufferedWriter(
                 new OutputStreamWriter(httpConn.getOutputStream()));

      if (callback != null) {
        callback.setHttpToolStatus(STATUS_CONNECTED);
      }

      // write HTTP request
      // get or post ?
      if (method == HttpConstants.GET) {
        bwrite.write("GET ");
        bwrite.write(requestPath);
        if ((parameters != null)
            && (! parameters.equals(""))) {
          bwrite.write("?");
          bwrite.write(parameters);
        }
      } else if (method == HttpConstants.POST) {
        bwrite.write("POST " + requestPath);
      } else {
        throw new HttpException("HTTP method " + method + " not supported");
      }

      // last part of request line
      bwrite.write(" ");
      bwrite.write(HTTP_VERSION);
      bwrite.write("\r\n");

      // Referer header only if defined
      if (referer != null) {
        bwrite.write("Referer: " + referer + "\r\n");
      }

      // if cookies are enabled, write a Cookie: header
      if (cookiesEnabled) {
        String cookieString = cookieManager.cookiesForURL(u);
        if (cookieString != null) {
          bwrite.write("Cookie: ");
          bwrite.write(cookieString);
          bwrite.write("\r\n");
          log.debug("Cookie request header: "+cookieString);
        }
      }

      // Write other headers
      bwrite.write("Host: " + authority + "\r\n");
      bwrite.write("User-Agent: " + agentName + "\r\n");
      bwrite.write("Accept: */*\r\n");
      // the NTLM handshake needs several requests on the same connection
      if (ntlmAuthorizationInfo == null) {
        bwrite.write("Connection: close\r\n");
      } else {
        bwrite.write("Connection: keep-alive\r\n");
      }

      // Write "From:" header only if a fromAddress is defined
      if (fromAddress != null) {
        bwrite.write("From: "+fromAddress+"\r\n");
      }

      // if we have username and password, lets write an Authorization
      // header
      if (userinfo != null) {
        // special hack to support usernames with "@"
        // TO DO: find a better solution for this problem
        userinfo = userinfo.replace('%','@');
        bwrite.write("Authorization: Basic ");
        bwrite.write(Base64.encode(userinfo));
        bwrite.write("\r\n");
        // never write the credentials themselves to the log
        log.debug("Sending Basic authorization header for host " + host);
      }

      if (ntlmAuthorizationInfo != null) {
        bwrite.write("Authorization: NTLM ");
        bwrite.write(ntlmAuthorizationInfo);
        bwrite.write("\r\n");
      }

      // if there is a "If-Modified-Since" date, also write this header
      if (modifyDate != null) {
        String dateStr = df.format(modifyDate);

        bwrite.write("If-Modified-Since: ");
        bwrite.write(dateStr);
        bwrite.write("\r\n");
        log.debug("If-Modified-Since header: "+dateStr);
      }

      // for a POST request we also need a content-length header
      if (method == HttpConstants.POST) {
        bwrite.write("Content-Type: application/x-www-form-urlencoded\r\n");
        // Content-Length counts bytes, not characters; the writer above
        // uses the platform default charset, just like getBytes()
        bwrite.write("Content-Length: "+postBody.getBytes().length+"\r\n");
      }

      // finished headers
      bwrite.write("\r\n");
      // if this is a POST request, we have to add the POST parameters
      if (method == HttpConstants.POST) {
        bwrite.write(postBody);
      }
      bwrite.flush();

      if (callback != null) {
        callback.setHttpToolStatus(STATUS_RETRIEVING);
      }

      // read the first line (HTTP return code) up to the line feed
      while ((i = is.read()) != 10) {
        if (i == -1) {
          throw new HttpException("Could not get HTTP return code "+
                                  "(buffer content is "+buff.toString()+")");
        }
        buff.append((byte)i);
      }

      String httpCode = lineString(buff.getContent());
      buff.clean();
      doc.setHttpCode(httpCode);

      // read the HTTP headers
      boolean finishedHeaders = false;
      while (!finishedHeaders) {
        i = is.read();
        if (i == -1) {
          throw new HttpException("Could not read HTTP headers");
        }
        // control characters (including CR and LF) are not stored
        if (i >= 32) {
          buff.append((byte)i);
        }
        // HTTP header processing
        if (i == LF) {
          String line = lineString(buff.getContent());

          buff.clean();
          // empty line means "end of headers"
          if (line.trim().equals("")) {
            finishedHeaders = true;
          } else {
            HttpHeader head = new HttpHeader(line);
            doc.addHeader(head);

            if (cookiesEnabled
                && head.isSetCookie()) {
              try {
                Cookie cookie = new Cookie(head.toLine(),u);
                cookieManager.add(cookie);
                log.debug("Got a cookie "+cookie);
              } catch (CookieException e) {
                log.info("Could not interpret cookie: "+e.getMessage());
              }
            }

            // Content chunked ?
            if (head.getName().equalsIgnoreCase("Transfer-Encoding")
                && head.getValue().equalsIgnoreCase("chunked")) {
              chunkedEncoding = true;
            }
          }
        }
      }
      buff.clean();

      // if there is a DownloadRule, ask if we should download
      // the data
      if (downloadRules != null) {
        // if it is not allowed to download this URL, close socket
        // and return a null document
        boolean isNotModified = false;
        if (modifyDate != null) {
          HttpHeader lastModifiedHeader = doc.getHttpHeader("Last-Modified");
          if (lastModifiedHeader != null) {
            try {
              Date lastModifiedDate = df.parse(lastModifiedHeader.getValue());
              if (lastModifiedDate.compareTo(modifyDate) <= 0) {
                isNotModified = true;
              }
            } catch (ParseException e) {
              // an unparseable Last-Modified header is deliberately
              // ignored: the document is then treated as modified
            }
          }
        }

        if (! downloadRules.downloadAllowed(doc.getHttpHeader()) || isNotModified) {
          if (doc.isNotModified()) {
            log.info("If-Not-Modified successfull for: " + u);
          } else if (isNotModified) {
            log.info("Header indicates not modified for: " + u);
          } else {
            log.info("Download not allowed by download rule.");
          }
          // Close connection
          httpConn.close(); httpConn = null;

          if (callback != null) {
            callback.setHttpToolStatus(STATUS_DENIEDBYRULE);
          }
          return null;
        }
      }

      // if we got encoding "chunked", use the ChunkedInputStream
      if (chunkedEncoding) {
        chunkStream = new ChunkedInputStream(is);
      }

      // did we got an Content-Length header ?
      HttpHeader contentLength = doc.getHeader(HttpHeader.CONTENT_LENGTH);
      if (contentLength != null) {

        try {
          docSize = Integer.parseInt(contentLength.getValue());
        } catch (NumberFormatException e) {
          log.error("Got a malformed Content-Length header from the server");
          docSize = -1;
        }

        // send information to callback
        if (callback != null) {
          callback.setHttpToolDocSize(docSize);
        }

        // initialize the byte buffer with the given document size
        // there is no need to increase the buffer size dynamically
        if (docSize > 0) {
          buff.setSize(docSize);
        }
      }

      // read data
      // "Content-Length: 0" means there is no body at all; trying to read
      // one would block on a keep-alive connection until the socket
      // timeout expires
      boolean finished = (! chunkedEncoding) && (docSize == 0);
      int count = 0;

      if (finished && (callback != null)) {
        callback.setHttpToolDocCurrentSize(buff.length());
      }

      while (! finished) {

        if (chunkedEncoding) {
          i = chunkStream.read();
        } else {
          i = is.read();
        }

        if (i == -1) {
          // this should only happen on HTTP/1.0 responses
          // without a Content-Length header
          finished = true;
        } else {
          buff.append((byte)i);
          count++;
        }

        // finished ?
        // there are other tests then wait until read gives us a -1:

        // if there was a Content-Length header stop after reading the
        // given number of bytes
        if (count == docSize) {
          finished = true;
        }

        // if it is a chunked stream we should use the isDone method
        // to look if we reached the end
        if (chunkedEncoding) {
          if (chunkStream.isDone()) {
            finished = true;
          }
        }

        // should we call the callback interface ?
        if (callback != null) {
          if (((buff.length() % updateInterval) == 0)
              || finished) {
            callback.setHttpToolDocCurrentSize(buff.length());
          }
        }
      }

      doc.setContent(buff.getContent());

      // without NTLM the server was asked to close the connection, so it
      // can not be reused; with NTLM it is handed back to the caller
      if (ntlmAuthorizationInfo == null) {
        httpConn.close(); httpConn = null;
      }

      if (callback != null) {
        callback.setHttpToolStatus(STATUS_DONE);
      }

      success = true;

    } catch (IOException e) {
      throw new HttpException(e.getMessage());
    } finally {
      // do not leak the connection if the request failed half way
      if ((! success) && (httpConn != null)) {
        try {
          httpConn.close();
        } catch (Exception ignored) {
          // nothing more can be done; the original error is reported
        }
      }
    }

    DocAndConnection docAndConnection = new DocAndConnection();
    docAndConnection.httpDoc = doc;
    docAndConnection.httpConnection = httpConn;

    return docAndConnection;
  }



  /**
   * Tells whether requests have to go through a proxy.
   * @return true if a proxy address is set, false otherwise
   */
  protected boolean useProxy() {
    // no proxy address means direct connections
    if (proxyAddr == null) {
      return false;
    }
    return true;
  }


  /**
   * Builds a String from the given bytes using the platform default
   * charset. A single trailing CR is not part of the result.
   * @param b the bytes of one line
   * @return the line as String, "" for an empty array
   */
  protected String lineString(byte[] b) {
    // number of bytes that belong to the line
    int usable = b.length;
    if ((usable > 0) && (b[usable - 1] == CR)) {
      usable--;
    }
    return new String(b, 0, usable);
  }

/**
 * Stores the NTLM authorization object in this instance.
 * @param ntlmAuthorization the object to store; it replaces any value
 *        set before
 */
public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
	this.ntlmAuthorization = ntlmAuthorization;
}

/**
 * Returns the NTLM authorization object stored in this instance.
 * @return the current value of the ntlmAuthorization field
 *         (presumably null if none was set - TODO confirm the field's
 *         initial value)
 */
public NTLMAuthorization getNtlmAuthorization() {
	return ntlmAuthorization;
}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -