⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httpdoc.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
字号:
package net.matuschek.http;

import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.util.MD5;

/*********************************************
	Copyright (c) 2001 by Daniel Matuschek
*******************************************/

/**
 * An HTTP document, consisting of the content and the HTTP headers.
 *
 * @author Daniel Matuschek (daniel@matuschek.net)
 * @author ptah
 * @version $Id: HttpDoc.java,v 1.11 2004/08/09 17:36:49 matuschd Exp $
 */
public class HttpDoc {

  /** Lowest HTTP return code that {@link #isRedirect()} treats as a redirect. */
  private static final int HTTP_REDIRECTSTART = 300;

  /** Highest HTTP return code that {@link #isRedirect()} treats as a redirect. */
  private static final int HTTP_REDIRECTEND = 399;

  /** Content-Type prefix checked by {@link #isHTML()}. */
  private static final String MIME_HTML = "text/html";

  /** Content-Type prefix checked by {@link #isJavaScript()}. */
  private static final String MIME_JAVASCRIPT = "text/x-javascript";

  /** Key returned by {@link #getContentMD5(byte[])} for missing or empty content. */
  private static final String EMPTY_CONTENT_MD5 = "00000000000000000000000000000000";

  /** The content; stays <code>null</code> until {@link #setContent(byte[])} is called. */
  private byte[] content;

  /**
   * The HTTP header lines, in the order they were added.
   *
   * @link aggregation
   * @associates <{HttpHeader}>
   */
  private final Vector<HttpHeader> httpHeader;

  /**
   * Place to store links of the document if necessary. The element type
   * is not fixed by this class, therefore the list is kept as a raw type.
   */
  private List links;

  /** The HTTP return code; 0 as long as none was set. */
  private int httpReturnCode = 0;

  /** The location this document was retrieved from; may be <code>null</code>. */
  private URL url;

  /** Flag that indicates if this document is retrieved from cache. */
  private boolean cached = false;


  /**
   * Default constructor, initializes a new HttpDoc with
   * empty headers and no content
   */
  public HttpDoc() {
    httpHeader = new Vector<HttpHeader>();
  }

  /**
   * Gets the content of the document
   *
   * @return an array of bytes containing the document content. This
   * may represent text or binary data. The internal array is returned
   * without copying; it is <code>null</code> if no content was set.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Set the content of the document. Every stored Content-MD5 header
   * is removed, because it describes the previous content.
   *
   * @param content the new content (stored without copying)
   */
  public void setContent(byte[] content) {
    this.content = content;
    // existing MD5 keys become invalid - drop all of them, not only the
    // first one, otherwise getContentMD5() could return a stale key
    removeAllHeaders(HttpHeader.CONTENT_MD5);
  }

  /**
   * Sets the HTTP return code from a space separated answer line.
   * The first field is skipped and the second one is parsed as the
   * numeric return code. If the line is <code>null</code>, has less
   * than 2 fields or the second field is not a number, the current
   * return code is left unchanged.
   *
   * @param httpCode the answer line to parse
   */
  public void setHttpCode(String httpCode) {
    if (httpCode == null) {
      return;
    }

    StringTokenizer st = new StringTokenizer(httpCode, " ");
    // an HTTP answer must have at least 2 fields
    if (st.countTokens() < 2) {
      return;
    }

    st.nextToken(); // the first field is not needed here
    String codeStr = st.nextToken();

    try {
      httpReturnCode = Integer.parseInt(codeStr);
    } catch (NumberFormatException ignored) {
      // deliberately best effort: a malformed answer line must not
      // abort the caller, the previous return code is kept instead
    }
  }

  /**
   * Sets the HTTP return code.
   *
   * @param code the numeric HTTP return code
   */
  public void setHttpCode(int code) {
    httpReturnCode = code;
  }

  /**
   * Get the Http Return-Code
   *
   * @return Http Return-Code
   */
  public int getHttpCode() {
    return httpReturnCode;
  }

  /**
   * Add another HTTP header
   *
   * @param header an HttpHeader object to add to the list
   * of headers
   */
  public void addHeader(HttpHeader header) {
    httpHeader.add(header);
  }

  /**
   * Get all HTTP header lines
   *
   * @return the internal Vector of HttpHeader objects (not a copy)
   */
  public Vector getHttpHeader() {
    return httpHeader;
  }

  /**
   * Get the HTTP header with the given name. The name is compared
   * without regard to case, exactly like {@link #getHeader(String)}
   * does; this method is an alias for it.
   *
   * @param headerName name of the header to look for
   *
   * @return a HttpHeader with the given name or null if not found
   */
  public HttpHeader getHttpHeader(String headerName) {
    return getHeader(headerName);
  }

  /**
   * Get the header value with the given name
   *
   * @param headerName name of the header to look for
   *
   * @return a HttpHeader.value with the given name or null if not found
   */
  public String getHeaderValue(String headerName) {
    HttpHeader header = getHeader(headerName);
    return header != null ? header.getValue() : null;
  }

  /**
   * Set a HTTP header value with the given name (creates one if not found)
   *
   * @param headerName name of the header to change or to create
   * @param headerValue the new value of the header
   */
  public void setHeaderValue(String headerName, String headerValue) {
    HttpHeader header = getHeader(headerName);
    if (header == null) {
      addHeader(new HttpHeader(headerName, headerValue));
    } else {
      header.setValue(headerValue);
    }
  }

  /**
   * Get the content of the Location header. This header will
   * be used for REDIRECTs.
   *
   * @return the value of the HTTP Location header or an empty
   * String if there is no such header
   */
  public String getLocation() {
    HttpHeader location = getHeader(HttpHeader.LOCATION);
    return (location == null) ? "" : location.getValue();
  }

  /**
   * Was it a redirect ?
   *
   * @return true if this document is a HTTP REDIRECT, that means
   * the return code is in the range 300 to 399
   */
  public boolean isRedirect() {
    return (httpReturnCode >= HTTP_REDIRECTSTART)
        && (httpReturnCode <= HTTP_REDIRECTEND);
  }

  /**
   * Was it a "normal" document ?
   *
   * @return true if the return code equals HttpConstants.HTTP_OK
   */
  public boolean isOk() {
    return (httpReturnCode == HttpConstants.HTTP_OK);
  }

  /**
   * Was it not modified ?
   *
   * @return true if the return code equals HttpConstants.HTTP_NOTMODIFIED
   */
  public boolean isNotModified() {
    return (getHttpCode() == HttpConstants.HTTP_NOTMODIFIED);
  }

  /**
   * Was it not found ?
   *
   * @return true if the return code equals HttpConstants.HTTP_NOTFOUND
   */
  public boolean isNotFound() {
    return (httpReturnCode == HttpConstants.HTTP_NOTFOUND);
  }

  /**
   * did we get "Authorization required"
   *
   * @return true if the return code equals HttpConstants.HTTP_UNAUTHORIZED
   */
  public boolean isUnauthorized() {
    return (httpReturnCode == HttpConstants.HTTP_UNAUTHORIZED);
  }

  /**
   * Gets the first HttpHeader with the given name. The name is
   * compared without regard to case.
   *
   * @param name name of the header to look for
   *
   * @return the first matching HttpHeader or null if there is no
   * such header or if name is null
   */
  public HttpHeader getHeader(String name) {
    if (name == null) {
      return null;
    }
    for (HttpHeader h : httpHeader) {
      if (name.equalsIgnoreCase(h.getName())) {
        return h;
      }
    }
    return null;
  }

  /**
   * Removes the first HttpHeader with the given name
   *
   * @param name name of the header to remove (case is ignored)
   *
   * @return the removed HttpHeader or null if there was no such header
   */
  public HttpHeader removeHeader(String name) {
    HttpHeader header = getHeader(name);
    if (header != null) {
      httpHeader.remove(header);
    }
    return header;
  }

  /**
   * Get all the HTTP headers. This function is useful if you
   * don't know what headers exists and you want to have ALL
   * headers
   *
   * @return the internal Vector containing HttpHeader objects (not a copy)
   */
  public Vector getHttpHeaders() {
    return httpHeader;
  }


  /**
   * is the content-type text/html ?
   *
   * @return true if the value of the HTTP Content-Type header
   * starts with text/html (case is ignored)
   */
  public boolean isHTML() {
    return contentTypeStartsWith(MIME_HTML);
  }

  /**
   * is this a Javascript document ?
   *
   * @return true if the value of the HTTP Content-Type header
   * starts with text/x-javascript (case is ignored), so that a
   * value with parameters like "text/x-javascript; charset=utf-8"
   * is recognized too
   */
  public boolean isJavaScript() {
    return contentTypeStartsWith(MIME_JAVASCRIPT);
  }


  /**
   * Convert this object to a String.
   *
   * @return a String representation of this HttpDoc. Format
   * may change, therefore this should be used only for
   * logging or debugging
   */
  @Override
  public String toString() {
    StringBuilder res = new StringBuilder();

    // append(Object) prints "null" if no URL was set, instead of
    // failing with a NullPointerException
    res.append(url).append("\n\n");

    for (HttpHeader h : httpHeader) {
      res.append(h.toString());
      res.append("\n");
    }
    res.append("\n");
    if (content != null) {
      // decoded with the platform default charset, this is good
      // enough for logging but not a faithful text conversion
      res.append(new String(content));
    }

    return res.toString();
  }

  /**
   * Get the full URL where this document was retrieved from
   *
   * @return an URL object containing the location where this
   * document was retrieved from
   */
  public URL getURL() {
    return url;
  }


  /**
   * Set the location where this  document was retrieved from
   *
   * @param url the original location of this document
   */
  public void setURL(URL url) {
    this.url = url;
  }

  /**
   * Gets lastModified date as milliseconds.
   *
   * @return the Last-Modified header as parsed by HTTPDateTool.parseDate
   * or -1 if there is no such header
   */
  public long getLastModifiedAsMilliSeconds() {
    String value = getHeaderValue(HttpHeader.LAST_MODIFIED);
    return value != null ? HTTPDateTool.parseDate(value) : -1;
  }

  /**
   * Gets date as milliseconds.
   *
   * @return the Date header as parsed by HTTPDateTool.parseDate
   * or -1 if there is no such header
   */
  public long getDateAsMilliSeconds() {
    String value = getHeaderValue(HttpHeader.DATE);
    return value != null ? HTTPDateTool.parseDate(value) : -1;
  }

  /**
   * Sets lastModified date in milliseconds.
   *
   * @param d lastModified in milliseconds
   */
  public void setLastModified(long d) {
    setDateHeader(HttpHeader.LAST_MODIFIED, d);
  }

  /**
   * Sets date in milliseconds.
   *
   * @param d date in milliseconds
   */
  public void setDate(long d) {
    setDateHeader(HttpHeader.DATE, d);
  }

  /**
   * Calculates MD5 key for given content
   *
   * @param content the bytes to calculate the key for
   * @return MD5 key for content as returned by MD5.asHex(), or a
   * String of 32 zeros if content is null or empty
   */
  protected static String getContentMD5(byte[] content) {
    if ((content == null) || (content.length == 0)) {
      return EMPTY_CONTENT_MD5;
    }
    MD5 md5 = new MD5();
    md5.Update(content);
    return md5.asHex();
  }

  /**
   * Gets MD5 key of document content.
   * A calculated key is stored as a header and reused
   * in successive calls of this method.
   *
   * @return MD5 key
   */
  public String getContentMD5() {
    HttpHeader md5Header = getHeader(HttpHeader.CONTENT_MD5);
    if (md5Header != null) {
      return md5Header.getValue();
    }

    // nothing stored yet: calculate the key and remember it as header
    String md5 = getContentMD5(getContent());
    addHeader(new HttpHeader(HttpHeader.CONTENT_MD5, md5));
    return md5;
  }


  /**
   * Set flag that indicates if this document is retrieved from cache
   *
   * @param cached true if the document comes from cache
   */
  public void setCached(boolean cached) {
    this.cached = cached;
  }

  /**
   * Was this document retrieved from cache?
   *
   * @return cached
   */
  public boolean isCached() {
    return cached;
  }

  /**
   * Store calculated links of a HttpDoc.
   *
   * @param links the links to store (kept by reference)
   */
  public void setLinks(List links) {
    this.links = links;
  }

  /**
   * Gets List of links (if set previously).
   *
   * @return the List given to {@link #setLinks(List)} or null if
   * no links were set
   */
  public List getLinks() {
    return links;
  }


  /**
   * Checks if the value of the Content-Type header starts with the
   * given type. The comparison ignores case and does not depend on
   * the default locale.
   *
   * @param mimeType the type prefix to look for
   * @return true if there is a Content-Type header with a value
   * that starts with mimeType
   */
  private boolean contentTypeStartsWith(String mimeType) {
    HttpHeader ct = getHeader(HttpHeader.CONTENT_TYPE);
    if (ct == null) {
      return false;
    }
    String value = ct.getValue();
    return (value != null)
        && value.regionMatches(true, 0, mimeType, 0, mimeType.length());
  }

  /**
   * Removes all headers with the given name (case is ignored).
   *
   * @param name name of the headers to remove
   */
  private void removeAllHeaders(String name) {
    for (Iterator<HttpHeader> iter = httpHeader.iterator(); iter.hasNext();) {
      HttpHeader h = iter.next();
      if (name.equalsIgnoreCase(h.getName())) {
        iter.remove();
      }
    }
  }

  /**
   * Formats a time with HTTPDateTool.rfc1123Format and stores the
   * result as value of the given header.
   *
   * @param headerName name of the header to set
   * @param millis the time in milliseconds
   */
  private void setDateHeader(String headerName, long millis) {
    // NOTE(review): rfc1123Format looks like a shared formatter object;
    // if it is a SimpleDateFormat it is not thread safe - confirm
    String dateString = HTTPDateTool.rfc1123Format.format(new Date(millis));
    setHeaderValue(headerName, dateString);
  }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -