📄 httpdoctofile.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
字号:
package net.matuschek.http;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.StringTokenizer;

import org.apache.log4j.Category;

/**
 * DocumentManager that will store document contents in a file.
 *
 * @author Daniel Matuschek 
 * @version $Revision: 1.11 $
 */
public class HttpDocToFile extends AbstractHttpDocManager
{
  /**
   * directory where the files will be created
   */
  private String baseDir;


  /**
   * the object will not store documents smaller than this size (in bytes)
   */
  private int minFileSize;


  /**
   * defines if special characters in the URL should be replaced
   * by "normal" characters
   * @see #setReplaceAllSpecials(boolean)
   */
  private boolean replaceAllSpecials = false;


  /**
   * defines, if CGIs should be stored on disc.
   *
   * @see #setStoreCGI
   */
  private boolean storeCGI = true;

  /** Log4J logging */
  private Category log;


  /**
   * creates a new HttpDocToFile object that will store the
   * documents in the given directory
   *
   * @param baseDir directory below which all files will be created
   */
  public HttpDocToFile(String baseDir) {
    this.baseDir = baseDir;
    log = Category.getInstance(getClass().getName());
  }


  /**
   * store document (that means write it to disk).
   *
   * Nothing is written if the document or its content is null, if the
   * document came from the cache, if storeCGI is false and the URL
   * contains a "?", or if the content is smaller than minFileSize.
   *
   * @param doc the document to store
   * @exception DocManagerException if the document can't be stored
   * (some IO error occurred)
   */
  public void storeDocument(HttpDoc doc)
    throws DocManagerException
  {
    if ((doc == null) || (doc.getContent() == null)) {
      return;
    }

    /*
     * write file only, if this was NOT a cached document
     * (in this case we have it already on harddisk)
     */
    if (doc.isCached()) {
      return;
    }

    if ((! storeCGI)
        && (doc.getURL().toString().indexOf('?') >= 0)) {
      // do not store dynamic pages, because storeCGI is false
      // and the URL contains a "?"
      return;
    }

    String filename = url2Filename(doc.getURL());
    byte[] content = doc.getContent();
    if (content.length < minFileSize) {
      return;
    }

    try {
      createDirs(filename);
      writeContentToFile(filename, content);
    } catch (IOException e) {
      throw new DocManagerException(e.getMessage());
    }
  }


  /**
   * Writes the given bytes to the named file. The stream is closed
   * even if writing fails, so no file handle is leaked.
   *
   * @param filename full path name of the file to write
   * @param content the bytes to write
   * @exception IOException if the file can't be opened or written
   */
  private void writeContentToFile(String filename, byte[] content)
    throws IOException
  {
    BufferedOutputStream os =
      new BufferedOutputStream(new FileOutputStream(filename));
    try {
      os.write(content);
      os.flush();
    } finally {
      os.close();
    }
  }


  /**
   * Gets the cacheFile of the given URL if its document was stored.
   *
   * @param url the URL to look up
   * @return the file that url2Filename maps this URL to, or null if
   * it does not exist or is not a regular file
   */
  protected File getCacheFile(URL url) {
    // does the file exist on the filesystem ?
    File cacheFile = new File(url2Filename(url));
    if (cacheFile.exists() && cacheFile.isFile()) {
      return cacheFile;
    }
    return null;
  }


  /**
   * Gets the filename extension of the given URL.
   *
   * @param url the URL to inspect
   * @return the text after the last "." of the URL path, or null if
   * the URL looks dynamic (contains "?" or "cgi"), the path has no ".",
   * or the last "."-separated part contains a "/"
   */
  protected String getExtension(URL url) {
    // is it dynamic ?
    String urlString = url.toString();
    if ((urlString.indexOf('?') >= 0)
        || (urlString.indexOf("cgi") >= 0)) {
      return null;
    }

    // do we have a filename extension ?
    // without it is not possible to guess the MIME type.
    String path = url.getPath();
    if (path.indexOf(".") < 0) {
      return null;
    }

    // the extension is the last token of the path split at "."
    String ext = null;
    StringTokenizer st = new StringTokenizer(path, ".");
    while (st.hasMoreTokens()) {
      ext = st.nextToken();
    }

    // ext is still null if the path consists of dots only (the
    // tokenizer yields no tokens); there is also no extension if
    // ext contains a "/"
    if ((ext == null) || (ext.indexOf("/") >= 0)) {
      return null;
    }

    return ext;
  }


  /**
   * Removes a document that was stored previously from the file system.
   *
   * Nothing is removed if getExtension returns null for the URL (it
   * looks dynamic or has no filename extension) or if no regular file
   * exists for it. A failed delete is logged and otherwise ignored.
   *
   * @param u the URL of the document to remove
   */
  public void removeDocument(URL u) {
    String ext = getExtension(u);
    if (ext == null) {
      return;
    }

    File cacheFile = getCacheFile(u);
    if (cacheFile == null) {
      return;
    }

    if (! cacheFile.delete()) {
      log.info("Could not delete cached document " + cacheFile.getPath());
    }
  }


  /**
   * Gets a document that was stored previously from the file system.
   * Because the HttpDocToFile does not store the HTTP headers, only
   * the Content-Type header will exist. Even this header may not
   * be correct. It will only use a simple heuristic to determine the
   * possible MIME type.
   *
   * @param u the URL of the document to load
   * @return null, if this document was not stored before, it seems
   * to be a dynamic document, it is too large to fit into a byte
   * array, or it could not be read completely.
   */
  public HttpDoc retrieveFromCache(URL u) {
    String ext = getExtension(u);
    if (ext == null) {
      return null;
    }

    File cacheFile = getCacheFile(u);
    if (cacheFile == null) {
      return null;
    }

    // a byte array can't hold more than Integer.MAX_VALUE bytes
    long size = cacheFile.length();
    if (size > Integer.MAX_VALUE) {
      log.info("File too large");
      return null;
    }

    // read the file
    byte[] buff;
    try {
      buff = readFileFully(cacheFile, (int) size);
    } catch (IOException e) {
      log.info("Could not read cached document "+e.getMessage());
      return null;
    }

    // create a new HttpDoc object
    HttpDoc doc = new HttpDoc();

    // and set the content and the header
    doc.setHttpCode("HTTP/1.0 200 OK");
    doc.setContent(buff);

    // now guess the MIME type
    doc.addHeader(new HttpHeader("Content-Type", guessMimeType(ext)));
    doc.setURL(u);
    doc.setCached(true);

    return doc;
  }


  /**
   * Reads exactly size bytes from the given file. A single call to
   * read() may return fewer bytes than requested, so this loops until
   * the buffer is full. The stream is always closed.
   *
   * @param file the file to read
   * @param size the number of bytes to read
   * @return a buffer of length size filled with the file content
   * @exception IOException if the file can't be read or ends before
   * size bytes were read
   */
  private byte[] readFileFully(File file, int size) throws IOException {
    byte[] buff = new byte[size];
    FileInputStream fi = new FileInputStream(file);
    try {
      int offset = 0;
      while (offset < size) {
        int count = fi.read(buff, offset, size - offset);
        if (count < 0) {
          // the file is shorter than File.length() reported
          throw new IOException("unexpected end of file");
        }
        offset += count;
      }
    } finally {
      fi.close();
    }
    return buff;
  }


  /**
   * Guesses the MIME type from a filename extension.
   *
   * @param ext the filename extension (without the dot)
   * @return "text/html" for the known HTML-like extensions, otherwise
   * "application/unknown"
   */
  private String guessMimeType(String ext) {
    if (ext.equals("html")
        || ext.equals("htm")
        || ext.equals("shtml")
        || ext.equals("asp")
        || ext.equals("php")
        || ext.equals("jsp")) {
      return "text/html";
    }
    return "application/unknown";
  }


  /**
   * gets the value of baseDir
   * @return the value of baseDir
   */
  public String getBaseDir() {
    return baseDir;
  }


  /**
   * sets the value of basedir
   * @param baseDir the new value of baseDir
   */
  public void setBaseDir(String baseDir) {
    this.baseDir = baseDir;
  }


  /**
   * converts an URL to a filename http://host/path will
   * be converted to basedir/host/path
   *
   * A non-empty query is appended as an additional path element, and
   * names ending with "/" get "index.html" appended. Every "/" is
   * replaced by the operating system file name separator; if
   * replaceAllSpecials is set, "?", "=" and "&amp;" are replaced by "-".
   *
   * NOTE(review): URL.getFile() already contains "?query", so the query
   * ends up in the name twice; left unchanged to keep the existing
   * on-disk layout.
   * NOTE(review): path elements like ".." are not normalized here, so a
   * URL may map to a file outside of baseDir - consider validating.
   *
   * @param u a URL to convert, must not be null
   * @return a pathname
   */
  protected String url2Filename(URL u) {
    StringBuffer sb = new StringBuffer();

    sb.append(baseDir);
    sb.append(File.separatorChar);
    sb.append(u.getHost());
    sb.append(u.getFile());

    // is there a query part ?
    // that is something after the file name separated by ?
    String query = u.getQuery();
    if ((query != null) && (! query.equals(""))) {
      sb.append(File.separatorChar);
      sb.append(query);
    }

    // filenames that end with /
    // are directories, we will name the file "index.html"
    if (sb.charAt(sb.length() - 1) == '/') {
      sb.append("index.html");
    }

    // postprocess filename (replace special characters)
    for (int i = 0; i < sb.length(); i++) {
      char c = sb.charAt(i);

      if (c == '/') {
        // replace / by operating system file name separator
        sb.setCharAt(i, File.separatorChar);
      } else if (replaceAllSpecials
                 && ((c == '?') || (c == '=') || (c == '&'))) {
        // replace special characters from CGIs
        sb.setCharAt(i, '-');
      }
    }

    return sb.toString();
  }


  /**
   * creates all directories that are needed to place the
   * file filename if they don't exist
   *
   * @param filename the full path name of a file
   * @exception IOException if a needed directory can't be created
   */
  protected void createDirs(String filename) throws IOException {
    // look for the last directory separator in the filename
    int pos = filename.lastIndexOf(File.separatorChar);
    if (pos <= 0) {
      // no directory part at all (or only the leading separator),
      // so there is nothing to create
      return;
    }

    File dir = new File(filename.substring(0, pos));
    // mkdirs() returns false if the directory already exists, so check
    // before and after (it may also have been created concurrently)
    if ((! dir.isDirectory()) && (! dir.mkdirs()) && (! dir.isDirectory())) {
      throw new IOException("Could not create directory " + dir.getPath());
    }
  }


  /**
   * gets the value of minFileSize. Files smaller than this size
   * (in Bytes) will not be saved to disk !
   * @return the value of minFileSize
   */
  public int getMinFileSize() {
    return minFileSize;
  }


  /**
   * sets the value of minFileSize
   * @param minFileSize the new value of minFileSize
   * @see #getMinFileSize()
   */
  public void setMinFileSize(int minFileSize) {
    this.minFileSize = minFileSize;
  }


  /**
   * Get the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, the special characters "?", "="
   * and "&amp;" in the URL will be replaced by "-". This is useful for
   * operating systems that can't handle files with special characters
   * in the filename (e.g. Windows)
   *
   * @return value of replaceAllSpecials.
   */
  public boolean isReplaceAllSpecials() {
    return replaceAllSpecials;
  }


  /**
   * Set the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, the special characters "?", "="
   * and "&amp;" in the URL will be replaced by "-". This is useful for
   * operating systems that can't handle files with special characters
   * in the filename (e.g. Windows)
   *
   * @param v  Value to assign to replaceAllSpecials.
   */
  public void setReplaceAllSpecials(boolean v) {
    this.replaceAllSpecials = v;
  }


  /**
   * Get the value of storeCGI
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   *
   * @return value of storeCGI.
   */
  public boolean getStoreCGI() {
    return storeCGI;
  }


  /**
   * Set the value of storeCGI.
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   *
   * @param v  Value to assign to storeCGI.
   */
  public void setStoreCGI(boolean v) {
    this.storeCGI = v;
  }

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -