📄 httpdoccache.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
//////////////////////////////////////////////////////////////////////////////

package net.matuschek.http;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;

import net.matuschek.util.MD5;
import org.apache.log4j.Category;

/**
 * Full implementation of HttpDocManager interface.
 * Caches documents, links and headers in ZIP-files.
 * Documents with same content will be detected 
 * and share the same content-storage.
 *
 * @author Oliver Schmidt
 * @version $Revision: 1.2 $
 */
public class HttpDocCache implements HttpDocManager {

	/**
	 * Name of the internally used header that marks duplicates.
	 * Headers with this name are left out when headers are written to the cache.
	 */
	protected final static String CONTENT_DUPLICATE = "Content-Duplicate";

	/**
	 * Use MD5 encoding for filenames.
	 * The flag is evaluated while the constructor sets up the storage
	 * directory: only when it is true is the directory.csv index opened.
	 */
	public boolean useMD5 = true;

	/** log4j logging instance shared by all caches */
	protected static Category log =
		Category.getInstance(HttpDocCache.class.getName());

	/** collection of visited URLs (NOTE(review): not referenced in the visible part of this class) */
	private Collection urls = new LinkedList();

	/** storage main directory; always ends with the file separator once set */
	protected String storagedir;

	/** the directory.csv index file that holds directory information; null until the index is set up */
	protected File storageDirectoryFile = null;

	/** subdirectory name for links, including the trailing file separator */
	protected final static String LINKS = "links" + File.separator;

	/** subdirectory name for content, including the trailing file separator */
	protected final static String CONTENT = "content" + File.separator;

	/** subdirectory name for document information, including the trailing file separator */
	protected final static String DOCUMENTS = "documents" + File.separator;
	
	/**
	 * Creates a cache rooted at the given directory.
	 * The work is delegated to <code>setStorageDir</code>, which creates the
	 * sub-directories of the cache if they do not exist yet.
	 *
	 * @param storageDirectory path of the main storage directory; a trailing
	 *        file separator is appended if it is missing
	 */
	public HttpDocCache(String storageDirectory) {
		setStorageDir(storageDirectory);
	}
	
	/** append-mode stream on the directory.csv index; stays null if the index is not used or could not be opened */
	private FileOutputStream storageDirectoryStream = null;
	
	/**
	 * Sets the storage directory and creates the "documents" and "content"
	 * sub-directories if necessary.
	 * When MD5 filenames are in use, the <code>directory.csv</code> index is
	 * opened in append mode as well; the CSV header line is written first if
	 * the index did not exist yet (or was empty).
	 * Problems are logged, they do not abort the set-up.
	 *
	 * @param newStoragedir path of the main storage directory; a trailing
	 *        file separator is appended if it is missing
	 */
	private void setStorageDir(String newStoragedir) {
		storagedir = newStoragedir;

		if (!storagedir.endsWith(File.separator)) {
			storagedir = storagedir + File.separator;
		}

		// create the directories, if they do not exist yet.
		File storagedirFile = new File(storagedir + DOCUMENTS);
		if (!storagedirFile.exists() && !storagedirFile.mkdirs()
				&& !storagedirFile.isDirectory()) {
			log.error("Could not create directory " + storagedirFile.getPath());
		}
		File contentFile = new File(storagedir + CONTENT);
		if (!contentFile.exists() && !contentFile.mkdirs()
				&& !contentFile.isDirectory()) {
			log.error("Could not create directory " + contentFile.getPath());
		}

		if (useMD5) {
			storageDirectoryFile = new File(storagedir + "directory.csv");
			// Decide this BEFORE opening the stream: opening a FileOutputStream
			// in append mode creates the file, so a check afterwards would
			// always find it and the header would never be written.
			boolean needsHeader =
				!storageDirectoryFile.exists() || storageDirectoryFile.length() == 0L;
			try {
				storageDirectoryStream = new FileOutputStream(storageDirectoryFile.getPath(), true);
				if (needsHeader) {
					storageDirectoryStream.write(("Path,URL" + LF).getBytes());
				}
			} catch (Exception e) {
				// keep the stack trace; the cache still works without the index
				log.error(e.getMessage(), e);
			}
		}
	}
	
	/** quote character put around each field written to directory.csv */
	final static String QUOTE = "\"";
	/** platform line separator; ends directory.csv records and separates stored header lines */
	final static String LF = System.getProperty("line.separator");

	/**
	 * Stores the document in the storage directory.
	 * <p>
	 * Headers, URL and (if present) links are written to
	 * <code>documents/&lt;filename&gt;.zip</code>; the content itself is
	 * stored separately via <code>storeContent</code>. Documents that were
	 * served from the cache are not stored again.
	 * <p>
	 * Writing the individual parts is best effort: a failure there is logged
	 * and does not abort the call. The ZIP file's modification time is set to
	 * the document date, or to the current time if the document has none.
	 *
	 * @param doc the document to be stored
	 * @throws DocManagerException if the directory info cannot be written or
	 *         the document file cannot be opened or closed
	 */
	public void storeDocument(HttpDoc doc) throws DocManagerException {
		// don't store cached documents
		if (doc.isCached()) {
			return;
		}

		List links = doc.getLinks();

		// derive the cache filename from the document URL
		String filename = generateFilename(doc.getURL().toExternalForm());

		String filepath = storagedir + DOCUMENTS + filename;
		checkStoragePathFor(DOCUMENTS, filename);

		try {
			File f = new File(filepath + ".zip");
			if (!f.exists()) {
				// first time this document is stored: record it in the index
				writeDirectoryInfo(doc, filename);
			}

			// write it to the file
			OutputStream fs = new BufferedOutputStream(new FileOutputStream(f));
			ZipOutputStream zos = new ZipOutputStream(fs);
			zos.setLevel(9);

			try {
				// content goes to the shared content storage, not into this ZIP
				storeContent(doc);
				writeHeadersToZipFile(doc, zos);
				writeUrlToZipFile(doc, zos);
				if (links != null) {
					writeLinksToZipFile(links, zos);
				}
			} catch (Throwable e) {
				// deliberate best effort, but report through the logger
				// (with stack trace) instead of printing to stdout
				log.error("Could not store document " + doc.getURL() + ": " + e, e);
			} finally {
				try {
					zos.close();
				} finally {
					// make sure the file handle is released even if
					// finishing the ZIP stream failed
					fs.close();
				}
				long date = doc.getDateAsMilliSeconds();
				f.setLastModified(date > 0 ? date : System.currentTimeMillis());
			}
		} catch (IOException ioex) {
			DocManagerException ex = new DocManagerException(ioex.getMessage());
			throw ex;
		}
	}

	/**
	 * Appends one record (quoted cache filename, quoted document URL) to the
	 * directory.csv index.
	 * Does nothing if the index is not in use or its stream could not be
	 * opened. If the write fails, the problem is logged and the index stream
	 * is closed.
	 *
	 * @param doc the document whose URL is recorded
	 * @param filename name of the document in the cache
	 * @throws IOException if closing the index stream after a failed write fails
	 */
	protected void writeDirectoryInfo(HttpDoc doc, String filename)
		throws IOException {
		// Without this guard a missing stream caused a NullPointerException
		// in the write AND again in the catch block's close(), which then
		// escaped to the caller.
		if (storageDirectoryFile == null || storageDirectoryStream == null) {
			return;
		}
		// serialize writers so that records are not interleaved
		synchronized(storageDirectoryFile) {
			try {
				String directoryInfo = QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF;
				storageDirectoryStream.write(directoryInfo.getBytes());
			} catch (Exception e) {
				log.warn(e.getMessage(), e);
				storageDirectoryStream.close();
			}
		}
	}

	/**
	 * Writes the document content to the ZIP stream as a single entry named
	 * "content" plus an extension derived from the Content-Type header.
	 * The entry time is the document's last-modified time; if that is not
	 * available the document date is used, and if that is missing too the
	 * current time (the same fallback the other entry writers use).
	 *
	 * @param doc the document whose content is written
	 * @param zos the ZIP stream to write to
	 * @throws IOException if writing to the ZIP stream fails
	 */
	protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream zos)
		throws IOException {
		String contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE);
		String extension = getExtensionFromContenttype(contenttype);
		ZipEntry zipEntry = new ZipEntry("content" + extension);
		long date = doc.getLastModifiedAsMilliSeconds();
		if (date < 0) {
			date = doc.getDateAsMilliSeconds();
			if (date <= 0) {
				// never hand a missing date to the ZIP entry
				date = System.currentTimeMillis();
			}
		}
		// the time must be set before putNextEntry writes the entry header
		zipEntry.setTime(date);
		zos.putNextEntry(zipEntry);
		zos.write(doc.getContent());
		zos.closeEntry();
	}

	/**
	 * Writes the document's HTTP headers to the ZIP stream as an entry named
	 * "header", one header per line. The internal duplicate marker header is
	 * left out. The entry time is the document date, or the current time if
	 * the document has none.
	 *
	 * @param doc the document whose headers are written
	 * @param zos the ZIP stream to write to
	 * @return the ZIP entry that was written
	 * @throws IOException if writing to the ZIP stream fails
	 */
	protected ZipEntry writeHeadersToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
		StringBuffer comment = new StringBuffer();
		Vector headers = doc.getHttpHeader();
		boolean first = true;
		for (Iterator iter = headers.iterator(); iter.hasNext();) {
			HttpHeader header = (HttpHeader) iter.next();
			if (header.getName().equals(CONTENT_DUPLICATE)) {
				continue;
			}
			// separate, rather than terminate, so that a skipped header at
			// the end does not leave a dangling line separator
			if (!first) {
				comment.append(LF);
			}
			comment.append(header.toString());
			first = false;
		}
		ZipEntry ze = new ZipEntry("header");
		// Set the time BEFORE putNextEntry: the local entry header is written
		// there, and a later setTime would only reach the central directory.
		long date = doc.getDateAsMilliSeconds();
		ze.setTime(date > 0 ? date : System.currentTimeMillis());
		zos.putNextEntry(ze);
		zos.write(comment.toString().getBytes());
		zos.closeEntry();
		return ze;
	}
	
	/**
	 * Reads the HTTP headers from the "header" entry of the ZIP file and adds
	 * them to the document. Each line is split at the first ": " into name
	 * and value; lines without that separator are ignored.
	 *
	 * @param doc the document that receives the headers
	 * @param zf the ZIP file to read from
	 * @return true if the ZIP file has a "header" entry, false otherwise
	 * @throws IOException if reading the entry fails
	 */
	protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
		ZipEntry ze = zf.getEntry("header");
		if (ze == null) {
			return false;
		}
		InputStream is = zf.getInputStream(ze);
		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
		try {
			// readLine() returning null is the end-of-stream signal;
			// ready() only says whether a read would block
			String line;
			while ((line = reader.readLine()) != null) {
				int pos = line.indexOf(": ");
				if (pos >= 0) {
					String name = line.substring(0, pos);
					String value = line.substring(pos + 2);
					HttpHeader header = new HttpHeader(name, value);
					doc.addHeader(header);
				}
			}
		} finally {
			// release the entry stream even if reading failed
			reader.close();
		}
		return true;
	}
	
	/**
	 * Reads the links from the "links" entry of the ZIP file (one URL per
	 * line) into the document's link list. The list is created if the
	 * document has none and emptied otherwise, also when the ZIP file has no
	 * "links" entry.
	 *
	 * @param doc the document that receives the links
	 * @param zf the ZIP file to read from
	 * @return true if the ZIP file has a "links" entry, false otherwise
	 * @throws IOException if reading the entry fails or a line is not a
	 *         valid URL
	 */
	protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
		ZipEntry ze = zf.getEntry("links");
		List links = doc.getLinks();
		if (links == null) {
			links = new Vector();
			doc.setLinks(links);
		} else {
			links.clear();
		}

		if (ze == null) {
			return false;
		}

		InputStream is = zf.getInputStream(ze);
		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
		try {
			// readLine() returning null is the end-of-stream signal;
			// ready() only says whether a read would block
			String line;
			while ((line = reader.readLine()) != null) {
				URL url = new URL(line);
				links.add(url);
			}
		} finally {
			// release the entry stream even if a line was malformed
			reader.close();
		}
		return true;
	}
	
	/**
	 * Writes the document URL to the ZIP stream as an entry named "url".
	 * The entry time is the document date, or the current time if the
	 * document has none.
	 *
	 * @param doc the document whose URL is written
	 * @param zos the ZIP stream to write to
	 * @return the ZIP entry that was written
	 * @throws IOException if writing to the ZIP stream fails
	 */
	protected ZipEntry writeUrlToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
		String url = doc.getURL().toString();
		ZipEntry ze = new ZipEntry("url");
		// Set the time BEFORE putNextEntry: the local entry header is written
		// there, and a later setTime would only reach the central directory.
		long date = doc.getDateAsMilliSeconds();
		ze.setTime(date > 0 ? date : System.currentTimeMillis());
		zos.putNextEntry(ze);
		zos.write(url.getBytes());
		zos.closeEntry();
		return ze;
	}
	
	/**
	 * Returns the ".txt" file in the content storage that belongs to the
	 * MD5 of the document's content.
	 *
	 * @param doc the document whose content identifies the file
	 * @return the file, or null if the document has empty content
	 */
	private File getContentUsersFile(HttpDoc doc) {
		// documents without content have no users file
		if (doc.getContent().length == 0) {
			return null;
		}
		return contentFile(doc.getContentMD5(), ".txt");
	}
	
	/**
	 * Returns URL-String of duplicate content (if found).
	 * The content users file of the document is consulted: its first line is
	 * returned unless that line is the document's own URL.
	 *
	 * @param doc the document to look up
	 * @return URL of a document with the same content, or null if the
	 *         document has no content, no users file exists, the file is
	 *         empty, or the document itself is its first entry
	 * @throws IOException if the users file cannot be read
	 * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
	 */
	public String findDuplicate(HttpDoc doc) throws IOException {
		File f = getContentUsersFile(doc);
		if (f == null) {
			return null;
		}
		String urlString = doc.getURL().toString();
		if (!f.exists()) {
			return null;
		}

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
		try {
			// Only the first line can decide the result: either it is this
			// document's own URL (no duplicate), or it is the duplicate.
			// Later lines never change the outcome, so they are not read.
			String firstUser = reader.readLine();
			if (firstUser != null && !firstUser.equals(urlString)) {
				return firstUser;
			}
			return null;
		} finally {
			// release the file handle even if reading failed
			reader.close();
		}
	}
	
	/**
	 * Creates a file with a name created by the content, containing the URL.
	 * @param doc
	 */	
	protected void storeContent(HttpDoc doc) throws IOException {
		if (doc.getContent().length == 0) 
			return;
		File f = getContentUsersFile(doc);
		String urlString = doc.getURL().toString();
		String md5 = doc.getContentMD5();
		
		// is content user?
		boolean found = false;
		if (f.exists()) {
			BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
			try {
				while (reader.ready()) {
					String line = reader.readLine();
					if (line.equals(urlString)) {
						found = true; break;
					}
				}
			} finally {
				reader.close();
			}
		} 
		
		// write content
		File fzip = contentFile(md5, ".zip");
		if (!fzip.exists()) {
			checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5));
			OutputStream fs = new BufferedOutputStream(new FileOutputStream(fzip));
			ZipOutputStream zos = null;
			try {
				zos = new ZipOutputStream(fs);
				zos.setLevel(9);
				writeContentToZipFile(doc, zos);
			} finally {
				if (zos != null) {
12 下一页
💿 文件大小 180 K
👤 上传用户 chayangccc
📂 所属分类多国语言处理
🏷️ 相关标签

#正 #家 #网络爬虫 #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -