📄 httpdoccache.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
					zos.close();
				} else {
					fs.close();
				}
			}
		} else {
			fzip.setLastModified(System.currentTimeMillis());
		}
		
		// append user
		if (!found) {
			FileOutputStream os = new FileOutputStream(f.getPath(), true);
			try {
				os.write((urlString + LF).getBytes());
			} finally {
				os.close();
			}
		}
	}

	/**
	 * Writes the given links into one zip entry named "links".
	 * Every URL is written on its own line (terminated by LF); a URL
	 * that occurs more than once in the list is written only once.
	 *
	 * @param links list of java.net.URL objects to store
	 * @param zs zip stream receiving the "links" entry; the entry is
	 * closed afterwards, the stream itself is left open
	 * @throws IOException if writing to the zip stream fails
	 */
	protected void writeLinksToZipFile(List links, ZipOutputStream zs)
		throws IOException {
		zs.putNextEntry(new ZipEntry("links"));

		// add() returns true only for elements that were not yet in the set,
		// so it serves as the "already written?" check and the bookkeeping
		HashSet alreadyWritten = new HashSet();
		Iterator remaining = links.iterator();
		while (remaining.hasNext()) {
			URL link = (URL) remaining.next();
			if (alreadyWritten.add(link)) {
				String line = link.toString() + LF;
				zs.write(line.getBytes());
			}
		}

		zs.closeEntry();
	}
	
	/**
	 * Collects the URL of a processed document.
	 * The URL is added to the collected URLs unless the document carries
	 * a CONTENT_DUPLICATE header (i.e. its content is a duplicate).
	 * 
	 * @param doc a HttpDoc object to process. This may also be null,
	 * in which case nothing is collected.
	 * @exception DocManagerException will be thrown if an error occurs
	 * while processing the document.
	 * @see net.matuschek.http.HttpDocManager#processDocument(net.matuschek.http.HttpDoc)
	 */
	public void processDocument(HttpDoc doc) throws DocManagerException {
		// the contract explicitly allows null: there is nothing to log or collect
		if (doc == null) {
			return;
		}
		
		log.info(
			"Processing "
				+ doc.getURL().toExternalForm()
				+ doc.getHttpHeader());
				
		// collect URL (only if content is no duplicate)
		HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE);
		if (duplicate == null) {
			urls.add(doc.getURL());
		}
	}

	/**
	 * Retrieves a document from the cache.
	 * <p>
	 * The document's zip file is looked up below the DOCUMENTS directory
	 * of the storage directory. Headers and links are read from that file;
	 * the content is read from a separate zip file located through the
	 * document's content MD5. If that content file does not exist the
	 * document gets an empty content.
	 * <p>
	 * If anything goes wrong while reading, the document's cache file is
	 * considered invalid and is deleted.
	 * 
	 * @param url URL of the wanted document
	 * @return the cached document, or null if there is no cache file for
	 * the URL or the cache file could not be read
	 * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
	 */
	public HttpDoc retrieveFromCache(java.net.URL url) {
		HttpDoc doc = null;
		File f = null;
		ZipFile zf = null;
		try {
			String filename0 = url.toExternalForm(); 
			String filename = generateFilename(filename0) + ".zip";
			f = new File(storagedir + DOCUMENTS + filename);
					
			if (f.exists()) {
				log.info("retrieve " + f);
				
				// create document and read it from file
				doc = new HttpDoc();
				doc.setURL(url);
				zf = new ZipFile(f);
				
				// read headers
				readHeadersFromZipFile(doc, zf);
				
				// read links
				readLinksFromZipFile(doc, zf);
				
				doc.setCached(true);
				
				// read content
				String md5 = doc.getContentMD5();
				File contentFile = contentFile(md5, ".zip");
				if (contentFile.exists()) {
					ZipFile contentZip = new ZipFile(contentFile);
					try {
						readContentFromZipFile(doc, contentZip);
					} finally {
						// release the handle even if reading the content fails
						contentZip.close();
					}
				} else {
					doc.setContent(new byte[0]);
				}
				zf.close();
			} 
		} catch (Exception e) {
			// Release the zip handle before deleting: an open handle leaks
			// and can prevent the deletion on some platforms.
			if (zf != null) {
				try {
					zf.close();
				} catch (IOException ignored) {
					// the file is discarded anyway
				}
			}
			if (f != null) {
				log.warn("removing invalid file " + f);
				f.delete();
			} else {
				// failure happened before the cache file could be determined
				log.warn("could not determine cache file for " + url);
			}
			doc = null;
		}
				
		return doc;
	}
	
	/**
	 * Reads the document content from a zip file.
	 * The first entry whose name starts with "content" is read completely
	 * and stored as content of the document. If there is no such entry
	 * the content of the document is set to null.
	 * 
	 * @param doc document that receives the content
	 * @param contentZip zip file to read from; it is not closed here
	 * @throws IOException if the entry size is unknown or does not fit
	 * into a byte array, if the entry ends before all announced bytes
	 * were read, or if reading fails
	 */
	protected void readContentFromZipFile(HttpDoc doc, ZipFile contentZip)
		throws IOException {
		byte[] content = null;
		for (Enumeration enumeration = contentZip.entries(); enumeration.hasMoreElements();) {
			ZipEntry zipEntry = (ZipEntry) enumeration.nextElement();
			if (zipEntry.getName().startsWith("content")) {
				// getSize() returns -1 if the size is unknown and a long that
				// may exceed the maximum array length
				long size = zipEntry.getSize();
				if (size < 0 || size > Integer.MAX_VALUE) {
					throw new IOException(
						"unsupported size " + size + " of zip entry " + zipEntry.getName());
				}
				int length = (int) size;
				content = new byte[length];
				
				InputStream is = contentZip.getInputStream(zipEntry);
				try {
					int startPos = 0;
					while (startPos < length) {
						int count = is.read(content, startPos, length - startPos);
						if (count < 0) {
							// read() signals end of stream with -1; adding that to
							// the position would corrupt the loop
							throw new IOException(
								"unexpected end of zip entry " + zipEntry.getName());
						}
						startPos += count;
					}
				} finally {
					is.close();
				}
				break;
			}
		}
		doc.setContent(content);
	}
	
	/**
	 * Removes a document from the cache.
	 * Deletes the document's file in the LINKS directory, its stored
	 * content (if the document could be retrieved from the cache) and its
	 * file in the DOCUMENTS directory. Errors are logged, not thrown.
	 * 
	 * @param url URL of the document to remove
	 * @see net.matuschek.http.HttpDocManager#removeDocument(URL)
	 */
	public void removeDocument(URL url) {
		// may be null if the document is not cached or its cache file is invalid
		HttpDoc doc = retrieveFromCache(url);
		
		try {
			String filename = generateFilename(url.toExternalForm()) + ".zip";
			
			File linksFile = new File(storagedir + LINKS + filename);
			if (linksFile.exists()) {
				linksFile.delete();
			}
			
			// without a retrievable document there is no content reference
			// to clean up; skipping avoids a NullPointerException that would
			// also skip the deletion below
			if (doc != null) {
				deleteContent(doc);
			}
			
			File documentFile = new File(storagedir + DOCUMENTS + filename);
			if (documentFile.exists()) {
				documentFile.delete();
			}
		} catch (Exception ex) {
			log.error(ex);
		}
	}
	
	/**
	 * Deletes the stored content for the given document.
	 * <p>
	 * The ".txt" file belonging to the content MD5 lists, one per line,
	 * the URLs that share this content. The URL of the document is removed
	 * from that list. If other URLs remain, the list is rewritten;
	 * otherwise the list and the ".zip" file holding the content are
	 * deleted.
	 * <p>
	 * Nothing is done if the document is null or has no content
	 * (null or zero length).
	 * 
	 * @param doc document whose content reference is to be removed
	 * @throws IOException if the URL list cannot be read or rewritten
	 */	
	private void deleteContent(HttpDoc doc) throws IOException {
		if (doc == null) {
			return;
		}
		byte[] content = doc.getContent();
		// NOTE(review): null content is treated like empty content here;
		// confirm against HttpDoc.getContent() if null has another meaning
		if (content == null || content.length == 0) {
			return;
		}
		String urlString = doc.getURL().toString();
		String md5 = doc.getContentMD5();
		File f = contentFile(md5, ".txt");
		
		// collect all URLs except the one that is being removed
		ArrayList entries = new ArrayList();
		if (f.exists()) {
			BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
			try {
				// readLine() returns null at end of file
				String line;
				while ((line = reader.readLine()) != null) {
					if (!line.equals(urlString)) {
						entries.add(line);
					}
				}
			} finally {
				reader.close();
			}
		}
		
		if (entries.size() > 0) {
			// other URLs still use this content: rewrite the list
			FileOutputStream os = new FileOutputStream(f.getPath(), false);
			try {
				for (Iterator iter = entries.iterator(); iter.hasNext();) {
					String line = (String) iter.next();
					os.write((line + LF).getBytes());
				}
			} finally {
				os.close();
			}
		} else {
			// last reference is gone: remove list and content
			f.delete();
			File fzip = contentFile(md5, ".zip");
			if (fzip.exists()) {
				fzip.delete();
			}
		}
	}
	
	/**
	 * Returns the collected URLs as text, one URL per line.
	 * @see java.lang.Object#toString()
	 */
	public String toString() {
		StringBuffer listing = new StringBuffer(1000);
		Iterator it = urls.iterator();
		while (it.hasNext()) {
			listing.append(it.next());
			listing.append("\n");
		}
		return listing.toString();
	}

	/**
	 * Builds a relative directory path from the leading characters of a
	 * file name: each of the first storageDirDepth characters (or fewer,
	 * if the name is shorter) becomes one directory level and is followed
	 * by the platform's file separator.
	 * 
	 * @param filename name whose leading characters are used
	 * @return the directory path, empty if no character is used
	 */
	private final String useFirstCharactersAsDirectories(String filename) {
		int levels = Math.min(storageDirDepth, filename.length());
		StringBuffer dirs = new StringBuffer(levels * 2);
		for (int pos = 0; pos < levels; pos++) {
			dirs.append(filename.charAt(pos));
			dirs.append(File.separatorChar);
		}
		return dirs.toString();
	}
	
	/**
	 * Makes sure that the directory for the given file exists below the
	 * storage directory, creating it (including parents) if necessary.
	 * The directory part of the file name is taken to be its first
	 * storageDirDepth*2 characters.
	 * 
	 * @param subdirectory subdirectory of the storage directory; a file
	 * separator is appended if it does not end with one
	 * @param filename file name starting with its directory part
	 */
	private final void checkStoragePathFor(String subdirectory, String filename) {
		String base = subdirectory.endsWith(File.separator)
			? subdirectory
			: subdirectory + File.separator;
		String head = filename.substring(0, storageDirDepth*2);
		File path = new File(storagedir + base + head);
		if (path.exists()) {
			return;
		}
		path.mkdirs();
	}
	
	/**
	 * Generates a valid file name for the given docURI.
	 * <p>
	 * If MD5 names are enabled, the hex MD5 of the URI is used; with a
	 * storage directory depth greater than 0 its leading characters are
	 * turned into directory levels. Otherwise the URI itself is used,
	 * with characters that are problematic in file names replaced by
	 * entity-like names.
	 * 
	 * @param docURI URI of the document
	 * @return String file name (possibly including a directory part)
	 */
	protected String generateFilename(String docURI) {
		if (useMD5) {
			String hex = new MD5(docURI).asHex();
			if (storageDirDepth <= 0) {
				return hex;
			}
			return useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth);
		}
		
		// reserved.charAt(k) is replaced by replacements[k]
		// NOTE(review): "&backslash" has no trailing ';' unlike the others;
		// kept as is because the text is part of the names of stored files
		final String reserved = "/\\:*?\"<>|";
		final String[] replacements = {
			"&slash;", "&backslash", "&colon;", "&asterisk;", "&question;",
			"&quot;", "&lt;", "&gt;", "&or;"
		};
		
		StringBuffer escaped = new StringBuffer(docURI.length());
		for (int pos = 0; pos < docURI.length(); pos++) {
			char c = docURI.charAt(pos);
			int index = reserved.indexOf(c);
			if (index < 0) {
				escaped.append(c);
			} else {
				escaped.append(replacements[index]);
			}
		}
		return escaped.toString();
	}

	/**
	 * Returns the file below the CONTENT directory that belongs to the
	 * given content hash. The leading characters of the hash are used as
	 * directory levels, the remaining characters plus the extension form
	 * the file name.
	 * 
	 * @param hex hex string identifying the content (callers in this
	 * class pass the content MD5)
	 * @param extension file extension including the dot, e.g. ".zip"
	 * @return the File for this content and extension
	 */
	protected File contentFile(String hex, String extension) {
		String directories = useFirstCharactersAsDirectories(hex);
		String remainder = hex.substring(storageDirDepth);
		String path = storagedir + CONTENT + directories + remainder + extension;
		return new File(path);
	}
	
	/**
	 * Closes the storage directory stream, if one is open.
	 * The reference is cleared even if closing fails, so a later call
	 * (e.g. from finalize()) does not try to close the same stream again.
	 * A failure to close is logged, not thrown.
	 * 
	 * @see net.matuschek.http.HttpDocManager#finish()
	 */
	public void finish() {
		if (storageDirectoryStream == null) {
			return;
		}
		try {
			storageDirectoryStream.close();
		} catch (IOException e) {
			// report through the class logger like the other methods do
			log.error(e);
		} finally {
			storageDirectoryStream = null;
		}
	}
	
	/**
	 * Calls finish and super.finalize().
	 * super.finalize() is called even if finish() throws.
	 * 
	 * @throws Throwable whatever finish() or super.finalize() throws
	 * @see java.lang.Object#finalize()
	 */
	protected void finalize() throws Throwable { 
		try {
			finish();
		} finally {
			super.finalize();
		}
	}
	
	/**
	 * Depth of the storage directory tree
	 * (depth = number of used subdirectory levels).
	 * The first storageDirDepth characters of a file name are used
	 * as directory names, one level per character.
	 * The initial value 0 means that no subdirectory levels are used.
	 */
	protected int storageDirDepth = 0;
	
	/**
	 * Sets the desired depth of the storage directory tree
	 * (depth = number of used subdirectory levels).
	 * 
	 * @param depth desired number of subdirectory levels
	 */
	public void setStorageDirDepth(int depth) {
		this.storageDirDepth = depth;
	}
	
	/**
	 * Returns the depth of the storage directory tree
	 * (depth = number of used subdirectory levels).
	 * 
	 * @return the current number of subdirectory levels
	 */
	public int getStorageDirDepth() {
		return this.storageDirDepth;
	}
	
	/**
	 * Determines the file extension for a content type.
	 * Only the part of the content type before the first ';' (parameters
	 * such as the charset are cut off) is passed to getDefaultExtension.
	 * <p>
	 * The result always starts with exactly one dot: a dot is prepended
	 * only if the default extension does not already start with one.
	 * 
	 * @param contenttype content type header value, may be null
	 * @return extension including the leading dot, or an empty string if
	 * the content type is null or has no default extension
	 */
	private String getExtensionFromContenttype(String contenttype) {
		if (contenttype == null) {
			return "";
		}
		
		int pos = contenttype.indexOf(';');
		String strContentType =
			(pos > 0) ? contenttype.substring(0, pos).trim() : contenttype.trim();
		
		String extension = getDefaultExtension(strContentType);
		if (extension == null) {
			return "";
		}
		// getDefaultExtension of this class already returns the dot
		// (".html", ".txt"); prepending another one would give "..html"
		if (extension.startsWith(".")) {
			return extension;
		}
		return "." + extension;
	}

	/**
	 * Gets the default extension for the given content type.
	 * The content type is matched case-insensitively, since media type
	 * names are not case-sensitive.
	 * 
	 * @param contentType content type, may be null
	 * @return ".html" for HTML, ".txt" for any other text type (both
	 * including the leading dot), or null if the content type is null or
	 * has no default extension
	 */
	protected String getDefaultExtension(String contentType) {
		if (contentType == null) {
			return null;
		}
		
		// fixed locale: the result must not depend on the default locale
		String type = contentType.toLowerCase(java.util.Locale.ENGLISH);
		if (type.indexOf("text/html") >= 0) {
			return ".html";
		}
		if (type.indexOf("text/") >= 0) {
			return ".txt";
		}
		return null;
	}
}
上一页 12
💿 文件大小 180 K
👤 上传用户 chayangccc
📂 所属分类多国语言处理
🏷️ 相关标签

#正 #家 #网络爬虫 #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -