📄 httpdoccache.java
字号:
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
//////////////////////////////////////////////////////////////////////////////
package net.matuschek.http;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;
import net.matuschek.util.MD5;
import org.apache.log4j.Category;
/**
* Full implementation of HttpDocManager interface.
* Caches documents, links and headers in ZIP-files.
* Documents with same content will be detected
* and share the same content-storage.
*
* @author Oliver Schmidt
* @version $Revision: 1.2 $
*/
public class HttpDocCache implements HttpDocManager {
/** internally used header name to mark duplicates; never written to the stored "header" entry */
protected final static String CONTENT_DUPLICATE = "Content-Duplicate";
/**
 * use MD5 encoding for filenames; while set, the storage directory
 * setup also opens the "directory.csv" index file
 */
public boolean useMD5 = true;
/** log4j logging instance */
protected static Category log =
Category.getInstance(HttpDocCache.class.getName());
/** collection of visited URLs */
private Collection urls = new LinkedList();
/** storage main directory; always ends with File.separator once set */
protected String storagedir;
/** file that holds directory information ("directory.csv"); stays null unless useMD5 is set */
protected File storageDirectoryFile = null;
/** subdirectory name for links (including the trailing File.separator) */
protected final static String LINKS = "links" + File.separator;
/** subdirectory name for content (including the trailing File.separator) */
protected final static String CONTENT = "content" + File.separator;
/** subdirectory name for document information (including the trailing File.separator) */
protected final static String DOCUMENTS = "documents" + File.separator;
/**
 * Creates a document cache rooted at the given directory.
 * All directory setup is delegated to the private storage directory setter.
 *
 * @param storageDirectory path of the main storage directory
 */
public HttpDocCache(String storageDirectory) {
    this.setStorageDir(storageDirectory);
}
/** append-mode stream on the directory file; remains null if useMD5 is off or the file could not be opened */
private FileOutputStream storageDirectoryStream = null;
/**
 * Sets the storage directory and creates the "documents" and "content"
 * subdirectories if they do not exist yet.
 * <p>
 * When {@link #useMD5} is set, the directory index file "directory.csv"
 * is opened in append mode and, if it is new or still empty, initialised
 * with the "Path,URL" header line. Failures while opening or initialising
 * the index are logged and otherwise ignored; the index stream then
 * stays unusable.
 *
 * @param newStoragedir path of the main storage directory; a trailing
 *        file separator is appended if missing
 */
private void setStorageDir(String newStoragedir) {
    storagedir = newStoragedir;
    if (!storagedir.endsWith(File.separator)) {
        storagedir = storagedir + File.separator;
    }
    // create the directories, if they do not exist yet.
    File storagedirFile = new File(storagedir + DOCUMENTS);
    if (!storagedirFile.exists()) {
        storagedirFile.mkdirs();
    }
    File contentFile = new File(storagedir + CONTENT);
    if (!contentFile.exists()) {
        contentFile.mkdirs();
    }
    if (useMD5) {
        storageDirectoryFile = new File(storagedir + "directory.csv");
        // Decide BEFORE opening the stream: opening in append mode creates
        // the file, so an exists() check afterwards can never detect a new
        // index. File.length() is 0 for a missing file as well as an empty one.
        boolean needsHeader = storageDirectoryFile.length() == 0;
        try {
            storageDirectoryStream = new FileOutputStream(storageDirectoryFile.getPath(), true);
            if (needsHeader) {
                storageDirectoryStream.write(("Path,URL" + LF).getBytes());
            }
        } catch (Exception e) {
            // best effort: the cache keeps working without the index
            log.error(e.getMessage(), e);
        }
    }
}
/** double-quote character wrapped around each field of a directory entry */
final static String QUOTE = "\"";
/** platform line separator; terminates directory entries and separates stored header lines */
final static String LF = System.getProperty("line.separator");
/**
 * Stores the document in the storage directory.
 * <p>
 * The document's headers, URL and (if present) links are written to a
 * ZIP file below the "documents" subdirectory. The content itself is
 * handed to {@link #storeContent(HttpDoc)} and not written into that ZIP
 * file. Documents that are flagged as cached are not stored again.
 * <p>
 * Errors raised while the individual parts are written are logged and
 * otherwise ignored (best effort); only I/O errors from opening or
 * closing the ZIP file are reported to the caller.
 *
 * @param doc the document to be stored
 * @throws DocManagerException if the ZIP file cannot be opened or closed
 */
public void storeDocument(HttpDoc doc) throws DocManagerException {
    List links = doc.getLinks();
    // don't store cached documents
    if (doc.isCached()) {
        return;
    }
    String filename = generateFilename(doc.getURL().toExternalForm());
    String filepath = storagedir + DOCUMENTS + filename;
    checkStoragePathFor(DOCUMENTS, filename);
    try {
        File f = new File(filepath + ".zip");
        if (!f.exists()) {
            // first time this document is stored: register it in the directory index
            writeDirectoryInfo(doc, filename);
        }
        // write it to the file
        OutputStream fs = new BufferedOutputStream(new FileOutputStream(f));
        ZipOutputStream zos = new ZipOutputStream(fs);
        zos.setLevel(9);
        try {
            storeContent(doc);
            writeHeadersToZipFile(doc, zos);
            writeUrlToZipFile(doc, zos);
            if (links != null) {
                writeLinksToZipFile(links, zos);
            }
        } catch (Throwable e) {
            // best effort: keep whatever could be written, but report the
            // problem through the logger instead of stdout
            log.error("Could not completely store document " + doc.getURL(), e);
        } finally {
            try {
                zos.close();
            } finally {
                // make sure the file handle is released even if closing the ZIP stream fails
                fs.close();
            }
            long date = doc.getDateAsMilliSeconds();
            f.setLastModified(date > 0 ? date : System.currentTimeMillis());
        }
    } catch (IOException ioex) {
        DocManagerException ex = new DocManagerException(ioex.getMessage());
        throw ex;
    }
}
/**
 * Appends one entry of the form {@code "filename","url"} to the
 * directory index file.
 * <p>
 * Does nothing when no directory index is in use or when its stream
 * could not be opened. If writing the entry fails, the problem is
 * logged and the index stream is closed.
 *
 * @param doc document whose URL is recorded
 * @param filename name of the document in the cache
 * @throws IOException if closing the index stream after a failed write fails
 */
protected void writeDirectoryInfo(HttpDoc doc, String filename)
        throws IOException {
    // The stream is null when the index is disabled or could not be opened;
    // without this guard the resulting NullPointerException would be
    // re-raised by the close() call in the error handler below.
    if (storageDirectoryFile == null || storageDirectoryStream == null) {
        return;
    }
    synchronized (storageDirectoryFile) {
        try {
            String directoryInfo = QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF;
            storageDirectoryStream.write(directoryInfo.getBytes());
        } catch (Exception e) {
            log.warn(e.getMessage(), e);
            storageDirectoryStream.close();
        }
    }
}
/**
 * Writes the document content as a single entry to the ZIP stream.
 * <p>
 * The entry is named "content" followed by the extension derived from
 * the document's Content-Type header. Its timestamp is the document's
 * last-modified time; if that is negative the document date is used,
 * and if no positive timestamp is available the current time is used,
 * as the other entry writers of this class do.
 *
 * @param doc document whose content is written
 * @param zos ZIP stream to write to
 * @throws IOException if writing to the ZIP stream fails
 */
protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream zos)
        throws IOException {
    String contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE);
    String extension = getExtensionFromContenttype(contenttype);
    ZipEntry zipEntry = new ZipEntry("content" + extension);
    long date = doc.getLastModifiedAsMilliSeconds();
    if (date < 0) {
        date = doc.getDateAsMilliSeconds();
    }
    if (date <= 0) {
        // neither header yielded a usable timestamp
        date = System.currentTimeMillis();
    }
    zipEntry.setTime(date);
    zos.putNextEntry(zipEntry);
    zos.write(doc.getContent());
    zos.closeEntry();
}
/**
 * Writes the document's HTTP headers as the entry "header" to the ZIP
 * stream, one header per line.
 * <p>
 * The internal {@link #CONTENT_DUPLICATE} marker header is left out.
 * The entry's timestamp is the document date, or the current time if
 * the document has no positive date.
 *
 * @param doc document whose headers are written
 * @param zos ZIP stream to write to
 * @return the ZIP entry that was written
 * @throws IOException if writing to the ZIP stream fails
 */
protected ZipEntry writeHeadersToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    StringBuffer comment = new StringBuffer();
    Vector headers = doc.getHttpHeader();
    for (Iterator iter = headers.iterator(); iter.hasNext();) {
        HttpHeader header = (HttpHeader) iter.next();
        if (!header.getName().equals(CONTENT_DUPLICATE)) {
            comment.append(header.toString());
            if (iter.hasNext()) {
                comment.append(LF);
            }
        }
    }
    ZipEntry ze = new ZipEntry("header");
    // The timestamp has to be set before putNextEntry(), which starts
    // writing the entry; a later change is not part of what was already written.
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.putNextEntry(ze);
    zos.write(comment.toString().getBytes());
    zos.closeEntry();
    return ze;
}
/**
 * Reads the entry "header" from the ZIP file and adds every line of the
 * form {@code name: value} as a header to the document. Lines without
 * the {@code ": "} separator are skipped.
 *
 * @param doc document that receives the headers
 * @param zf ZIP file to read from
 * @return true if the ZIP file contains a "header" entry, false otherwise
 * @throws IOException if reading the entry fails
 */
protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("header");
    if (ze == null) {
        return false;
    }
    InputStream is = zf.getInputStream(ze);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    try {
        // readLine() returning null is the reliable end-of-stream signal;
        // ready() only tells whether a read would block.
        String line;
        while ((line = reader.readLine()) != null) {
            int pos = line.indexOf(": ");
            if (pos >= 0) {
                String name = line.substring(0, pos);
                String value = line.substring(pos + 2);
                HttpHeader header = new HttpHeader(name, value);
                doc.addHeader(header);
            }
        }
    } finally {
        reader.close();
    }
    return true;
}
/**
 * Reads the entry "links" from the ZIP file and stores every line as a
 * URL in the document's link list.
 * <p>
 * The document's link list is emptied first (or created if the document
 * has none), also when the ZIP file has no "links" entry.
 *
 * @param doc document that receives the links
 * @param zf ZIP file to read from
 * @return true if the ZIP file contains a "links" entry, false otherwise
 * @throws IOException if reading the entry fails or a line is not a
 *         well-formed URL
 */
protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("links");
    List links = doc.getLinks();
    if (links == null) {
        links = new Vector();
        doc.setLinks(links);
    } else {
        links.clear();
    }
    if (ze == null) {
        return false;
    }
    InputStream is = zf.getInputStream(ze);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    try {
        // read until end of stream; the reader is closed even if a line
        // turns out not to be a valid URL
        String line;
        while ((line = reader.readLine()) != null) {
            URL url = new URL(line);
            links.add(url);
        }
    } finally {
        reader.close();
    }
    return true;
}
/**
 * Writes the document's URL as the entry "url" to the ZIP stream.
 * The entry's timestamp is the document date, or the current time if
 * the document has no positive date.
 *
 * @param doc document whose URL is written
 * @param zos ZIP stream to write to
 * @return the ZIP entry that was written
 * @throws IOException if writing to the ZIP stream fails
 */
protected ZipEntry writeUrlToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    String url = doc.getURL().toString();
    ZipEntry ze = new ZipEntry("url");
    // The timestamp has to be set before putNextEntry(), which starts
    // writing the entry; a later change is not part of what was already written.
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.putNextEntry(ze);
    zos.write(url.getBytes());
    zos.closeEntry();
    return ze;
}
/**
 * Determines the ".txt" file that belongs to the document's content,
 * addressed by the MD5 of that content.
 *
 * @param doc document whose content is looked up
 * @return the file, or null if the document content is empty
 */
private File getContentUsersFile(HttpDoc doc) {
    // empty content has no users file
    if (doc.getContent().length == 0) {
        return null;
    }
    String contentMd5 = doc.getContentMD5();
    return contentFile(contentMd5, ".txt");
}
/**
 * Returns the URL string of a document with the same content, if one
 * is recorded ahead of this document.
 * <p>
 * The content users file of the document is consulted: if its first
 * line is a URL other than the document's own, that URL is returned.
 * Only the first line can influence the result, so the rest of the
 * file is not read.
 *
 * @param doc document to look up
 * @return URL string of the duplicate, or null if the content is empty,
 *         no users file exists, the file is empty, or its first line
 *         is the document's own URL
 * @throws IOException if the users file cannot be read
 * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
 */
public String findDuplicate(HttpDoc doc) throws IOException {
    File f = getContentUsersFile(doc);
    if (f == null) {
        return null;
    }
    String urlString = doc.getURL().toString();
    if (!f.exists()) {
        return null;
    }
    String duplicate = null;
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
    try {
        // readLine() yields null for an empty file, so no ready() polling is needed
        String first = reader.readLine();
        if (first != null && !first.equals(urlString)) {
            duplicate = first;
        }
    } finally {
        reader.close();
    }
    return duplicate;
}
/**
* Creates a file with a name created by the content, containing the URL.
* @param doc
*/
protected void storeContent(HttpDoc doc) throws IOException {
if (doc.getContent().length == 0)
return;
File f = getContentUsersFile(doc);
String urlString = doc.getURL().toString();
String md5 = doc.getContentMD5();
// is content user?
boolean found = false;
if (f.exists()) {
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
try {
while (reader.ready()) {
String line = reader.readLine();
if (line.equals(urlString)) {
found = true; break;
}
}
} finally {
reader.close();
}
}
// write content
File fzip = contentFile(md5, ".zip");
if (!fzip.exists()) {
checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5));
OutputStream fs = new BufferedOutputStream(new FileOutputStream(fzip));
ZipOutputStream zos = null;
try {
zos = new ZipOutputStream(fs);
zos.setLevel(9);
writeContentToZipFile(doc, zos);
} finally {
if (zos != null) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -