⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 archivereaderfactory.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
字号:
/* $Id: ArchiveReaderFactory.java 4977 2007-03-09 23:57:28Z stack-sf $ * * Created on August 18th, 2006 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.File;import java.io.IOException;import java.io.InputStream;import java.net.HttpURLConnection;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import org.archive.io.arc.ARCReaderFactory;import org.archive.io.warc.WARCReaderFactory;import org.archive.net.UURI;import org.archive.net.md5.Md5URLConnection;import org.archive.net.rsync.RsyncURLConnection;import org.archive.util.FileUtils;import org.archive.util.IoUtils;/** * Factory that returns an Archive file Reader. * Returns Readers for ARCs or WARCs. * @author stack * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $ */public class ArchiveReaderFactory implements ArchiveFileConstants {	/**	 * Offset value for when we want to stream all.	 */	private final static int STREAM_ALL = -1;		private static final ArchiveReaderFactory factory =		new ArchiveReaderFactory();	    /**     * Shutdown any public access to default constructor.     
*/    protected ArchiveReaderFactory() {        super();    }        /**     * Get an Archive file Reader on passed path or url.     * Does primitive heuristic figuring if path or URL.     * @param arcFileOrUrl File path or URL pointing at an Archive file.     * @return An Archive file Reader.     * @throws IOException      * @throws MalformedURLException      * @throws IOException      */    public static ArchiveReader get(final String arcFileOrUrl)    throws MalformedURLException, IOException {    	return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);    }        protected ArchiveReader getArchiveReader(final String arcFileOrUrl)    throws MalformedURLException, IOException {    	return getArchiveReader(arcFileOrUrl, STREAM_ALL);    }        protected ArchiveReader getArchiveReader(final String arcFileOrUrl,    	final long offset)    throws MalformedURLException, IOException {    	return UURI.hasScheme(arcFileOrUrl)?    		get(new URL(arcFileOrUrl), offset):    			get(new File(arcFileOrUrl), offset);    }        /**     * @param f An Archive file to read.     * @return An ArchiveReader     * @throws IOException      */    public static ArchiveReader get(final File f) throws IOException {    	return ArchiveReaderFactory.factory.getArchiveReader(f);    }        protected ArchiveReader getArchiveReader(final File f)    throws IOException {    	return getArchiveReader(f, 0);    }        /**     * @param f An Archive file to read.     * @param offset Have returned Reader set to start reading at this offset.     
* @return An ArchiveReader     * @throws IOException      */    public static ArchiveReader get(final File f, final long offset)    throws IOException {    	return ArchiveReaderFactory.factory.getArchiveReader(f, offset);	}        protected ArchiveReader getArchiveReader(final File f,    	final long offset)    throws IOException {    	if (ARCReaderFactory.isARCSuffix(f.getName())) {    		return ARCReaderFactory.get(f, true, offset);    	} else if (WARCReaderFactory.isWARCSuffix(f.getName())) {    		return WARCReaderFactory.get(f, offset);    	}    	throw new IOException("Unknown file extension (Not ARC nor WARC): "    		+ f.getName());    }        /**     * Wrap a Reader around passed Stream.     * @param s Identifying String for this Stream used in error messages.     * Must be a string that ends with the name of the file we're to put     * an ArchiveReader on.  This code looks at file endings to figure     * whether to return an ARC or WARC reader.     * @param is Stream.  Stream will be wrapped with implementation of     * RepositionableStream unless already supported.     * @param atFirstRecord Are we at first Record?     * @return ArchiveReader.     * @throws IOException     */    public static ArchiveReader get(final String s, final InputStream is,        final boolean atFirstRecord)    throws IOException {        return ArchiveReaderFactory.factory.getArchiveReader(s, is,        	atFirstRecord);    }        /**     * @param is     * @return If passed <code>is</code> is     * {@link RepositionableInputStream}, returns <code>is</code>, else we     * wrap <code>is</code> with {@link RepositionableStream}.     */    protected InputStream asRepositionable(final InputStream is) {        if (is instanceof RepositionableStream) {            return is;        }        // RepositionableInputStream calls mark on each read so can back up at        // least the read amount.  Needed for gzip inflater overinflations        // reading into the next gzip member.        
return new RepositionableInputStream(is, 16 * 1024);    }        protected ArchiveReader getArchiveReader(final String id,     		final InputStream is, final boolean atFirstRecord)    throws IOException {    	final InputStream stream = asRepositionable(is);        if (ARCReaderFactory.isARCSuffix(id)) {            return ARCReaderFactory.get(id, stream, atFirstRecord);        } else if (WARCReaderFactory.isWARCSuffix(id)) {            return WARCReaderFactory.get(id, stream, atFirstRecord);        }        throw new IOException("Unknown extension (Not ARC nor WARC): " + id);    }        /**     * Get an Archive Reader aligned at <code>offset</code>.     * This version of get will not bring the file local but will try to     * stream across the net making an HTTP 1.1 Range request on remote     * http server (RFC1435 Section 14.35).     * @param u HTTP URL for an Archive file.     * @param offset Offset into file at which to start fetching.     * @return An ArchiveReader aligned at offset.     * @throws IOException     */    public static ArchiveReader get(final URL u, final long offset)    throws IOException {    	return ArchiveReaderFactory.factory.getArchiveReader(u, offset);    }        protected ArchiveReader getArchiveReader(final URL f, final long offset)    throws IOException {        // Get URL connection.        URLConnection connection = f.openConnection();        if (!(connection instanceof HttpURLConnection)) {            throw new IOException("This method only handles HTTP connections.");        }        addUserAgent((HttpURLConnection)connection);        if (offset != STREAM_ALL) {        	// Use a Range request (Assumes HTTP 1.1 on other end). If        	// length >= 0, add open-ended range header to the request.  Else,        	// because end-byte is inclusive, subtract 1.        	
connection.addRequestProperty("Range", "bytes=" + offset + "-");        }                return getArchiveReader(f.toString(), connection.getInputStream(),            (offset == 0));    }        /**     * Get an ARCReader.     * Pulls the ARC local into whereever the System Property     * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that     * points at this local copy.  A close on this ARCReader instance will     * remove the local copy.     * @param u An URL that points at an ARC.     * @return An ARCReader.     * @throws IOException      */    public static ArchiveReader get(final URL u)    throws IOException {    	return ArchiveReaderFactory.factory.getArchiveReader(u);    }        protected ArchiveReader getArchiveReader(final URL u)    throws IOException {        // If url represents a local file then return file it points to.        if (u.getPath() != null) {            // TODO: Add scheme check and host check.            File f = new File(u.getPath());            if (f.exists()) {                return get(f, 0);            }        }               String scheme = u.getProtocol();        if (scheme.startsWith("http") || scheme.equals("s3")) {            // Try streaming if http or s3 URLs rather than copying local        	// and then reading (Passing an offset will get us an Reader        	// that wraps a Stream).            return get(u, STREAM_ALL);        }                return makeARCLocal(u.openConnection());    }        protected ArchiveReader makeARCLocal(final URLConnection connection)    throws IOException {        File localFile = null;        if (connection instanceof HttpURLConnection) {            // If http url connection, bring down the resource local.            String p = connection.getURL().getPath();            int index = p.lastIndexOf('/');            if (index >= 0) {                // Name file for the file we're making local.                
localFile = new File(FileUtils.TMPDIR, p.substring(index + 1));                if (localFile.exists()) {                    // If file of same name already exists in TMPDIR, then                    // clean it up (Assuming only reason a file of same name in                    // TMPDIR is because we failed a previous download).                    localFile.delete();                }            } else {                localFile = File.createTempFile(ArchiveReader.class.getName(),                    ".tmp", FileUtils.TMPDIR);            }            addUserAgent((HttpURLConnection)connection);            connection.connect();            try {                IoUtils.readFullyToFile(connection.getInputStream(), localFile,                    new byte[16 * 1024]);            } catch (IOException ioe) {                localFile.delete();                throw ioe;            }        } else if (connection instanceof RsyncURLConnection) {            // Then, connect and this will create a local file.            // See implementation of the rsync handler.            connection.connect();            localFile = ((RsyncURLConnection)connection).getFile();        } else if (connection instanceof Md5URLConnection) {            // Then, connect and this will create a local file.            // See implementation of the md5 handler.            connection.connect();            localFile = ((Md5URLConnection)connection).getFile();        } else {            throw new UnsupportedOperationException("No support for " +                connection);        }                ArchiveReader reader = null;        try {            reader = get(localFile, 0);        } catch (IOException e) {            localFile.delete();            throw e;        }                // Return a delegate that does cleanup of downloaded file on close.        
return reader.getDeleteFileOnCloseReader(localFile);    }        protected void addUserAgent(final HttpURLConnection connection) {        connection.addRequestProperty("User-Agent", this.getClass().getName());    }        /**     * @param f File to test.     * @return True if <code>f</code> is compressed.     * @throws IOException     */    protected boolean isCompressed(final File f) throws IOException {        return f.getName().toLowerCase().        	endsWith(DOT_COMPRESSED_FILE_EXTENSION);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -