⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 warcreaderfactory.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
字号:
/* $Id: WARCReaderFactory.java 4533 2006-08-24 00:59:04Z stack-sf $ * * Created Aug 22, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io.warc.v10;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.util.Iterator;import org.archive.io.ArchiveReader;import org.archive.io.ArchiveReaderFactory;import org.archive.io.ArchiveRecord;import org.archive.io.GzippedInputStream;import org.archive.io.warc.WARCConstants;import org.archive.util.FileUtils;import org.archive.net.UURI;/** * Factory for WARC Readers. * Figures whether to give out a compressed file Reader or an uncompressed * Reader. * @author stack * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$ */public class WARCReaderFactory extends ArchiveReaderFactoryimplements WARCConstants {    private static final WARCReaderFactory factory = new WARCReaderFactory();    /**     * Shutdown any access to default constructor.     * This factory is Singleton.     
*/    private WARCReaderFactory() {        super();    }        public static WARCReader get(String arcFileOrUrl)    throws MalformedURLException, IOException {    	return (WARCReader)WARCReaderFactory.factory.    		getArchiveReader(arcFileOrUrl);    }        public static WARCReader get(final File f) throws IOException {    	return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f);    }        /**     * @param f An arcfile to read.     * @param offset Have returned Reader set to start reading at this offset.     * @return A WARCReader.     * @throws IOException      */    public static WARCReader get(final File f, final long offset)    throws IOException {    	return (WARCReader)WARCReaderFactory.factory.    		getArchiveReader(f, offset);    }    protected ArchiveReader getArchiveReader(final String arcFileOrUrl,        final long offset)     throws MalformedURLException, IOException {        return UURI.hasScheme(arcFileOrUrl)?            get(new URL(arcFileOrUrl), offset):            get(new File(arcFileOrUrl), offset);    }         protected ArchiveReader getArchiveReader(final File f, final long offset)    throws IOException {		boolean compressed = testCompressedWARCFile(f);		if (!compressed) {			if (!FileUtils.isReadableWithExtensionAndMagic(f,					DOT_WARC_FILE_EXTENSION, WARC_010_MAGIC)) {				throw new IOException(f.getAbsolutePath()						+ " is not a WARC file.");			}		}		return (WARCReader)(compressed?			WARCReaderFactory.factory.new CompressedWARCReader(f, offset):			WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));	}        public static ArchiveReader get(final String s, final InputStream is,            final boolean atFirstRecord)    throws IOException {        return WARCReaderFactory.factory.getArchiveReader(s, is,            atFirstRecord);    }        protected ArchiveReader getArchiveReader(final String f,			final InputStream is, final boolean atFirstRecord)			throws IOException {		// For now, assume stream is compressed. 
Later add test of input		// stream or handle exception thrown when figure not compressed stream.		return new CompressedWARCReader(f, is, atFirstRecord);	}        public static WARCReader get(final URL arcUrl, final long offset)    throws IOException {        return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl,            offset);    }        /**     * Get an ARCReader.     * Pulls the ARC local into whereever the System Property     * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that     * points at this local copy.  A close on this ARCReader instance will     * remove the local copy.     * @param arcUrl An URL that points at an ARC.     * @return An ARCReader.     * @throws IOException      */    public static WARCReader get(final URL arcUrl)    throws IOException {        return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl);    }        /**     * Check file is compressed WARC.     *     * @param f File to test.     *     * @return True if this is compressed WARC (TODO: Just tests if file is     * GZIP'd file (It begins w/ GZIP MAGIC)).     *     * @exception IOException If file does not exist or is not unreadable.     */    public static boolean testCompressedWARCFile(final File f)    throws IOException {        FileUtils.isReadable(f);        boolean compressed = false;        final InputStream is = new FileInputStream(f);        try {            compressed = GzippedInputStream.isCompressedStream(is);        } finally {            is.close();        }        return compressed;    }    /**     * Uncompressed WARC file reader.     * @author stack     */    private class UncompressedWARCReader extends WARCReader {        /**         * Constructor.         * @param f Uncompressed arcfile to read.         * @throws IOException         */        public UncompressedWARCReader(final File f)        throws IOException {            this(f, 0);        }        /**         * Constructor.         
*          * @param f Uncompressed file to read.         * @param offset Offset at which to position Reader.         * @throws IOException         */        public UncompressedWARCReader(final File f, final long offset)        throws IOException {            // File has been tested for existence by time it has come to here.            setIn(getInputStream(f, offset));            initialize(f.getAbsolutePath());        }                /**         * Constructor.         *          * @param f Uncompressed file to read.         * @param is InputStream.         */        public UncompressedWARCReader(final String f, final InputStream is) {            // Arc file has been tested for existence by time it has come            // to here.            setIn(is);            initialize(f);        }    }        /**     * Compressed WARC file reader.     *      * @author stack     */    private class CompressedWARCReader extends WARCReader {        /**         * Constructor.         *          * @param f Compressed file to read.         * @throws IOException         */        public CompressedWARCReader(final File f) throws IOException {            this(f, 0);        }        /**         * Constructor.         *          * @param f Compressed arcfile to read.         * @param offset Position at where to start reading file.         * @throws IOException         */        public CompressedWARCReader(final File f, final long offset)                throws IOException {            // File has been tested for existence by time it has come to here.            setIn(new GzippedInputStream(getInputStream(f, offset)));            setCompressed((offset == 0));            initialize(f.getAbsolutePath());        }                /**         * Constructor.         *          * @param f Compressed arcfile.         * @param is InputStream to use.         
* @param atFirstRecord         * @throws IOException         */        public CompressedWARCReader(final String f, final InputStream is,            final boolean atFirstRecord)        throws IOException {            // Arc file has been tested for existence by time it has come            // to here.            setIn(new GzippedInputStream(is));            setCompressed(true);            initialize(f);            // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.        }                /**         * Get record at passed <code>offset</code>.         *          * @param offset Byte index into file at which a record starts.         * @return A WARCRecord reference.         * @throws IOException         */        public WARCRecord get(long offset) throws IOException {            cleanupCurrentRecord();            ((GzippedInputStream)getIn()).gzipMemberSeek(offset);            return (WARCRecord) createArchiveRecord(getIn(), offset);        }                public Iterator<ArchiveRecord> iterator() {            /**             * Override ArchiveRecordIterator so can base returned iterator on             * GzippedInputStream iterator.             */            return new ArchiveRecordIterator() {                private GzippedInputStream gis =                    (GzippedInputStream)getInputStream();                private Iterator gzipIterator = this.gis.iterator();                protected boolean innerHasNext() {                    return this.gzipIterator.hasNext();                }                protected ArchiveRecord innerNext() throws IOException {                    // Get the positoin before gzipIterator.next moves                    // it on past the gzip header.                    
long p = this.gis.position();                    InputStream is = (InputStream) this.gzipIterator.next();                    return createArchiveRecord(is, p);                }            };        }                protected void gotoEOR(ArchiveRecord rec) throws IOException {        	// TODO        }    }        public static boolean isWARCSuffix(final String f) {    	return (f == null)?    		false:    		(f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?    		    true:    			(f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?    			true: false;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -