⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gzippedinputstream.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* GzippedInputStream** $Id: GzippedInputStream.java 4995 2007-03-12 23:48:36Z stack-sf $** Created on July 5, 2004** Copyright (C) 2004 Internet Archive.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.ByteArrayOutputStream;import java.io.EOFException;import java.io.IOException;import java.io.InputStream;import java.util.Iterator;import java.util.logging.Logger;import java.util.zip.Deflater;import java.util.zip.GZIPInputStream;import java.util.zip.GZIPOutputStream;import java.util.zip.Inflater;import java.util.zip.ZipException;/** * Subclass of GZIPInputStream that can handle a stream made of multiple * concatenated GZIP members/records. *  * This class is needed because GZIPInputStream only finds the first GZIP * member in the file even if the file is made up of multiple GZIP members. *  * <p>Takes an InputStream stream that implements * {@link RepositionableStream} interface so it can backup over-reads done * by the zlib Inflater class. *  * <p>Use the {@link #iterator()} method to get a gzip member iterator. * Calls to {@link Iterator#next()} returns the next gzip member in the * stream.  Cast return from {@link Iterator#next()} to InputStream. 
*  * <p>Use {@link #gzipMemberSeek(long)} to position stream before reading * a gzip member if doing random accessing of gzip members.  Pass it offset * at which gzip member starts. *  * <p>If you need to know position at which a gzip member starts, call * {@link #position()} just after a call to {@link Iterator#hasNext()} * and before you call {@link Iterator#next()}. *  * @author stack */public class GzippedInputStreamextends GZIPInputStreamimplements RepositionableStream {    /**     * Tail on gzip members (The CRC).     */    private static final int GZIP_TRAILER_LENGTH = 8;        /**     * Utility class used probing for gzip members in stream.     * We need this instance to get at the readByte method.     */    private final GzipHeader gzipHeader = new GzipHeader();        /**     * Buffer size used skipping over gzip members.     */    private static final int LINUX_PAGE_SIZE = 4 * 1024;        private final long initialOffset;        public GzippedInputStream(InputStream is) throws IOException {        // Have buffer match linux page size.        this(is, LINUX_PAGE_SIZE);    }        /**     * @param is An InputStream that implements RespositionableStream and     * returns <code>true</code> when we call     * {@link InputStream#markSupported()} (Latter is needed so can setup     * an {@link Iterator} against the Gzip stream).     * @param size Size of blocks to use reading.     * @throws IOException     */    public GzippedInputStream(final InputStream is, final int size)    throws IOException {        super(checkStream(is), size);        if (!is.markSupported()) {        	throw new IllegalArgumentException("GzippedInputStream requires " +        		"a markable stream");        }        if (!(is instanceof RepositionableStream)) {        	throw new IllegalArgumentException("GzippedInputStream requires " +    		"a stream that implements RepositionableStream");        }        // We need to calculate the absolute offset of the current        // GZIP Member.  
Its almost always going to be zero but not        // always (We may have been passed a stream that is already part        // ways through a stream of GZIP Members).  So, getting        // absolute offset is not exactly straight-forward. The super        // class, GZIPInputStream on construction reads in the GZIP Header        // which is a pain because I then do not know the absolute offset        // at which the GZIP record began.  So, the call above to checkStream()        // marked the stream before passing it to the super calls.  Then        // below we get current postion at just past the GZIP Header, call        // reset so we go back to the absolute start of the GZIP Member in        // the file, record the offset for later should we need to start        // over again in this file -- i.e. we're asked to get an iterator        // from Record zero on -- then we move the file position to just        // after the GZIP Header again so we're again aligned for inflation        // of the current record.        long afterGZIPHeader = ((RepositionableStream)is).position();        is.reset();        this.initialOffset = ((RepositionableStream)is).position();        ((RepositionableStream)is).position(afterGZIPHeader);    }        protected static InputStream checkStream(final InputStream is)    throws IOException {        if (is instanceof RepositionableStream) {        	// See note above in constructor on why the mark here.        	// Also minimal gzip header is 10.  IA GZIP Headers are 20 bytes.        	// Multiply by 4 in case extra info in the header.        	is.mark(GzipHeader.MINIMAL_GZIP_HEADER_LENGTH * 4);        	return is;        }        throw new IOException("Passed stream does not" +            " implement PositionableStream");    }        /**     * Exhaust current GZIP member content.     * Call this method when you think you're on the end of the     * GZIP member.  It will clean out any dross.     
* @param ignore Character to ignore counting characters (Usually     * trailing new lines).     * @return Count of characters skipped over.     * @throws IOException     */    public long gotoEOR(int ignore) throws IOException {        long bytesSkipped = 0;        if (this.inf.getTotalIn() <= 0) {            return bytesSkipped;        }        if (!this.inf.finished()) {            int read = 0;            while ((read = read()) != -1) {                if ((byte)read == (byte)ignore) {                    continue;                }                bytesSkipped = gotoEOR() + 1;                break;            }        }        return bytesSkipped;    }        /**     * Exhaust current GZIP member content.     * Call this method when you think you're on the end of the     * GZIP member.  It will clean out any dross.     * @return Count of characters skipped over.     * @throws IOException     */    public long gotoEOR() throws IOException {        long bytesSkipped = 0;        if (this.inf.getTotalIn() <= 0) {            return bytesSkipped;        }        while(!this.inf.finished()) {            bytesSkipped += skip(Long.MAX_VALUE);        }        return bytesSkipped;    }        /**     * Returns a GZIP Member Iterator.     * Has limitations. Can only get one Iterator per instance of this class;     * you must get new instance if you want to get Iterator again.     * @return Iterator over GZIP Members.     */    public Iterator iterator() {        final Logger logger = Logger.getLogger(this.getClass().getName());                try {            // We know its a RepositionableStream else we'd have failed        	// construction.  On iterator construction, set file back to        	// initial position so we're ready to read GZIP Members        	// (May not always work dependent on how the        	// RepositionableStream was implemented).            
((RepositionableStream)this.in).position(this.initialOffset);        } catch (IOException e) {            throw new RuntimeException(e);        }        return new Iterator() {            private GzippedInputStream compressedStream =                GzippedInputStream.this;                        public boolean hasNext() {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -