⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gzippedinputstream.java

📁 最强的爬虫工程
💻 JAVA
字号:
/* GzippedInputStream** $Id: GzippedInputStream.java,v 1.19 2006/08/23 18:39:36 stack-sf Exp $** Created on July 5, 2004** Copyright (C) 2004 Internet Archive.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.InputStream;import java.util.Iterator;import java.util.zip.Deflater;import java.util.zip.GZIPInputStream;import java.util.zip.GZIPOutputStream;import java.util.zip.Inflater;/** * Subclass of GZIPInputStream that can handle a stream made of multiple * concatenated GZIP members/records. *  * This class is needed because GZIPInputStream only finds the first GZIP * member in the file even if the file is made up of multiple GZIP members. *  * <p>Takes an InputStream stream that implements * {@link RepositionableStream} interface so it can backup over-reads done * by the zlib Inflater class. *  * <p>Use the {@link #iterator()} method to get a gzip member iterator. * Calls to {@link Iterator#next()} returns the next gzip member in the * stream.  Cast return from {@link Iterator#next()} to InputStream. *  * <p>Use {@link #gzipMemberSeek(long)} to position stream before reading * a gzip member if doing random accessing of gzip members.  Pass it offset * at which gzip member starts. *  * <p>If you need to know position at which a gzip member starts, call * {@link #position()} just after a call to {@link Iterator#hasNext()} * and before you call {@link Iterator#next()}. *  * @author stack */public class GzippedInputStreamextends GZIPInputStreamimplements RepositionableStream {    /**     * Tail on gzip members (The CRC).     */    private static final int GZIP_TRAILER_LENGTH = 8;        /**     * Utility class used probing for gzip members in stream.     * We need this instance to get at the readByte method.     */    private final GzipHeader gzipHeader = new GzipHeader();        /**     * Buffer size used skipping over gzip members.     */    private static final int LINUX_PAGE_SIZE = 4 * 1024;            public GzippedInputStream(InputStream is) throws IOException {        // Have buffer match linux page size.        this(is, LINUX_PAGE_SIZE);    }        /**     * @param is An InputStream that implements RespositionableStream and     * returns <code>true</code> when we call     * {@link InputStream#markSupported()} (Latter is needed so can setup     * an {@link Iterator} against the Gzip stream).     * @param size Size of blocks to use reading.     * @throws IOException     */    public GzippedInputStream(final InputStream is, final int size)    throws IOException {        super(checkStream(is), size);    }        protected static InputStream checkStream(final InputStream is)    throws IOException {        if (is instanceof RepositionableStream) {            // Mark the stream.  The constructor will do a reset to move us            // back to the start of the first gzip member.            is.mark(10 * GzipHeader.MINIMAL_GZIP_HEADER_LENGTH);            return is;        }        throw new IOException("Passed stream does not" +            " implement PositionableStream");    }        /**     * Exhaust current GZIP member content.     * Call this method when you think you're on the end of the     * GZIP member.  It will clean out any dross.     * @param ignore Character to ignore counting characters (Usually     * trailing new lines).     * @return Count of characters skipped over.     * @throws IOException     */    public long gotoEOR(int ignore) throws IOException {        long bytesSkipped = 0;        if (this.inf.getTotalIn() <= 0) {            return bytesSkipped;        }        if (!this.inf.finished()) {            int read = 0;            while ((read = read()) != -1) {                if ((byte)read == (byte)ignore) {                    continue;                }                bytesSkipped = gotoEOR() + 1;                break;            }        }        return bytesSkipped;    }        /**     * Exhaust current GZIP member content.     * Call this method when you think you're on the end of the     * GZIP member.  It will clean out any dross.     * @return Count of characters skipped over.     * @throws IOException     */    public long gotoEOR() throws IOException {        long bytesSkipped = 0;        if (this.inf.getTotalIn() <= 0) {            return bytesSkipped;        }        while(!this.inf.finished()) {            bytesSkipped += skip(Long.MAX_VALUE);        }        return bytesSkipped;    }        /**     * Returns a GZIP Member Iterator.     * Has limitations. Can only get one Iterator per instance of this class;     * you must get new instance if you want to get Iterator again.     * @return Iterator over GZIP Members.     */    public Iterator iterator() {        try {            // We called mark in the constructor (See checkStream method).            this.in.reset();        } catch (IOException e) {            throw new RuntimeException(e);        }        return new Iterator() {            private GzippedInputStream compressedStream =                GzippedInputStream.this;                        public boolean hasNext() {                try {                    gotoEOR();                } catch (IOException e) {                    throw new RuntimeException(e);                }                return moveToNextGzipMember();            }                        /**             * @return An InputStream onto a GZIP Member.             */            public Object next() {                try {                    gzipMemberSeek();                } catch (IOException e) {                    throw new RuntimeException("Failed move to EOR or " +                        "failed header read: " + e.getMessage());                }                return this.compressedStream;            }                        public void remove() {                throw new UnsupportedOperationException();            }        };       }        /**     * @return True if we found another record in the stream.     */    protected boolean moveToNextGzipMember() {        boolean result = false;        // Move to the next gzip member, if there is one, positioning        // ourselves by backing up the stream so we reread any inflater        // remaining bytes. Then add 8 bytes to get us past the GZIP        // CRC trailer block that ends all gzip members.        try {            RepositionableStream ps =                (RepositionableStream)getInputStream();            // 8 is sizeof gzip CRC block thats on tail of gzipped            // record. If remaining is < 8 then experience indicates            // we're seeking past the gzip header -- don't backup the            // stream.            if (getInflater().getRemaining() > GZIP_TRAILER_LENGTH) {                ps.position(position() - getInflater().getRemaining() +                    GZIP_TRAILER_LENGTH);            }            // If at least MINIMAL_GZIP_HEADER_LENGTH available assume            // possible other gzip member. Move the stream on checking            // as we go to be sure of another record. We do this slow            // poke ahead because the calculation using            // this.inf.getRemaining can be off by a couple if the            // remaining is zero.  There seems to be nothing in            // gzipinputstream nor in the inflater that can be relied            // upon:  this.eos = false, this.inf.finished = false,            // this.inf.readEOF is true. I don't know why its messed up            // like this.  Study the core zlib and see if can figure            // where inflater is going wrong.            int read = -1;            int headerRead = 0;            while (getInputStream().available() >                    GzipHeader.MINIMAL_GZIP_HEADER_LENGTH) {                read = getGzipHeader().readByte(getInputStream());                if ((byte)read == (byte)GZIPInputStream.GZIP_MAGIC) {                    headerRead++;                    read = getGzipHeader().readByte(getInputStream());                    if((byte)read ==                            (byte)(GZIPInputStream.GZIP_MAGIC >> 8)) {                        headerRead++;                        read =                            getGzipHeader().readByte(getInputStream());                        if ((byte)read == Deflater.DEFLATED) {                            headerRead++;                            // Found gzip header. Backup the stream the                            // two bytes we just found and set result                            // true.                            ps.position(position() - headerRead);                            result = true;                            break;                        }                    }                    // Didn't find gzip header.  Back up stream one                    // byte because the byte just read might be the                    // actual start of the gzip header. Needs testing.                    ps.position(position() - headerRead);                    headerRead = 0;                }            }        } catch (IOException e) {            throw new RuntimeException("Failed i/o: " + e.getMessage());        }        return result;    }      protected Inflater getInflater() {        return this.inf;    }        protected InputStream getInputStream() {        return this.in;    }        protected GzipHeader getGzipHeader() {        return this.gzipHeader;    }        /**     * Move to next gzip member in the file.     */    protected void resetInflater() {        this.eos = false;        this.inf.reset();    }        /**     * Read in the gzip header.     * @throws IOException     */    protected void readHeader() throws IOException {        new GzipHeader(this.in);        // Reset the crc for subsequent reads.        this.crc.reset();    }    /**     * Seek to passed offset.     *      * After positioning the stream, it resets the inflater.     * Assumption is that public use of this method is only     * to position stream at start of a gzip member.     *      * @param position Absolute position of a gzip member start.     * @throws IOException     */    public void position(long position) throws IOException {        ((RepositionableStream)this.in).position(position);        resetInflater();    }    public long position() throws IOException {       return  ((RepositionableStream)this.in).position();    }        /**     * Seek to a gzip member.     *      * Moves stream to new position, resets inflater and reads in the gzip     * header ready for subsequent calls to read.     *      * @param position Absolute position of a gzip member start.     * @throws IOException     */    public void gzipMemberSeek(long position) throws IOException {        position(position);        readHeader();    }        public void gzipMemberSeek() throws IOException {        gzipMemberSeek(position());    }        /**     * Gzip passed bytes.     * Use only when bytes is small.     * @param bytes What to gzip.     * @return A gzip member of bytes.     * @throws IOException     */    public static byte [] gzip(byte [] bytes) throws IOException {        ByteArrayOutputStream baos = new ByteArrayOutputStream();        GZIPOutputStream gzipOS = new GZIPOutputStream(baos);        gzipOS.write(bytes, 0, bytes.length);        gzipOS.close();        return baos.toByteArray();    }        /**     * Tests passed stream is GZIP stream by reading in the HEAD.     * Does reposition of stream when done.     * @param rs An InputStream that is Repositionable.     * @return True if compressed stream.     * @throws IOException     */    public static boolean isCompressedRepositionableStream(            final RepositionableStream rs)    throws IOException {        boolean result = false;        long p = rs.position();        try {            result = isCompressedStream((InputStream)rs);        } finally {            rs.position(p);        }        return result;     }        /**     * Tests passed stream is gzip stream by reading in the HEAD.     * Does not reposition stream when done.     * @param is An InputStream.     * @return True if compressed stream.     * @throws IOException     */    public static boolean isCompressedStream(final InputStream is)    throws IOException {        try {            new GzipHeader(is);        } catch (NoGzipMagicException e) {            return false;        }        return true;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -