📄 gzippedinputstream.java
字号:
/* GzippedInputStream** $Id: GzippedInputStream.java 4995 2007-03-12 23:48:36Z stack-sf $** Created on July 5, 2004** Copyright (C) 2004 Internet Archive.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.ByteArrayOutputStream;import java.io.EOFException;import java.io.IOException;import java.io.InputStream;import java.util.Iterator;import java.util.logging.Logger;import java.util.zip.Deflater;import java.util.zip.GZIPInputStream;import java.util.zip.GZIPOutputStream;import java.util.zip.Inflater;import java.util.zip.ZipException;/** * Subclass of GZIPInputStream that can handle a stream made of multiple * concatenated GZIP members/records. * * This class is needed because GZIPInputStream only finds the first GZIP * member in the file even if the file is made up of multiple GZIP members. * * <p>Takes an InputStream stream that implements * {@link RepositionableStream} interface so it can backup over-reads done * by the zlib Inflater class. * * <p>Use the {@link #iterator()} method to get a gzip member iterator. * Calls to {@link Iterator#next()} returns the next gzip member in the * stream. Cast return from {@link Iterator#next()} to InputStream. * * <p>Use {@link #gzipMemberSeek(long)} to position stream before reading * a gzip member if doing random accessing of gzip members. Pass it offset * at which gzip member starts. * * <p>If you need to know position at which a gzip member starts, call * {@link #position()} just after a call to {@link Iterator#hasNext()} * and before you call {@link Iterator#next()}. * * @author stack */public class GzippedInputStreamextends GZIPInputStreamimplements RepositionableStream { /** * Tail on gzip members (The CRC). */ private static final int GZIP_TRAILER_LENGTH = 8; /** * Utility class used probing for gzip members in stream. * We need this instance to get at the readByte method. */ private final GzipHeader gzipHeader = new GzipHeader(); /** * Buffer size used skipping over gzip members. */ private static final int LINUX_PAGE_SIZE = 4 * 1024; private final long initialOffset; public GzippedInputStream(InputStream is) throws IOException { // Have buffer match linux page size. this(is, LINUX_PAGE_SIZE); } /** * @param is An InputStream that implements RespositionableStream and * returns <code>true</code> when we call * {@link InputStream#markSupported()} (Latter is needed so can setup * an {@link Iterator} against the Gzip stream). * @param size Size of blocks to use reading. * @throws IOException */ public GzippedInputStream(final InputStream is, final int size) throws IOException { super(checkStream(is), size); if (!is.markSupported()) { throw new IllegalArgumentException("GzippedInputStream requires " + "a markable stream"); } if (!(is instanceof RepositionableStream)) { throw new IllegalArgumentException("GzippedInputStream requires " + "a stream that implements RepositionableStream"); } // We need to calculate the absolute offset of the current // GZIP Member. Its almost always going to be zero but not // always (We may have been passed a stream that is already part // ways through a stream of GZIP Members). So, getting // absolute offset is not exactly straight-forward. The super // class, GZIPInputStream on construction reads in the GZIP Header // which is a pain because I then do not know the absolute offset // at which the GZIP record began. So, the call above to checkStream() // marked the stream before passing it to the super calls. Then // below we get current postion at just past the GZIP Header, call // reset so we go back to the absolute start of the GZIP Member in // the file, record the offset for later should we need to start // over again in this file -- i.e. we're asked to get an iterator // from Record zero on -- then we move the file position to just // after the GZIP Header again so we're again aligned for inflation // of the current record. long afterGZIPHeader = ((RepositionableStream)is).position(); is.reset(); this.initialOffset = ((RepositionableStream)is).position(); ((RepositionableStream)is).position(afterGZIPHeader); } protected static InputStream checkStream(final InputStream is) throws IOException { if (is instanceof RepositionableStream) { // See note above in constructor on why the mark here. // Also minimal gzip header is 10. IA GZIP Headers are 20 bytes. // Multiply by 4 in case extra info in the header. is.mark(GzipHeader.MINIMAL_GZIP_HEADER_LENGTH * 4); return is; } throw new IOException("Passed stream does not" + " implement PositionableStream"); } /** * Exhaust current GZIP member content. * Call this method when you think you're on the end of the * GZIP member. It will clean out any dross. * @param ignore Character to ignore counting characters (Usually * trailing new lines). * @return Count of characters skipped over. * @throws IOException */ public long gotoEOR(int ignore) throws IOException { long bytesSkipped = 0; if (this.inf.getTotalIn() <= 0) { return bytesSkipped; } if (!this.inf.finished()) { int read = 0; while ((read = read()) != -1) { if ((byte)read == (byte)ignore) { continue; } bytesSkipped = gotoEOR() + 1; break; } } return bytesSkipped; } /** * Exhaust current GZIP member content. * Call this method when you think you're on the end of the * GZIP member. It will clean out any dross. * @return Count of characters skipped over. * @throws IOException */ public long gotoEOR() throws IOException { long bytesSkipped = 0; if (this.inf.getTotalIn() <= 0) { return bytesSkipped; } while(!this.inf.finished()) { bytesSkipped += skip(Long.MAX_VALUE); } return bytesSkipped; } /** * Returns a GZIP Member Iterator. * Has limitations. Can only get one Iterator per instance of this class; * you must get new instance if you want to get Iterator again. * @return Iterator over GZIP Members. */ public Iterator iterator() { final Logger logger = Logger.getLogger(this.getClass().getName()); try { // We know its a RepositionableStream else we'd have failed // construction. On iterator construction, set file back to // initial position so we're ready to read GZIP Members // (May not always work dependent on how the // RepositionableStream was implemented). ((RepositionableStream)this.in).position(this.initialOffset); } catch (IOException e) { throw new RuntimeException(e); } return new Iterator() { private GzippedInputStream compressedStream = GzippedInputStream.this; public boolean hasNext() {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -