📄 archivereader.java
字号:
/* $Id: ArchiveReader.java 4996 2007-03-13 00:08:58Z stack-sf $ * * Created on August 21st, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.BufferedInputStream;import java.io.BufferedWriter;import java.io.EOFException;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.cli.Option;import org.apache.commons.cli.Options;import org.archive.util.MimetypeUtils;/** * Reader for an Archive file of Archive {@link ArchiveRecord}s. * @author stack * @version $Date: 2007-03-13 00:08:58 +0000 (Tue, 13 Mar 2007) $ $Version$ */public abstract class ArchiveReader implements ArchiveFileConstants { /** * Is this Archive file compressed? */ private boolean compressed = false; /** * Should we digest as we read? */ private boolean digest = true; /** * Should the parse be strict? */ private boolean strict = false; /** * Archive file input stream. * * Keep it around so we can close it when done. * * <p>Set in constructor. 
Must support {@link RepositionableStream} * interface. Make it protected so subclasses have access. */ private InputStream in = null; /** * Maximum amount of recoverable exceptions in a row. * If more than this amount in a row, we'll let out the exception rather * than go back in for yet another retry. */ public static final int MAX_ALLOWED_RECOVERABLES = 10; /** * The Record currently being read. * * Keep this ongoing reference so we'll close the record even if the caller * doesn't. */ private ArchiveRecord currentRecord = null; /** * Descriptive string for the Archive file we're going against: * full path, url, etc. -- depends on context in which file was made. */ private String identifier = null; /** * Archive file version. */ private String version = null; protected ArchiveReader() { super(); } /** * Convenience method used by subclass constructors. * @param i Identifier for Archive file this reader goes against. */ protected void initialize(final String i) { setReaderIdentifier(i); } /** * Convenience method for constructors. * * @param f File to read. * @param offset Offset at which to start reading. * @return InputStream to read from. * @throws IOException If failed open or fail to get a memory * mapped byte buffer on file. */ protected InputStream getInputStream(final File f, final long offset) throws IOException { return new RandomAccessBufferedInputStream( new RandomAccessInputStream(f, offset)); } public boolean isCompressed() { return this.compressed; } /** * Get record at passed <code>offset</code>. * * @param offset Byte index into file at which a record starts. * @return An Archive Record reference. 
* @throws IOException */ public ArchiveRecord get(long offset) throws IOException { cleanupCurrentRecord(); RepositionableStream ps = (RepositionableStream)this.in; long currentOffset = ps.position(); if (currentOffset != offset) { currentOffset = offset; ps.position(offset); } return createArchiveRecord(this.in, currentOffset); } /** * @return Return Archive Record created against current offset. * @throws IOException */ public ArchiveRecord get() throws IOException { return createArchiveRecord(this.in, ((RepositionableStream)this.in).position()); } public void close() throws IOException { if (this.in != null) { this.in.close(); this.in = null; } } /** * Rewinds stream to start of the Archive file. * @throws IOException if stream is not resettable. */ protected void rewind() throws IOException { cleanupCurrentRecord(); if (this.in instanceof RepositionableStream) { try { ((RepositionableStream)this.in).position(0); } catch (IOException e) { throw new RuntimeException(e); } } else { throw new IOException("Stream is not resettable."); } } /** * Cleanout the current record if there is one. * @throws IOException */ protected void cleanupCurrentRecord() throws IOException { if (this.currentRecord != null) { this.currentRecord.close(); gotoEOR(this.currentRecord); this.currentRecord = null; } } /** * Return an Archive Record homed on <code>offset</code> into * <code>is</code>. * @param is Stream to read Record from. * @param offset Offset to find Record at. * @return ArchiveRecord instance. * @throws IOException */ protected abstract ArchiveRecord createArchiveRecord(InputStream is, long offset) throws IOException; /** * Skip over any trailing new lines at end of the record so we're lined up * ready to read the next. * @param record * @throws IOException */ protected abstract void gotoEOR(ArchiveRecord record) throws IOException; public abstract String getFileExtension(); public abstract String getDotFileExtension(); /** * @return Version of this Archive file. 
*/ public String getVersion() { return this.version; } /** * Validate the Archive file. * * This method iterates over the file throwing exception if it fails * to successfully parse any record. * * <p>Assumes the stream is at the start of the file. * @return List of all read Archive Headers. * * @throws IOException */ public List validate() throws IOException { return validate(-1); } /** * Validate the Archive file. * * This method iterates over the file throwing exception if it fails * to successfully parse. * * <p>We start validation from whereever we are in the stream. * * @param noRecords Number of records expected. Pass -1 if number is * unknown. * * @return List of all read metadatas. As we validate records, we add * a reference to the read metadata. * * @throws IOException */ public List validate(int noRecords) throws IOException { List<ArchiveRecordHeader> hs = new ArrayList<ArchiveRecordHeader>(); int count = 0; setStrict(true); for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) { count++; ArchiveRecord r = i.next(); if (r.getHeader().getLength() <= 0 && r.getHeader().getMimetype(). equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { throw new IOException("ARCRecord content is empty."); } r.close(); // Add reference to metadata into a list of metadatas. hs.add(r.getHeader()); } if (noRecords != -1) { if (count != noRecords) { throw new IOException("Count of records, " + Integer.toString(count) + " is less than expected " + Integer.toString(noRecords)); } } return hs; } /** * Test Archive file is valid. * Assumes the stream is at the start of the file. Be aware that this * method makes a pass over the whole file. * @return True if file can be successfully parsed. */ public boolean isValid() { boolean valid = false; try { validate(); valid = true; } catch(Exception e) { // File is not valid if exception thrown parsing. valid = false; } return valid; } /** * @return Returns the strict. 
*/
    public boolean isStrict() {
        return strict;
    }

    /**
     * @param s True to make the parse strict.
     */
    public void setStrict(boolean s) {
        strict = s;
    }

    /**
     * @param d True if we're to digest.
     */
    public void setDigest(boolean d) {
        digest = d;
    }

    /**
     * @return True if we're digesting as we read.
     */
    public boolean isDigest() {
        return digest;
    }

    /**
     * @return Logger named after the runtime class of this reader.
     */
    protected Logger getLogger() {
        final String loggerName = getClass().getName();
        return Logger.getLogger(loggerName);
    }

    /**
     * @return The underlying input stream (same value as {@link #getIn()}).
     */
    protected InputStream getInputStream() {
        return in;
    }

    /**
     * Returns an ArchiveRecord iterator.
     * Before the iterator is handed out, any outstanding record is cleaned
     * up and the stream is rewound to the start of the file. An IOException
     * from either step is rethrown wrapped in a RuntimeException.
     *
     * <p>NOTE(review): the iterator implementation is not visible from
     * here. The original notes say that on IOException, especially
     * ZipException reading compressed ARCs, it tries moving to the next
     * record rather than failing the iteration, and that this usually
     * succeeds if {@link ArchiveReader#strict} is not set -- confirm
     * against ArchiveRecordIterator.
     * @return An iterator over ARC records.
     */
    public Iterator<ArchiveRecord> iterator() {
        try {
            // Eat up any record outstanding.
            cleanupCurrentRecord();
            // Now reset stream to the start of the arc file.
            rewind();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return new ArchiveRecordIterator();
    }

    /**
     * @param compressed True to flag this Archive file as compressed.
     */
    protected void setCompressed(boolean compressed) {
        this.compressed = compressed;
    }

    /**
     * @return The current ARC record or null if none.
     * The original notes state that after construction this is the arcfile
     * header record (not verifiable from this part of the file).
     * @see #get()
     */
    protected ArchiveRecord getCurrentRecord() {
        return currentRecord;
    }

    /**
     * Remembers the passed record as the current record.
     * @param currentRecord Record to hold as current.
     * @return The record just stored.
     */
    protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
        this.currentRecord = currentRecord;
        return this.currentRecord;
    }

    /**
     * @return The underlying input stream.
     */
    protected InputStream getIn() {
        return this.in;
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -