📄 archivereader.java
字号:
protected void setIn(InputStream in) { this.in = in; } protected void setVersion(String version) { this.version = version; } public String getReaderIdentifier() { return this.identifier; } protected void setReaderIdentifier(final String i) { this.identifier = i; } /** * Log on stderr. * Logging should go via the logging system. This method * bypasses the logging system going direct to stderr. * Should not generally be used. Its used for rare messages * that come of cmdline usage of ARCReader ERRORs and WARNINGs. * Override if using ARCReader in a context where no stderr or * where you'd like to redirect stderr to other than System.err. * @param level Level to log message at. * @param message Message to log. */ public void logStdErr(Level level, String message) { System.err.println(level.toString() + " " + message); } /** * Add buffering to RandomAccessInputStream. */ protected class RandomAccessBufferedInputStream extends BufferedInputStream implements RepositionableStream { public RandomAccessBufferedInputStream(RandomAccessInputStream is) throws IOException { super(is); } public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size) throws IOException { super(is, size); } public long position() throws IOException { // Current position is the underlying files position // minus the amount thats in the buffer yet to be read. return ((RandomAccessInputStream)this.in).position() - (this.count - this.pos); } public void position(long position) throws IOException { // Force refill of buffer whenever there's been a seek. this.pos = 0; this.count = 0; ((RandomAccessInputStream)this.in).position(position); } } /** * Inner ArchiveRecord Iterator class. * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if * trouble pulling record from underlying stream. * @author stack */ protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> { private final Logger logger = Logger.getLogger(this.getClass().getName()); /** * @return True if we have more records to read. * @exception RuntimeException Can throw an IOException wrapped in a * RuntimeException if a problem reading underlying stream (Corrupted * gzip, etc.). */ public boolean hasNext() { // Call close on any extant record. This will scoot us past // any content not yet read. try { cleanupCurrentRecord(); } catch (IOException e) { if (isStrict()) { throw new RuntimeException(e); } if (e instanceof EOFException) { logger.warning("Premature EOF cleaning up " + currentRecord.getHeader().toString() + ": " + e.getMessage()); return false; } // If not strict, try going again. We might be able to skip // over the bad record. logger.warning("Trying skip of failed record cleanup of " + currentRecord.getHeader().toString() + ": " + e.getMessage()); } return innerHasNext(); } protected boolean innerHasNext() { long offset = -1; try { offset = ((RepositionableStream)getInputStream()).position(); return getInputStream().available() > 0; } catch (IOException e) { throw new RuntimeException("Offset " + offset, e); } } /** * Tries to move to next record if we get * {@link RecoverableIOException}. If not <code>strict</code> * tries to move to next record if we get an * {@link IOException}. * @return Next object. * @exception RuntimeException Throws a runtime exception, * usually a wrapping of an IOException, if trouble getting * a record (Throws exception rather than return null). */ public ArchiveRecord next() { long offset = -1; try { offset = ((RepositionableStream)getInputStream()).position(); return exceptionNext(); } catch (IOException e) { if (!isStrict()) { // Retry though an IOE. Maybe we will succeed reading // subsequent record. try { if (hasNext()) { getLogger().warning("Bad Record. Trying skip " + "(Current offset " + offset + "): " + e.getMessage()); return exceptionNext(); } // Else we are at last record. Iterator#next is // expecting value. We do not have one. Throw exception. throw new RuntimeException("Retried but no next " + "record (Offset " + offset + ")", e); } catch (IOException e1) { throw new RuntimeException("After retry (Offset " + offset + ")", e1); } } throw new RuntimeException("(Offset " + offset + ")", e); } } /** * A next that throws exceptions and has handling of * recoverable exceptions moving us to next record. Can call * hasNext which itself may throw exceptions. * @return Next record. * @throws IOException * @throws RuntimeException Thrown when we've reached maximum * retries. */ protected ArchiveRecord exceptionNext() throws IOException, RuntimeException { ArchiveRecord result = null; IOException ioe = null; for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 && result == null; i--) { ioe = null; try { result = innerNext(); } catch (RecoverableIOException e) { ioe = e; getLogger().warning(e.getMessage()); if (hasNext()) { continue; } // No records left. Throw exception rather than // return null. The caller is expecting to get // back a record since they've just called // hasNext. break; } } if (ioe != null) { // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw // the recoverable ioe wrapped in a RuntimeException so // it goes out pass checks for IOE. throw new RuntimeException("Retried " + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe); } return result; } protected ArchiveRecord innerNext() throws IOException { return get(((RepositionableStream)getInputStream()).position()); } public void remove() { throw new UnsupportedOperationException(); } } protected static String stripExtension(final String name, final String ext) { return (!name.endsWith(ext))? name: name.substring(0, name.length() - ext.length()); } /** * @return short name of Archive file. */ public String getFileName() { return (new File(getReaderIdentifier())).getName(); } /** * @return short name of Archive file. */ public String getStrippedFileName() { return getStrippedFileName(getFileName(), getDotFileExtension()); } /** * @param name Name of ARCFile. * @param dotFileExtension '.arc' or '.warc', etc. * @return short name of Archive file. */ public static String getStrippedFileName(String name, final String dotFileExtension) { name = stripExtension(name, ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION); return stripExtension(name, dotFileExtension); } /** * @param value Value to test. * @return True if value is 'true', else false. */ protected static boolean getTrueOrFalse(final String value) { if (value == null || value.length() <= 0) { return false; } return Boolean.TRUE.toString().equals(value.toLowerCase()); } /** * @param format Format to use outputting. * @throws IOException * @throws java.text.ParseException * @return True if handled. */ protected boolean output(final String format) throws IOException, java.text.ParseException { boolean result = true; // long start = System.currentTimeMillis(); // Write output as pseudo-CDX file. See // http://www.archive.org/web/researcher/cdx_legend.php // and http://www.archive.org/web/researcher/example_cdx.php. // Hash is hard-coded straight SHA-1 hash of content. if (format.equals(DUMP)) { // No point digesting dumping. setDigest(false); dump(false); } else if (format.equals(GZIP_DUMP)) { // No point digesting dumping. setDigest(false); dump(true); } else if (format.equals(CDX)) { cdxOutput(false); } else if (format.equals(CDX_FILE)) { cdxOutput(true); } else { result = false; } return result; } protected void cdxOutput(boolean toFile) throws IOException { BufferedWriter cdxWriter = null; if (toFile) { String cdxFilename = stripExtension(getReaderIdentifier(), DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); } String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + " n g"; if (toFile) { cdxWriter.write(header); cdxWriter.newLine(); } else { System.out.println(header); } String strippedFileName = getStrippedFileName(); try { for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) { ArchiveRecord r = ii.next(); if (toFile) { cdxWriter.write(r.outputCdx(strippedFileName)); cdxWriter.newLine(); } else { System.out.println(r.outputCdx(strippedFileName)); } } } finally { if (toFile) { cdxWriter.close(); } } } /** * Output passed record using passed format specifier. * @param format What format to use outputting. * @throws IOException * @return True if handled. */ public boolean outputRecord(final String format) throws IOException { boolean result = true; if (format.equals(CDX)) { System.out.println(get().outputCdx(getStrippedFileName())); } else if(format.equals(ArchiveFileConstants.DUMP)) { // No point digesting if dumping content. setDigest(false); get().dump(); } else { result = false; } return result; } /** * Dump this file on STDOUT * @throws compress True if dumped output is compressed. * @throws IOException * @throws java.text.ParseException */ public abstract void dump(final boolean compress) throws IOException, java.text.ParseException; /** * @return an ArchiveReader that will delete a local file on close. Used * when we bring Archive files local and need to clean up afterward. */ public abstract ArchiveReader getDeleteFileOnCloseReader(final File f); /** * Output passed record using passed format specifier. * @param r ARCReader instance to output. * @param format What format to use outputting. * @throws IOException */ protected static void outputRecord(final ArchiveReader r, final String format) throws IOException { if (!r.outputRecord(format)) { throw new IOException("Unsupported format" + " (or unsupported on a single record): " + format); } } /** * @return Base Options object filled out with help, digest, strict, etc. * options. */ protected static Options getOptions() { Options options = new Options(); options.addOption(new Option("h","help", false, "Prints this message and exits.")); options.addOption(new Option("o","offset", true, "Outputs record at this offset into file.")); options.addOption(new Option("d","digest", true, "Pass true|false. Expensive. Default: true (SHA-1).")); options.addOption(new Option("s","strict", false, "Strict mode. Fails parse if incorrectly formatted file.")); options.addOption(new Option("f","format", true, "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," + "'or 'nohead'. Default: 'cdx'.")); return options; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -