📄 archivereader.java
字号:
/**
 * Stores the passed flag in this reader's {@code compressed} field.
 * @param compressed New value of the flag.
 */
protected void setCompressed(boolean compressed) {
    this.compressed = compressed;
}

/**
 * @return The record held in the {@code currentRecord} field. Per the
 * original author's note: null if there is none, and the arcfile header
 * record right after construction.
 * @see #get()
 */
protected ArchiveRecord getCurrentRecord() {
    return this.currentRecord;
}

/**
 * Installs the passed record as the current record.
 * @param currentRecord Record to remember as current.
 * @return The same record that was passed in.
 */
protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
    return this.currentRecord = currentRecord;
}

/**
 * @return The stream held in the {@code in} field.
 */
protected InputStream getIn() {
    return this.in;
}

/**
 * @param in Stream to store in the {@code in} field.
 */
protected void setIn(InputStream in) {
    this.in = in;
}

/**
 * @param version Value to store in the {@code version} field.
 */
protected void setVersion(String version) {
    this.version = version;
}

/**
 * @return The identifier stored for this reader.
 */
public String getReaderIdentifier() {
    return this.identifier;
}

/**
 * @param i Identifier to store for this reader.
 */
protected void setReaderIdentifier(final String i) {
    this.identifier = i;
}

/**
 * Writes a message straight to {@code System.err}, bypassing the
 * logging system.
 * Should not generally be used; per the original note it exists for the
 * rare ERROR and WARNING messages that come out of command-line usage.
 * Override where there is no stderr, or to redirect the output
 * somewhere other than {@code System.err}.
 * @param level Level to log message at. Must not be null.
 * @param message Message to log.
 */
public void logStdErr(Level level, String message) {
    final String line = level.toString() + " " + message;
    System.err.println(line);
}

/**
 * Class that adds PositionableStream methods to a BufferedInputStream.
*/ protected class RepositionableBufferedInputStream extends BufferedInputStream implements RepositionableStream { public RepositionableBufferedInputStream(InputStream is) throws IOException { super(is); doStreamCheck(); } public RepositionableBufferedInputStream(InputStream is, int size) throws IOException { super(is, size); doStreamCheck(); } private void doStreamCheck() throws IOException { if (!(this.in instanceof RepositionableStream)) { throw new IOException( "Passed stream must implement PositionableStream"); } } public long position() throws IOException { // Current position is the underlying files position // minus the amount thats in the buffer yet to be read. return ((RepositionableStream)this.in).position() - (this.count - this.pos); } public void position(long position) throws IOException { // Force refill of buffer whenever there's been a seek. this.pos = 0; this.count = 0; ((RepositionableStream)this.in).position(position); } } /** * Inner ArchiveRecord Iterator class. * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if * trouble pulling record from underlying stream. * @author stack */ protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> { /** * @return True if we have more records to read. * @exception RuntimeException Can throw an IOException wrapped in a * RuntimeException if a problem reading underlying stream (Corrupted * gzip, etc.). */ public boolean hasNext() { // Call close on any extant record. This will scoot us past // any content not yet read. try { cleanupCurrentRecord(); } catch (IOException e) { throw new RuntimeException(e); } return innerHasNext(); } protected boolean innerHasNext() { long offset = -1; try { offset = ((RepositionableStream)getInputStream()).position(); return getInputStream().available() > 0; } catch (IOException e) { throw new RuntimeException("Offset " + offset, e); } } /** * Tries to move to next record if we get * {@link RecoverableIOException}. 
If not <code>strict</code> * tries to move to next record if we get an * {@link IOException}. * @return Next object. * @exception RuntimeException Throws a runtime exception, * usually a wrapping of an IOException, if trouble getting * a record (Throws exception rather than return null). */ public ArchiveRecord next() { long offset = -1; try { offset = ((RepositionableStream)getInputStream()).position(); return exceptionNext(); } catch (IOException e) { if (!isStrict()) { // Retry once. try { if (hasNext()) { getLogger().warning("Retrying (Current offset " + offset + "): " + e.getMessage()); return exceptionNext(); } // There is no next and we don't have a record // to return. Throw the recoverable. throw new RuntimeException("Retried but " + "no next record (Offset " + offset + ")", e); } catch (IOException e1) { throw new RuntimeException("After retry (Offset " + offset + ")", e1); } } throw new RuntimeException("(Offset " + offset + ")", e); } } /** * A next that throws exceptions and has handling of * recoverable exceptions moving us to next record. Can call * hasNext which itself may throw exceptions. * @return Next record. * @throws IOException * @throws RuntimeException Thrown when we've reached maximum * retries. */ protected ArchiveRecord exceptionNext() throws IOException, RuntimeException { ArchiveRecord result = null; IOException ioe = null; for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 && result == null; i--) { ioe = null; try { result = innerNext(); } catch (RecoverableIOException e) { ioe = e; getLogger().warning(e.getMessage()); if (hasNext()) { continue; } // No records left. Throw exception rather than // return null. The caller is expecting to get // back a record since they've just called // hasNext. break; } } if (ioe != null) { // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw // the recoverable ioe wrapped in a RuntimeException so // it goes out pass checks for IOE. 
throw new RuntimeException("Retried " + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe); } return result; } protected ArchiveRecord innerNext() throws IOException { return get(((RepositionableStream)getInputStream()).position()); } public void remove() { throw new UnsupportedOperationException(); } } protected static String stripExtension(final String name, final String ext) { return (!name.endsWith(ext))? name: name.substring(0, name.length() - ext.length()); } /** * @return short name of Archive file. */ public String getStrippedFileName() { return getStrippedFileName((new File(getReaderIdentifier())).getName(), getDotFileExtension()); } /** * @param name Name of ARCFile. * @param dotFileExtension '.arc' or '.warc', etc. * @return short name of Archive file. */ public static String getStrippedFileName(String name, final String dotFileExtension) { name = stripExtension(name, ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION); return stripExtension(name, dotFileExtension); } /** * @param value Value to test. * @return True if value is 'true', else false. */ protected static boolean getTrueOrFalse(final String value) { if (value == null || value.length() <= 0) { return false; } return Boolean.TRUE.toString().equals(value.toLowerCase()); } /** * @param format Format to use outputting. * @throws IOException * @throws java.text.ParseException * @return True if handled. */ protected boolean output(final String format) throws IOException, java.text.ParseException { boolean result = true; // long start = System.currentTimeMillis(); // Write output as pseudo-CDX file. See // http://www.archive.org/web/researcher/cdx_legend.php // and http://www.archive.org/web/researcher/example_cdx.php. // Hash is hard-coded straight SHA-1 hash of content. if (format.equals(DUMP)) { // No point digesting dumping. setDigest(false); dump(false); } else if (format.equals(GZIP_DUMP)) { // No point digesting dumping. 
setDigest(false); dump(true); } else if (format.equals(CDX)) { cdxOutput(false); } else if (format.equals(CDX_FILE)) { cdxOutput(true); } else { result = false; } return result; } protected void cdxOutput(boolean toFile) throws IOException { BufferedWriter cdxWriter = null; if (toFile) { String cdxFilename = stripExtension(getReaderIdentifier(), DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); } String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + " n g"; if (toFile) { cdxWriter.write(header); cdxWriter.newLine(); } else { System.out.println(header); } String strippedFileName = getStrippedFileName(); try { for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) { ArchiveRecord r = ii.next(); if (toFile) { cdxWriter.write(r.outputCdx(strippedFileName)); cdxWriter.newLine(); } else { System.out.println(r.outputCdx(strippedFileName)); } } } finally { if (toFile) { cdxWriter.close(); } } } /** * Output passed record using passed format specifier. * @param format What format to use outputting. * @throws IOException * @return True if handled. */ protected boolean outputRecord(final String format) throws IOException { boolean result = true; if (format.equals(CDX)) { System.out.println(get().outputCdx(getStrippedFileName())); } else if(format.equals(ArchiveFileConstants.DUMP)) { // No point digesting if dumping content. setDigest(false); get().dump(); } else { result = false; } return result; } /** * Dump this file on STDOUT * @throws compress True if dumped output is compressed. * @throws IOException * @throws java.text.ParseException */ public abstract void dump(final boolean compress) throws IOException, java.text.ParseException;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -