⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 archivereader.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
	/**
	 * Records whether the underlying archive is compressed.
	 * @param compressed Value stored in the <code>compressed</code> field.
	 */
	protected void setCompressed(boolean compressed) {
		this.compressed = compressed;
	}

	/**
	 * @return The current record or null if none.
	 * After construction has the arcfile header record.
	 * @see #get()
	 */
	protected ArchiveRecord getCurrentRecord() {
		return this.currentRecord;
	}

	/**
	 * Sets the current record and hands the same reference back so the
	 * call can be used inline in a return statement.
	 * @param currentRecord Record to remember as current.
	 * @return The passed <code>currentRecord</code>.
	 */
	protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
		this.currentRecord = currentRecord;
		return currentRecord;
	}

	/**
	 * @return The stream held in the <code>in</code> field.
	 */
	protected InputStream getIn() {
		return in;
	}

	/**
	 * @param in Stream to store in the <code>in</code> field.
	 */
	protected void setIn(InputStream in) {
		this.in = in;
	}

	/**
	 * @param version Version string to store in the <code>version</code>
	 * field.
	 */
	protected void setVersion(String version) {
		this.version = version;
	}

	/**
	 * @return The identifier of this reader.  {@link #getStrippedFileName()}
	 * treats it as a file path.
	 */
	public String getReaderIdentifier() {
		return this.identifier;
	}

	/**
	 * @param i Identifier to store for this reader.
	 */
	protected void setReaderIdentifier(final String i) {
		this.identifier = i;
	}

	/**
	 * Log on stderr.
	 * Logging should go via the logging system.  This method
	 * bypasses the logging system going direct to stderr.
	 * Should not generally be used.  It is used for rare messages
	 * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
	 * Override if using ARCReader in a context where no stderr or
	 * where you'd like to redirect stderr to other than System.err.
	 * @param level Level to log message at.
	 * @param message Message to log.
	 */
	public void logStdErr(Level level, String message) {
		System.err.println(level.toString() + " " + message);
	}

	/**
	 * Class that adds {@link RepositionableStream} methods to a
	 * BufferedInputStream.  The wrapped stream must itself be a
	 * RepositionableStream; the constructors reject anything else.
	 */
	protected class RepositionableBufferedInputStream
	extends BufferedInputStream
			implements RepositionableStream {

		/**
		 * @param is Stream to buffer; must implement RepositionableStream.
		 * @throws IOException If <code>is</code> is not a
		 * RepositionableStream.
		 */
		public RepositionableBufferedInputStream(InputStream is)
				throws IOException {
			super(is);
			doStreamCheck();
		}

		/**
		 * @param is Stream to buffer; must implement RepositionableStream.
		 * @param size Buffer size passed to BufferedInputStream.
		 * @throws IOException If <code>is</code> is not a
		 * RepositionableStream.
		 */
		public RepositionableBufferedInputStream(InputStream is, int size)
				throws IOException {
			super(is, size);
			doStreamCheck();
		}

		/**
		 * Verifies the wrapped stream can be repositioned.
		 * @throws IOException If the wrapped stream is not a
		 * RepositionableStream.
		 */
		private void doStreamCheck() throws IOException {
			if (!(this.in instanceof RepositionableStream)) {
				throw new IOException(
					"Passed stream must implement PositionableStream");
			}
		}

		/**
		 * @return Logical read position: the position of the wrapped
		 * stream less the bytes sitting in the buffer yet to be read.
		 * @throws IOException If the wrapped stream fails to report its
		 * position.
		 */
		public long position() throws IOException {
			// Current position is the underlying files position
			// minus the amount thats in the buffer yet to be read.
			return ((RepositionableStream)this.in).position() -
				(this.count - this.pos);
		}

		/**
		 * Seeks the wrapped stream and discards buffered bytes.
		 * @param position Position to move the wrapped stream to.
		 * @throws IOException If the wrapped stream fails to seek.
		 */
		public void position(long position) throws IOException {
			// Force refill of buffer whenever there's been a seek.
			this.pos = 0;
			this.count = 0;
			((RepositionableStream)this.in).position(position);
		}
	}

	/**
	 * Inner ArchiveRecord Iterator class.
	 * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
	 * trouble pulling record from underlying stream.
	 * @author stack
	 */
	protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
		/**
		 * @return True if we have more records to read.
		 * @exception RuntimeException Can throw an IOException wrapped in a
		 * RuntimeException if a problem reading underlying stream (Corrupted
		 * gzip, etc.).
		 */
		public boolean hasNext() {
			// Call close on any extant record.  This will scoot us past
			// any content not yet read.
			try {
				cleanupCurrentRecord();
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
			return innerHasNext();
		}

		/**
		 * Checks for more input without first cleaning up the current
		 * record.
		 * @return True if the input stream reports bytes available.
		 * @exception RuntimeException Wraps any IOException; the message
		 * carries the stream offset if it was read before the failure,
		 * else -1.
		 */
		protected boolean innerHasNext() {
			// The offset is fetched only so it can be reported on failure.
			long offset = -1;
			try {
				offset = ((RepositionableStream)getInputStream()).position();
				return getInputStream().available() > 0;
			} catch (IOException e) {
				throw new RuntimeException("Offset " + offset, e);
			}
		}

		/**
		 * Tries to move to next record if we get
		 * {@link RecoverableIOException}. If not <code>strict</code>
		 * tries to move to next record if we get an
		 * {@link IOException}.
		 * @return Next object.
		 * @exception RuntimeException Throws a runtime exception,
		 * usually a wrapping of an IOException, if trouble getting
		 * a record (Throws exception rather than return null).
		 */
		public ArchiveRecord next() {
			long offset = -1;
			try {
				offset = ((RepositionableStream)getInputStream()).position();
				return exceptionNext();
			} catch (IOException e) {
				if (!isStrict()) {
					// Retry once.
					try {
						if (hasNext()) {
							getLogger().warning("Retrying (Current offset " +
								offset + "): " +  e.getMessage());
							return exceptionNext();
						}
						// There is no next and we don't have a record
						// to return.  Throw the recoverable.
						throw new RuntimeException("Retried but " +
							"no next record (Offset " + offset + ")",
							e);
					} catch (IOException e1) {
						throw new RuntimeException("After retry (Offset " +
								offset + ")", e1);
					}
				}
				throw new RuntimeException("(Offset " + offset + ")", e);
			}
		}

		/**
		 * A next that throws exceptions and has handling of
		 * recoverable exceptions moving us to next record. Can call
		 * hasNext which itself may throw exceptions.
		 * @return Next record.
		 * @throws IOException
		 * @throws RuntimeException Thrown when the last attempt ended in a
		 * recoverable exception: either the maximum number of attempts
		 * was used up or no records were left to move on to.
		 */
		protected ArchiveRecord exceptionNext()
		throws IOException, RuntimeException {
			ArchiveRecord result = null;
			IOException ioe = null;
			for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
					result == null; i--) {
				ioe = null;
				try {
					result = innerNext();
				} catch (RecoverableIOException e) {
					ioe = e;
					getLogger().warning(e.getMessage());
					if (hasNext()) {
						continue;
					}
					// No records left.  Throw exception rather than
					// return null.  The caller is expecting to get
					// back a record since they've just called
					// hasNext.
					break;
				}
			}
			if (ioe != null) {
				// The last attempt failed recoverably: we either ran out
				// of attempts or out of records.  Throw the recoverable
				// ioe wrapped in a RuntimeException so it goes out past
				// checks for IOE.
				throw new RuntimeException("Retried " +
					MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
			}
			return result;
		}

		/**
		 * @return Record read at the input stream's current position.
		 * @throws IOException
		 */
		protected ArchiveRecord innerNext() throws IOException {
			return get(((RepositionableStream)getInputStream()).position());
		}

		/**
		 * Not supported.
		 * @throws UnsupportedOperationException Always.
		 */
		public void remove() {
			throw new UnsupportedOperationException();
		}
	}

	/**
	 * @param name Name to strip.
	 * @param ext Suffix to remove.
	 * @return <code>name</code> without a trailing <code>ext</code>, or
	 * <code>name</code> unchanged if it does not end with <code>ext</code>.
	 */
	protected static String stripExtension(final String name,
			final String ext) {
		return (!name.endsWith(ext))? name:
			name.substring(0, name.length() - ext.length());
	}

	/**
	 * @return short name of Archive file.
	 */
	public String getStrippedFileName() {
		return getStrippedFileName((new File(getReaderIdentifier())).getName(),
			getDotFileExtension());
	}

	/**
	 * @param name Name of ARCFile.
	 * @param dotFileExtension '.arc' or '.warc', etc.
	 * @return short name of Archive file.
	 */
	public static String getStrippedFileName(String name,
			final String dotFileExtension) {
		// Peel off the compression suffix first, then the format suffix.
		name = stripExtension(name,
			ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
		return stripExtension(name, dotFileExtension);
	}

	/**
	 * @param value Value to test.
	 * @return True if value is 'true' (any letter case), else false.
	 * Null and empty values are false.
	 */
	protected static boolean getTrueOrFalse(final String value) {
		// parseBoolean is null-safe and does a locale-independent,
		// case-insensitive comparison against "true".
		return Boolean.parseBoolean(value);
	}

	/**
	 * Outputs the whole archive in the requested format.
	 * @param format Format to use outputting.
	 * @throws IOException
	 * @throws java.text.ParseException
	 * @return True if handled.
	 */
	protected boolean output(final String format)
	throws IOException, java.text.ParseException {
		boolean result = true;

		// The CDX formats write output as pseudo-CDX file.  See
		// http://www.archive.org/web/researcher/cdx_legend.php
		// and http://www.archive.org/web/researcher/example_cdx.php.
		// Hash is hard-coded straight SHA-1 hash of content.
		if (format.equals(DUMP)) {
			// No point digesting dumping.
			setDigest(false);
			dump(false);
		} else if (format.equals(GZIP_DUMP)) {
			// No point digesting dumping.
			setDigest(false);
			dump(true);
		} else if (format.equals(CDX)) {
			cdxOutput(false);
		} else if (format.equals(CDX_FILE)) {
			cdxOutput(true);
		} else {
			result = false;
		}
		return result;
	}

	/**
	 * Writes a CDX header line followed by one CDX line per record.
	 * @param toFile If true, write to a file named after the reader
	 * identifier with the compressed and format extensions replaced by
	 * a '.' plus CDX suffix; if false, write to System.out.
	 * @throws IOException
	 */
	protected void cdxOutput(boolean toFile)
	throws IOException {
		BufferedWriter cdxWriter = null;
		// Everything after the writer is opened runs inside the try so a
		// failure writing the header cannot leak the open file.
		try {
			if (toFile) {
				String cdxFilename = stripExtension(getReaderIdentifier(),
					DOT_COMPRESSED_FILE_EXTENSION);
				cdxFilename = stripExtension(cdxFilename,
					getDotFileExtension());
				cdxFilename += ('.' + CDX);
				cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
			}

			String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
				+ " n g";
			if (toFile) {
				cdxWriter.write(header);
				cdxWriter.newLine();
			} else {
				System.out.println(header);
			}

			String strippedFileName = getStrippedFileName();
			for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
				ArchiveRecord r = ii.next();
				if (toFile) {
					cdxWriter.write(r.outputCdx(strippedFileName));
					cdxWriter.newLine();
				} else {
					System.out.println(r.outputCdx(strippedFileName));
				}
			}
		} finally {
			if (cdxWriter != null) {
				cdxWriter.close();
			}
		}
	}

	/**
	 * Output the record returned by {@link #get()} using passed format
	 * specifier.
	 * @param format What format to use outputting.
	 * @throws IOException
	 * @return True if handled.
	 */
	protected boolean outputRecord(final String format)
	throws IOException {
		boolean result = true;
		if (format.equals(CDX)) {
			System.out.println(get().outputCdx(getStrippedFileName()));
		} else if (format.equals(ArchiveFileConstants.DUMP)) {
			// No point digesting if dumping content.
			setDigest(false);
			get().dump();
		} else {
			result = false;
		}
		return result;
	}

	/**
	 * Dump this file on STDOUT
	 * @param compress True if dumped output is compressed.
	 * @throws IOException
	 * @throws java.text.ParseException
	 */
	public abstract void dump(final boolean compress)
	throws IOException, java.text.ParseException;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -