📄 archivereader.java

📁 这是个爬虫，和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	protected void setIn(InputStream in) {		this.in = in;	}	protected void setVersion(String version) {		this.version = version;	}	public String getReaderIdentifier() {		return this.identifier;	}	protected void setReaderIdentifier(final String i) {		this.identifier = i;	}	    /**     * Log on stderr.     * Logging should go via the logging system.  This method     * bypasses the logging system going direct to stderr.     * Should not generally be used.  Its used for rare messages     * that come of cmdline usage of ARCReader ERRORs and WARNINGs.     * Override if using ARCReader in a context where no stderr or     * where you'd like to redirect stderr to other than System.err.     * @param level Level to log message at.     * @param message Message to log.     */    public void logStdErr(Level level, String message) {        System.err.println(level.toString() + " " + message);    }        /**     * Add buffering to RandomAccessInputStream.     */    protected class RandomAccessBufferedInputStream    extends BufferedInputStream implements RepositionableStream {        public RandomAccessBufferedInputStream(RandomAccessInputStream is)        		throws IOException {            super(is);        }        public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)        		throws IOException {            super(is, size);        }        public long position() throws IOException {            // Current position is the underlying files position            // minus the amount thats in the buffer yet to be read.            return ((RandomAccessInputStream)this.in).position() -            	(this.count - this.pos);        }        public void position(long position) throws IOException {            // Force refill of buffer whenever there's been a seek.            this.pos = 0;            this.count = 0;            ((RandomAccessInputStream)this.in).position(position);        }    }        /**     * Inner ArchiveRecord Iterator class.     
* Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if     * trouble pulling record from underlying stream.     * @author stack     */    protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {        private final Logger logger =            Logger.getLogger(this.getClass().getName());        /**         * @return True if we have more records to read.         * @exception RuntimeException Can throw an IOException wrapped in a         * RuntimeException if a problem reading underlying stream (Corrupted         * gzip, etc.).         */        public boolean hasNext() {            // Call close on any extant record.  This will scoot us past            // any content not yet read.            try {                cleanupCurrentRecord();            } catch (IOException e) {                if (isStrict()) {                    throw new RuntimeException(e);                }                if (e instanceof EOFException) {                    logger.warning("Premature EOF cleaning up " +                         currentRecord.getHeader().toString() + ": " +                        e.getMessage());                    return false;                }                // If not strict, try going again.  We might be able to skip                // over the bad record.                logger.warning("Trying skip of failed record cleanup of " +                    currentRecord.getHeader().toString() + ": " +                    e.getMessage());            }            return innerHasNext();        }                protected boolean innerHasNext() {            long offset = -1;            try {                offset = ((RepositionableStream)getInputStream()).position();                return getInputStream().available() > 0;            } catch (IOException e) {                throw new RuntimeException("Offset " + offset, e);            }        }        /**         * Tries to move to next record if we get         * {@link RecoverableIOException}. 
If not <code>strict</code>         * tries to move to next record if we get an         * {@link IOException}.         * @return Next object.         * @exception RuntimeException Throws a runtime exception,         * usually a wrapping of an IOException, if trouble getting         * a record (Throws exception rather than return null).         */        public ArchiveRecord next() {            long offset = -1;            try {                offset = ((RepositionableStream)getInputStream()).position();                return exceptionNext();            } catch (IOException e) {                if (!isStrict()) {                    // Retry though an IOE.  Maybe we will succeed reading                    // subsequent record.                    try {                        if (hasNext()) {                            getLogger().warning("Bad Record. Trying skip " +                                "(Current offset " +  offset + "): " +                                e.getMessage());                            return exceptionNext();                        }                        // Else we are at last record.  Iterator#next is                        // expecting value. We do not have one. Throw exception.                        throw new RuntimeException("Retried but no next " +                             "record (Offset " + offset + ")", e);                    } catch (IOException e1) {                        throw new RuntimeException("After retry (Offset " +                                offset + ")", e1);                    }                }                throw new RuntimeException("(Offset " + offset + ")", e);            }        }                /**         * A next that throws exceptions and has handling of         * recoverable exceptions moving us to next record. Can call         * hasNext which itself may throw exceptions.         * @return Next record.         
* @throws IOException         * @throws RuntimeException Thrown when we've reached maximum         * retries.         */        protected ArchiveRecord exceptionNext()        throws IOException, RuntimeException {            ArchiveRecord result = null;            IOException ioe = null;            for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&                    result == null; i--) {                ioe = null;                try {                    result = innerNext();                } catch (RecoverableIOException e) {                    ioe = e;                    getLogger().warning(e.getMessage());                    if (hasNext()) {                        continue;                    }                    // No records left.  Throw exception rather than                    // return null.  The caller is expecting to get                    // back a record since they've just called                    // hasNext.                    break;                }            }            if (ioe != null) {                // Then we did MAX_ALLOWED_RECOVERABLES retries.  Throw                // the recoverable ioe wrapped in a RuntimeException so                // it goes out pass checks for IOE.                throw new RuntimeException("Retried " +                    MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);            }            return result;        }                protected ArchiveRecord innerNext() throws IOException {            return get(((RepositionableStream)getInputStream()).position());        }                public void remove() {            throw new UnsupportedOperationException();        }    }        protected static String stripExtension(final String name,    		final String ext) {        return (!name.endsWith(ext))? name:            name.substring(0, name.length() - ext.length());    }        /**     * @return short name of Archive file.     
*/    public String getFileName() {        return (new File(getReaderIdentifier())).getName();    }    /**     * @return short name of Archive file.     */    public String getStrippedFileName() {        return getStrippedFileName(getFileName(),    		getDotFileExtension());    }        /**     * @param name Name of ARCFile.     * @param dotFileExtension '.arc' or '.warc', etc.     * @return short name of Archive file.     */    public static String getStrippedFileName(String name,    		final String dotFileExtension) {    	name = stripExtension(name,    		ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);    	return stripExtension(name, dotFileExtension);    }        /**     * @param value Value to test.     * @return True if value is 'true', else false.     */    protected static boolean getTrueOrFalse(final String value) {    	if (value == null || value.length() <= 0) {    		return false;    	}        return Boolean.TRUE.toString().equals(value.toLowerCase());    }        /**     * @param format Format to use outputting.     * @throws IOException     * @throws java.text.ParseException     * @return True if handled.     */    protected boolean output(final String format)    throws IOException, java.text.ParseException {    	boolean result = true;        // long start = System.currentTimeMillis();    	        // Write output as pseudo-CDX file.  See        // http://www.archive.org/web/researcher/cdx_legend.php        // and http://www.archive.org/web/researcher/example_cdx.php.        // Hash is hard-coded straight SHA-1 hash of content.        if (format.equals(DUMP)) {        	// No point digesting dumping.        	setDigest(false);            dump(false);        } else if (format.equals(GZIP_DUMP)) {        	// No point digesting dumping.        	
setDigest(false);            dump(true);        } else if (format.equals(CDX)) {        	cdxOutput(false);           } else if (format.equals(CDX_FILE)) {            cdxOutput(true);        } else {        	result = false;        }	        return result;    }        protected void cdxOutput(boolean toFile)    throws IOException {        BufferedWriter cdxWriter = null;        if (toFile) {            String cdxFilename = stripExtension(getReaderIdentifier(),                DOT_COMPRESSED_FILE_EXTENSION);            cdxFilename = stripExtension(cdxFilename, getDotFileExtension());            cdxFilename += ('.' + CDX);            cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));        }                String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")            + " n g";        if (toFile) {            cdxWriter.write(header);            cdxWriter.newLine();        } else {            System.out.println(header);        }                String strippedFileName = getStrippedFileName();        try {            for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {            	ArchiveRecord r = ii.next();                if (toFile) {                    cdxWriter.write(r.outputCdx(strippedFileName));                    cdxWriter.newLine();                } else {                    System.out.println(r.outputCdx(strippedFileName));                }            }        } finally {            if (toFile) {                cdxWriter.close();            }        }    }        /**     * Output passed record using passed format specifier.     * @param format What format to use outputting.     * @throws IOException     * @return True if handled.     
*/    public boolean outputRecord(final String format)    throws IOException {    	boolean result = true;        if (format.equals(CDX)) {            System.out.println(get().outputCdx(getStrippedFileName()));        } else if(format.equals(ArchiveFileConstants.DUMP)) {            // No point digesting if dumping content.            setDigest(false);            get().dump();        } else {        	result = false;        }        return result;    }    /**     * Dump this file on STDOUT     * @throws compress True if dumped output is compressed.     * @throws IOException     * @throws java.text.ParseException     */    public abstract void dump(final boolean compress)    throws IOException, java.text.ParseException;        /**     * @return an ArchiveReader that will delete a local file on close.  Used     * when we bring Archive files local and need to clean up afterward.     */    public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);        /**     * Output passed record using passed format specifier.     * @param r ARCReader instance to output.     * @param format What format to use outputting.     * @throws IOException     */    protected static void outputRecord(final ArchiveReader r,        final String format)    throws IOException {        if (!r.outputRecord(format)) {            throw new IOException("Unsupported format" +                " (or unsupported on a single record): " + format);        }    }        /**     * @return Base Options object filled out with help, digest, strict, etc.     * options.     */    protected static Options getOptions() {        Options options = new Options();        options.addOption(new Option("h","help", false,            "Prints this message and exits."));        options.addOption(new Option("o","offset", true,            "Outputs record at this offset into file."));        options.addOption(new Option("d","digest", true,            "Pass true|false. Expensive. 
Default: true (SHA-1)."));        options.addOption(new Option("s","strict", false,            "Strict mode. Fails parse if incorrectly formatted file."));        options.addOption(new Option("f","format", true,            "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +            "'or 'nohead'. Default: 'cdx'."));        return options;    }}
上一页 12
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -