⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 archivereader.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* $Id: ArchiveReader.java 4996 2007-03-13 00:08:58Z stack-sf $ * * Created on August 21st, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io;import it.unimi.dsi.fastutil.io.RepositionableStream;import java.io.BufferedInputStream;import java.io.BufferedWriter;import java.io.EOFException;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.cli.Option;import org.apache.commons.cli.Options;import org.archive.util.MimetypeUtils;/** * Reader for an Archive file of Archive {@link ArchiveRecord}s. * @author stack * @version $Date: 2007-03-13 00:08:58 +0000 (Tue, 13 Mar 2007) $ $Version$ */public abstract class ArchiveReader implements ArchiveFileConstants {        /**     * Is this Archive file compressed?     */    private boolean compressed = false;        /**     * Should we digest as we read?     */    private boolean digest = true;        /**     * Should the parse be strict?     */    private boolean strict = false;        /**     * Archive file input stream.     
*     * Keep it around so we can close it when done.     *     * <p>Set in constructor. Must support {@link RepositionableStream}     * interface.  Make it protected so subclasses have access.     */    private InputStream in = null;        /**     * Maximum amount of recoverable exceptions in a row.     * If more than this amount in a row, we'll let out the exception rather     * than go back in for yet another retry.     */    public static final int MAX_ALLOWED_RECOVERABLES = 10;        /**     * The Record currently being read.     *     * Keep this ongoing reference so we'll close the record even if the caller     * doesn't.     */    private ArchiveRecord currentRecord = null;        /**     * Descriptive string for the Archive file we're going against:     * full path, url, etc. -- depends on context in which file was made.     */    private String identifier = null;        /**     * Archive file version.     */    private String version = null;            protected ArchiveReader() {        super();    }        /**     * Convenience method used by subclass constructors.     * @param i Identifier for Archive file this reader goes against.     */    protected void initialize(final String i) {        setReaderIdentifier(i);    }        /**     * Convenience method for constructors.     *      * @param f File to read.     * @param offset Offset at which to start reading.     * @return InputStream to read from.     * @throws IOException If failed open or fail to get a memory     * mapped byte buffer on file.     */    protected InputStream getInputStream(final File f, final long offset)    throws IOException {        return new RandomAccessBufferedInputStream(            new RandomAccessInputStream(f, offset));    }    public boolean isCompressed() {        return this.compressed;    }    /**     * Get record at passed <code>offset</code>.     *      * @param offset Byte index into file at which a record starts.     * @return An Archive Record reference.     
* @throws IOException     */    public ArchiveRecord get(long offset) throws IOException {        cleanupCurrentRecord();        RepositionableStream ps = (RepositionableStream)this.in;        long currentOffset = ps.position();        if (currentOffset != offset) {            currentOffset = offset;            ps.position(offset);        }        return createArchiveRecord(this.in, currentOffset);    }        /**     * @return Return Archive Record created against current offset.     * @throws IOException     */    public ArchiveRecord get() throws IOException {        return createArchiveRecord(this.in,            ((RepositionableStream)this.in).position());    }    public void close() throws IOException {        if (this.in != null) {            this.in.close();            this.in = null;        }    }        /**     * Rewinds stream to start of the Archive file.     * @throws IOException if stream is not resettable.     */    protected void rewind() throws IOException {        cleanupCurrentRecord();        if (this.in instanceof RepositionableStream) {            try {                ((RepositionableStream)this.in).position(0);            } catch (IOException e) {                throw new RuntimeException(e);            }       } else {           throw new IOException("Stream is not resettable.");       }    }        /**     * Cleanout the current record if there is one.     * @throws IOException     */    protected void cleanupCurrentRecord() throws IOException {        if (this.currentRecord != null) {            this.currentRecord.close();            gotoEOR(this.currentRecord);            this.currentRecord = null;        }    }        /**     * Return an Archive Record homed on <code>offset</code> into     * <code>is</code>.     * @param is Stream to read Record from.     * @param offset Offset to find Record at.     * @return ArchiveRecord instance.     
* @throws IOException     */    protected abstract ArchiveRecord createArchiveRecord(InputStream is,    	long offset)    throws IOException;        /**     * Skip over any trailing new lines at end of the record so we're lined up     * ready to read the next.     * @param record     * @throws IOException     */    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;        public abstract String getFileExtension();    public abstract String getDotFileExtension();    /**     * @return Version of this Archive file.     */    public String getVersion() {    	return this.version;    }    /**     * Validate the Archive file.     *     * This method iterates over the file throwing exception if it fails     * to successfully parse any record.     *     * <p>Assumes the stream is at the start of the file.     * @return List of all read Archive Headers.     *     * @throws IOException     */    public List validate() throws IOException {        return validate(-1);    }    /**     * Validate the Archive file.     *     * This method iterates over the file throwing exception if it fails     * to successfully parse.     *     * <p>We start validation from whereever we are in the stream.     *     * @param noRecords Number of records expected.  Pass -1 if number is     * unknown.     *     * @return List of all read metadatas. As we validate records, we add     * a reference to the read metadata.     *     * @throws IOException     */    public List validate(int noRecords) throws IOException {        List<ArchiveRecordHeader> hs = new ArrayList<ArchiveRecordHeader>();        int count = 0;        setStrict(true);        for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {            count++;            ArchiveRecord r = i.next();            if (r.getHeader().getLength() <= 0                && r.getHeader().getMimetype().                    
equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {                throw new IOException("ARCRecord content is empty.");            }            r.close();            // Add reference to metadata into a list of metadatas.            hs.add(r.getHeader());        }        if (noRecords != -1) {            if (count != noRecords) {                throw new IOException("Count of records, " +                    Integer.toString(count) + " is less than expected " +                    Integer.toString(noRecords));            }        }        return hs;    }    /**     * Test Archive file is valid.     * Assumes the stream is at the start of the file.  Be aware that this     * method makes a pass over the whole file.      * @return True if file can be successfully parsed.     */    public boolean isValid() {        boolean valid = false;        try {            validate();            valid = true;        } catch(Exception e) {            // File is not valid if exception thrown parsing.            valid = false;        }            return valid;    }    /**     * @return Returns the strict.     */    public boolean isStrict() {        return this.strict;    }    /**     * @param s The strict to set.     */    public void setStrict(boolean s) {        this.strict = s;    }    /**     * @param d True if we're to digest.     */    public void setDigest(boolean d) {        this.digest = d;    }    /**     * @return True if we're digesting as we read.     */    public boolean isDigest() {        return this.digest;    }     protected Logger getLogger() {        return Logger.getLogger(this.getClass().getName());    }        protected InputStream getInputStream() {        return this.in;    }        /**     * Returns an ArchiveRecord iterator.     * Of note, on IOException, especially if ZipException reading compressed     * ARCs, rather than fail the iteration, try moving to the next record.     * If {@link ArchiveReader#strict} is not set, this will usually succeed.     
* @return An iterator over ARC records.
     * @throws RuntimeException Wrapping any IOException raised while
     * cleaning up the outstanding record or rewinding the stream.
     */
    public Iterator<ArchiveRecord> iterator() {
        try {
            // Eat up any record outstanding.
            cleanupCurrentRecord();
            // Now reset stream to the start of the arc file.
            rewind();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return new ArchiveRecordIterator();
    }

    /**
     * @param compressed True if this Archive file is compressed.
     */
    protected void setCompressed(boolean compressed) {
        this.compressed = compressed;
    }

    /**
     * @return The current ARC record or null if none.
     * NOTE(review): the original comment states that after construction
     * this holds the arcfile header record; that is not verifiable from
     * the visible part of the class.
     * @see #get()
     */
    protected ArchiveRecord getCurrentRecord() {
        return this.currentRecord;
    }

    /**
     * Sets the record currently being read.
     *
     * @param currentRecord Record to remember as current.
     * @return The record just passed in.
     */
    protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
        this.currentRecord = currentRecord;
        return currentRecord;
    }

    /**
     * @return The Archive file input stream, or null if none is set.
     */
    protected InputStream getIn() {
        return in;
    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -