📄 ARCReader.java
/* $Id: ARCReader.java,v 1.72 2006/08/25 17:34:38 stack-sf Exp $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;

/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file. Pass it a file path
 * or a URL to an ARC. It can parse ARC Versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code>s,
 * though {@link Iterator#next()} is declared as returning
 * java.lang.Object. Cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much. TODO: Test more. Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date: 2006/08/25 17:34:38 $ $Revision: 1.72 $
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());
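    /*
     * A minimal usage sketch. It assumes the companion ARCReaderFactory
     * class in this package hands back a concrete reader; per the class
     * javadoc above, Iterator#next() returns Object, so cast each record:
     *
     *   ARCReader reader = ARCReaderFactory.get(new File("path/to/file.arc"));
     *   for (Iterator i = reader.iterator(); i.hasNext();) {
     *       ARCRecord record = (ARCRecord)i.next();
     *       System.out.println(record.getHeader().getUrl());
     *   }
     *   reader.close();
     */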
    /**
     * Set to true if we are aligned on first record of Archive file.
     * We used to depend on offset: if offset was zero, then we were
     * aligned on first record. This is no longer necessarily the case when
     * a Reader is created at an offset into an Archive file: the offset is
     * zero but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100k, which seems massive, but it's the same as LINE_LENGTH
     * from <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH     (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String [] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };

    /**
     * An array of the header field names found in the ARC file header on
     * the 3rd line.
     *
     * We used to read these in from the arc file first record 3rd line, but
     * now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     * @param record
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread. We're probably in the next record.
                    // There is no way of telling for sure. It may be dross
                    // at the end of the current record. Backup.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * Encapsulates housekeeping that has to do w/ creating a new record.
     *
     * <p>Call this method at the end of the constructor to read in the
     * arcfile header. There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     *
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one, the
            // record of ARC file meta info. It's special. In ARC versions
            // 1.x, the first record has three lines of meta info. We've just
            // read the first line. There are two more. The second line has
            // misc. info. We're only interested in the first field, the
            // version number. The third line is the list of field names.
            // Here's what ARC file version 1.x meta content looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion((String)secondLineValues.get(0) + "." +
                (String)secondLineValues.get(1));
            // Just read over the 3rd line. We used to parse it and use
            // values found here, but now we just hardcode them to avoid
            // having to read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
                    firstLineValues, getVersion(), offset),
                bodyOffset, isDigest(), isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }
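    /*
     * Random-access sketch: records can also be fetched by absolute
     * position, assuming the get(long) accessor on the base ArchiveReader.
     * Illustrative only; recordOffset is a hypothetical variable:
     *
     *   ARCRecord record = (ARCRecord)reader.get(recordOffset);
     *
     * In that case the file-meta first record is never read, so
     * getVersion() below falls back to its "1.1" default.
     */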
    /**
     * Returns version of this ARC file. Usually read from first record of
     * ARC. If we're reading without having first read the first record --
     * e.g. random access into the middle of an ARC -- then the version will
     * not have been set. For now, we return a default, version 1.1. Later,
     * if there is more than just one version of ARC, we could look at such
     * as the meta line to see what version of ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as a list of tokens.
     *
     * We keep reading till we find a LINE_SEPARATOR, or we reach the end
     * of file w/o finding a LINE_SEPARATOR, or the line length is crazy.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        // TODO: Replace StringBuffer with something more lightweight. We
        // burn a lot of our parse CPU in this method.
        StringBuffer buffer = new StringBuffer(2048 + 20);
        int read = 0;
        for (int c = -1; true;) {
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + "). Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer. Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (list != null) {
                    list.add(buffer.toString());
                }
                buffer = new StringBuffer();
            } else {
                buffer.append((char)c);
            }
        }

        // List must have at least 3 elements in it and no more than 100. If
        // it has other than this, then bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }
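    /*
     * Worked example with made-up values: a record meta line such as
     *
     *   http://example.com/ 192.0.2.1 20040107015752 text/html 5000
     *
     * tokenizes via getTokenizedHeaderLine into the five values
     * [http://example.com/, 192.0.2.1, 20040107015752, text/html, 5000],
     * which computeMetaData below pairs off against headerFieldNameKeys
     * (URL, IP-address, Archive-date, Content-type, Archive-length).
     */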
    /**
     * Compute metadata fields.
     *
     * Here we check that the meta line has the right number of fields in it.
     *
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the number of keys doesn't match the number
     * of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInMetadataLine(values, keys.size());
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note that field was fixed on stderr.
            logStdErr(Level.WARNING, "Fixed spaces in metadata URL." +
                " Original: " + originalValues + ", New: " + values);
        }

        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Add a check for tabs in URLs. If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            // The listing is cut off mid-statement here; the put is completed
            // below with the tab-to-'%09' replacement the comment above
            // describes, via the TextUtils imported at the top of this file.
            // This completion is inferred, not copied from the original page.
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }
        // (The remainder of computeMetaData, and of the file, is truncated
        // in this listing.)
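    /*
     * Example of the lenient fix-up path above, with made-up values: a meta
     * line whose URL contains a space, e.g.
     *
     *   http://example.com/a b.html 192.0.2.1 20040107015752 text/html 5000
     *
     * tokenizes into six values against five keys. When not in strict mode,
     * fixSpaceInMetadataLine folds the extra token back into the URL field
     * so that keys and values line up again, as the "Fixed spaces in
     * metadata URL" warning records.
     */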