📄 ARCReader.java
/* $Id: ARCReader.java,v 1.72 2006/08/25 17:34:38 stack-sf Exp $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;

/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file. Pass it a file path
 * or a URL to an ARC. It can parse ARC Versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code>s,
 * though {@link Iterator#next()} is declared as returning
 * java.lang.Object. Cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much. TODO: Test more. Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date: 2006/08/25 17:34:38 $ $Revision: 1.72 $
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());
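    /*
     * A minimal usage sketch. It assumes the companion ARCReaderFactory
     * class in this package hands back a concrete reader; per the class
     * javadoc above, Iterator#next() returns Object, so cast each record:
     *
     *   ARCReader reader = ARCReaderFactory.get(new File("path/to/file.arc"));
     *   for (Iterator i = reader.iterator(); i.hasNext();) {
     *       ARCRecord record = (ARCRecord)i.next();
     *       System.out.println(record.getHeader().getUrl());
     *   }
     *   reader.close();
     */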
    /**
     * Set to true if we are aligned on first record of Archive file.
     * We used to depend on offset: if offset was zero, then we were
     * aligned on first record. This is no longer necessarily the case when
     * a Reader is created at an offset into an Archive file: the offset is
     * zero but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100k, which seems massive, but it's the same as LINE_LENGTH
     * from <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH     (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String [] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };

    /**
     * An array of the header field names found in the ARC file header on
     * the 3rd line.
     *
     * We used to read these in from the arc file first record 3rd line, but
     * now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     * @param record
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread. We're probably in the next record.
                    // There is no way of telling for sure. It may be dross
                    // at the end of the current record. Backup.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * Encapsulates housekeeping that has to do w/ creating a new record.
     *
     * <p>Call this method at the end of the constructor to read in the
     * arcfile header. There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     *
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one, the
            // record of ARC file meta info. It's special. In ARC versions
            // 1.x, the first record has three lines of meta info. We've just
            // read the first line. There are two more. The second line has
            // misc. info. We're only interested in the first field, the
            // version number. The third line is the list of field names.
            // Here's what ARC file version 1.x meta content looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion((String)secondLineValues.get(0) + "." +
                (String)secondLineValues.get(1));
            // Just read over the 3rd line. We used to parse it and use
            // values found here, but now we just hardcode them to avoid
            // having to read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
                    firstLineValues, getVersion(), offset),
                bodyOffset, isDigest(), isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }
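    /*
     * Random-access sketch: records can also be fetched by absolute
     * position, assuming the get(long) accessor on the base ArchiveReader.
     * Illustrative only; recordOffset is a hypothetical variable:
     *
     *   ARCRecord record = (ARCRecord)reader.get(recordOffset);
     *
     * In that case the file-meta first record is never read, so
     * getVersion() below falls back to its "1.1" default.
     */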
    /**
     * Returns version of this ARC file. Usually read from first record of
     * ARC. If we're reading without having first read the first record --
     * e.g. random access into the middle of an ARC -- then the version will
     * not have been set. For now, we return a default, version 1.1. Later,
     * if there is more than just one version of ARC, we could look at such
     * as the meta line to see what version of ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as a list of tokens.
     *
     * We keep reading till we find a LINE_SEPARATOR, or we reach the end
     * of file w/o finding a LINE_SEPARATOR, or the line length is crazy.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        // TODO: Replace StringBuffer with something more lightweight. We
        // burn a lot of our parse CPU in this method.
        StringBuffer buffer = new StringBuffer(2048 + 20);
        int read = 0;
        for (int c = -1; true;) {
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + "). Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer. Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (list != null) {
                    list.add(buffer.toString());
                }
                buffer = new StringBuffer();
            } else {
                buffer.append((char)c);
            }
        }

        // List must have at least 3 elements in it and no more than 100. If
        // it has other than this, then bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }
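    /*
     * Worked example with made-up values: a record meta line such as
     *
     *   http://example.com/ 192.0.2.1 20040107015752 text/html 5000
     *
     * tokenizes via getTokenizedHeaderLine into the five values
     * [http://example.com/, 192.0.2.1, 20040107015752, text/html, 5000],
     * which computeMetaData below pairs off against headerFieldNameKeys
     * (URL, IP-address, Archive-date, Content-type, Archive-length).
     */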
    /**
     * Compute metadata fields.
     *
     * Here we check that the meta line has the right number of fields in it.
     *
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the number of keys doesn't match the number
     * of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInMetadataLine(values, keys.size());
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note that field was fixed on stderr.
            logStdErr(Level.WARNING, "Fixed spaces in metadata URL." +
                " Original: " + originalValues + ", New: " + values);
        }

        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Add a check for tabs in URLs. If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            // The listing is cut off mid-statement here; the put is completed
            // below with the tab-to-'%09' replacement the comment above
            // describes, via the TextUtils imported at the top of this file.
            // This completion is inferred, not copied from the original page.
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }
        // (The remainder of computeMetaData, and of the file, is truncated
        // in this listing.)
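    /*
     * Example of the lenient fix-up path above, with made-up values: a meta
     * line whose URL contains a space, e.g.
     *
     *   http://example.com/a b.html 192.0.2.1 20040107015752 text/html 5000
     *
     * tokenizes into six values against five keys. When not in strict mode,
     * fixSpaceInMetadataLine folds the extra token back into the URL field
     * so that keys and values line up again, as the "Fixed spaces in
     * metadata URL" warning records.
     */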