📄 replaycharsequencefactory.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* ReplayCharSequenceFactory * * Created on Mar 8, 2004 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io;import java.io.BufferedWriter;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStreamWriter;import java.io.RandomAccessFile;import java.io.UnsupportedEncodingException;import java.io.Writer;import java.nio.ByteBuffer;import java.nio.CharBuffer;import java.nio.channels.FileChannel;import java.nio.charset.Charset;import java.nio.charset.CharsetDecoder;import java.nio.charset.CoderResult;import java.nio.charset.CodingErrorAction;import java.nio.charset.IllegalCharsetNameException;import java.nio.charset.UnsupportedCharsetException;import java.util.logging.Level;import java.util.logging.Logger;import org.archive.util.DevUtils;/** * Factory that returns a ReplayCharSequence view on to a recording stream. * * This factory encapsulates the decision-making figuring which * ReplayCharSequence to return whether the single byte or multibyte handling * ReplayCharSequence implementations.  Get instance of this factory * using {@link #getInstance()} and then call * {@link #getReplayCharSequence(byte [], long, long, String, String)}. * * @author stack * @version $Revision: 1.39 $, $Date: 2006/06/01 05:58:37 $ */public class ReplayCharSequenceFactory {    /**     * Logger.     *     * Logger used by this factory and by the ReplayCharSequence's returned.     */    protected static Logger logger =        Logger.getLogger("org.archive.io.ReplayCharSequenceFactory");    /**     * Singleton instance of this factory.     */    private static final ReplayCharSequenceFactory factory =        new ReplayCharSequenceFactory();    /**     * Private constructor.     *     * Private ensures only one singleton instance.     */    private ReplayCharSequenceFactory() {        super();    }    /**     * @return Instance of the singleton ReplayCharSequenceFactory.     */    public static ReplayCharSequenceFactory getInstance() {        return ReplayCharSequenceFactory.factory;    }    /**     * Return appropriate ReplayCharSequence switching off passed encoding.     *     * We look at the encoding and try to figure whether to pass back a     * byte-orientated ReplayCharSequence or a character-orientated     * ReplayCharStream.     *     * @param buffer In-memory buffer of recordings prefix.  We read from here     * first and will only go to the backing file if <code>size</code> requested     * is greater than <code>buffer.length</code>.     * @param size Total size of stream to replay in bytes.  Used to find EOS.     * This is total length of content including HTTP headers if present.     * @param responseBodyStart Where the response body starts in bytes. Used to     * skip over the HTTP headers if present.     * @param backingFilename Full path to backing file with content in excess     * of whats in <code>buffer</code>.     * @param encoding Encoding to use reading the passed prefix buffer and     * backing file.  For now, should be java canonical name for the encoding.     * (If null is passed, we will default to ByteReplayCharSequence).     *     * @return A ReplayCharSequence.     *     * @throws IOException Problems accessing backing file or writing new file     * of the decoded content.     */    public ReplayCharSequence getReplayCharSequence(byte[] buffer, long size,                long responseBodyStart, String backingFilename, String encoding)        throws IOException {        checkParameters(buffer, size, responseBodyStart);        ReplayCharSequence rcs = null;        if (isMultibyteEncoding(encoding)) {            rcs = new MultiByteReplayCharSequence(buffer, size,                responseBodyStart, backingFilename, encoding);        } else {            rcs = new ByteReplayCharSequence(buffer, size, responseBodyStart,                backingFilename);        }        return rcs;    }        /**     * Make decision as to whether encoding warrants single-byte replay char     * sequence or multi-byte.     *     * @param encoding Encoding to use reading the passed prefix buffer and     * backing file.  For now, should be java canonical name for the encoding.     * (If null is passed, we will default to ByteReplayCharSequence).     *     * @return True if multibyte encoding.     */    private boolean isMultibyteEncoding(String encoding) {        boolean isMultibyte = false;        final Charset cs;        try {            if (encoding != null && encoding.length() > 0) {                cs = Charset.forName(encoding);                if(cs.canEncode()) {                    isMultibyte = cs.newEncoder().maxBytesPerChar() > 1;                } else {                    isMultibyte = false;                    logger.info("Encoding not fully supported: " + encoding                            + ".  Defaulting to single byte.");                }            }        } catch (IllegalCharsetNameException e) {            // Unsupported encoding.  Default to singlebyte.            isMultibyte = false;            logger.info("Illegal encoding name: " + encoding                + ".  Defaulting to single byte.");        } catch (UnsupportedCharsetException e) {            // Unsupported encoding.  Default to singlebyte.            isMultibyte = false;            logger.info("Unsupported encoding " + encoding                + ".  Defaulting to single byte.");        }        logger.fine("Encoding " + encoding + " is multibyte: "            + ((isMultibyte) ? Boolean.TRUE : Boolean.FALSE));                return isMultibyte;    }    /**     * Test passed arguments.     *     * @param buffer In-memory buffer of recordings prefix.  We read from here     * first and will only go to the backing file if <code>size</code> requested     * is greater than <code>buffer.length</code>.     * @param size Total size of stream to replay in bytes.  Used to find EOS.     * This is total length of content including HTTP headers if present.     * @param responseBodyStart Where the response body starts in bytes. Used to     * skip over the HTTP headers if present.     *     * @throws IllegalArgumentException Thrown if passed an illegal argument.     */    protected void checkParameters(byte[] buffer, long size,            long responseBodyStart)        throws IllegalArgumentException {        if (responseBodyStart > size) {            throw new IllegalArgumentException("Illegal response body offset" +                " of " + responseBodyStart + " whereas size is only " + size);        }        if (responseBodyStart > Integer.MAX_VALUE) {            // A value of this size will mess up math below.            throw new IllegalArgumentException("Response body start " +                " of " + responseBodyStart + " > Integer.MAX_VALUE.");        }        if (responseBodyStart > buffer.length) {            logger.log(Level.WARNING,                "Unexpected response body offset " + responseBodyStart + ",\n" +                "beyond the first buffer of length "+buffer.length+".\n" +                "Thread: "+ Thread.currentThread().getName() + "\n");        }        if ((size - responseBodyStart) > Integer.MAX_VALUE) {            throw new IllegalArgumentException("Length is bigger than we  can" +               " handle: " + (size - responseBodyStart));        }    }    /**     * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix     * buffer and overflow backing file).     *     * Treats the byte stream as 8-bit.     *     * <p>Uses a wraparound rolling buffer of the last windowSize bytes read     * from disk in memory; as long as the 'random access' of a CharSequence     * user stays within this window, access should remain fairly efficient.     * (So design any regexps pointed at these CharSequences to work within     * that range!)     *     * <p>When rereading of a location is necessary, the whole window is     * recentered around the location requested. (TODO: More research     * into whether this is the best strategy.)     *     * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one     * to wrap the passed prefix buffer and the second, a memory-mapped     * ByteBuffer view into the backing file -- was consistently slower: ~10%.     * My tests did the following. Made a buffer filled w/ regular content.     * This buffer was used as the prefix buffer.  The buffer content was     * written MULTIPLER times to a backing file.  I then did accesses w/ the     * following pattern: Skip forward 32 bytes, then back 16 bytes, and then     * read forward from byte 16-32.  Repeat.  Though I varied the size of the     * buffer to the size of the backing file,from 3-10, the difference of 10%     * or so seemed to persist.  Same if I tried to favor get() over get(index).     * I used a profiler, JMP, to study times taken (St.Ack did above comment).     *     * <p>TODO determine in memory mapped files is better way to do this;     * probably not -- they don't offer the level of control over     * total memory used that this approach does.     *     * @author Gordon Mohr     * @version $Revision: 1.39 $, $Date: 2006/06/01 05:58:37 $     */    private class ByteReplayCharSequence implements ReplayCharSequence {        /**         * Buffer that holds the first bit of content.         *         * Once this is exhausted we go to the backing file.         */        private byte[] prefixBuffer;        /**         * Total length of character stream to replay minus the HTTP headers         * if present.         *         * Used to find EOS.         */        protected int length;        /**         * Absolute length of the stream.         *         * Includes HTTP headers.  Needed doing calc. in the below figuring         * how much to load into buffer.         */        private int absoluteLength = -1;        /**         * Buffer window on to backing file.         */        private byte[] wraparoundBuffer;        /**         * Absolute index into underlying bytestream where wrap starts.         */        private int wrapOrigin;        /**         * Index in wraparoundBuffer that corresponds to wrapOrigin         */        private int wrapOffset;        /**         * Name of backing file we go to when we've exhausted content from the         * prefix buffer.         */        private String backingFilename;        /**         * Random access to the backing file.         */        private RandomAccessFile raFile;        /**         * Offset into prefix buffer at which content beings.         */        private int contentOffset;        /**         * 8-bit encoding used reading single bytes from buffer and         * stream.         */        private static final String DEFAULT_SINGLE_BYTE_ENCODING =            "ISO-8859-1";        /**         * Constructor.         *         * @param buffer In-memory buffer of recordings prefix.  We read from         * here first and will only go to the backing file if <code>size</code>         * requested is greater than <code>buffer.length</code>.         * @param size Total size of stream to replay in bytes.  Used to find         * EOS. This is total length of content including HTTP headers if         * present.         * @param responseBodyStart Where the response body starts in bytes.         * Used to skip over the HTTP headers if present.         * @param backingFilename Path to backing file with content in excess of         * whats in <code>buffer</code>.         *         * @throws IOException         */        private ByteReplayCharSequence(byte[] buffer, long size,                long responseBodyStart, String backingFilename)            throws IOException {            this.length = (int)(size - responseBodyStart);            this.absoluteLength = (int)size;            this.prefixBuffer = buffer;            this.contentOffset = (int)responseBodyStart;            // If amount to read is > than what is in our prefix buffer, then            // open the backing file.            if (size > buffer.length) {                this.backingFilename = backingFilename;                this.raFile = new RandomAccessFile(backingFilename, "r");                this.wraparoundBuffer = new byte[this.prefixBuffer.length];                this.wrapOrigin = this.prefixBuffer.length;                this.wrapOffset = 0;                loadBuffer();
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -