⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 replaycharsequencefactory.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
            }        }        /**         * @return Length of characters in stream to replay.  Starts counting         * at the HTTP header/body boundary.         */        public int length() {            return this.length;        }        /**         * Get character at passed absolute position.         *         * Called by {@link #charAt(int)} which has a relative index into the         * content, one that doesn't account for HTTP header if present.         *         * @param index Index into content adjusted to accomodate initial offset         * to get us past the HTTP header if present (i.e.         * {@link #contentOffset}).         *         * @return Characater at offset <code>index</code>.         */        public char charAt(int index) {            int c = -1;            // Add to index start-of-content offset to get us over HTTP header            // if present.            index += this.contentOffset;            if (index < this.prefixBuffer.length) {                // If index is into our prefix buffer.                c = this.prefixBuffer[index];            } else if (index >= this.wrapOrigin &&                (index - this.wrapOrigin) < this.wraparoundBuffer.length) {                // If index is into our buffer window on underlying backing file.                c = this.wraparoundBuffer[                        ((index - this.wrapOrigin) + this.wrapOffset) %                            this.wraparoundBuffer.length];            } else {                // Index is outside of both prefix buffer and our buffer window                // onto the underlying backing file.  Fix the buffer window                // location.                c = faultCharAt(index);            }            // Stream is treated as single byte.  Make sure characters returned            // are not negative.            return (char)(c & 0xff);        }        /**         * Get a character that's outside the current buffers.         
*         * will cause the wraparoundBuffer to be changed to         * cover a region including the index         *         * if index is higher than the highest index in the         * wraparound buffer, buffer is moved forward such         * that requested char is last item in buffer         *         * if index is lower than lowest index in the         * wraparound buffer, buffet is reset centered around         * index         *         * @param index Index of character to fetch.         * @return A character that's outside the current buffers         */        private int faultCharAt(int index) {            if(Thread.interrupted()) {                throw new RuntimeException("thread interrupted");            }            if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {                // Moving forward                while (index >= this.wrapOrigin + this.wraparoundBuffer.length)                {                    // TODO optimize this                    advanceBuffer();                }                return charAt(index - this.contentOffset);            }            // Moving backward            recenterBuffer(index);            return charAt(index - this.contentOffset);        }        /**         * Move the buffer window on backing file back centering current access         * position in middle of window.         *         * @param index Index of character to access.         */        private void recenterBuffer(int index) {            if (logger.isLoggable(Level.FINE)) {                logger.fine("Recentering around " + index + " in " +                    this.backingFilename);            }            this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);            if(this.wrapOrigin < this.prefixBuffer.length) {                this.wrapOrigin = this.prefixBuffer.length;            }            this.wrapOffset = 0;            loadBuffer();        }        /**         * Load from backing file into the wrapper buffer.         
*/        private void loadBuffer()        {            long len = -1;            try {                len = this.raFile.length();                this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);                this.raFile.readFully(this.wraparoundBuffer, 0,                    Math.min(this.wraparoundBuffer.length,                         this.absoluteLength - this.wrapOrigin));            }            catch (IOException e) {                // TODO convert this to a runtime error?                DevUtils.logger.log (                    Level.SEVERE,                    "raFile.seek(" +                    (this.wrapOrigin - this.prefixBuffer.length) +                    ")\n" +                    "raFile.readFully(wraparoundBuffer,0," +                    (Math.min(this.wraparoundBuffer.length,                        this.length - this.wrapOrigin )) +                    ")\n"+                    "raFile.length()" + len + "\n" +                    DevUtils.extraInfo(),                    e);                throw new RuntimeException(e);            }        }        /**         * Roll the wraparound buffer forward one position         */        private void advanceBuffer() {            try {                this.wraparoundBuffer[this.wrapOffset] =                    (byte)this.raFile.read();                this.wrapOffset++;                this.wrapOffset %= this.wraparoundBuffer.length;                this.wrapOrigin++;            } catch (IOException e) {                DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +                    DevUtils.extraInfo(), e);                throw new RuntimeException(e);            }        }        public CharSequence subSequence(int start, int end) {            return new CharSubSequence(this, start, end);        }        /**         * Cleanup resources.         *         * @exception IOException Failed close of random access file.         
*/        public void close() throws IOException        {            this.prefixBuffer = null;            if (this.raFile != null) {                this.raFile.close();                this.raFile = null;            }        }        /* (non-Javadoc)         * @see java.lang.Object#finalize()         */        protected void finalize() throws Throwable        {            super.finalize();            close();        }        /* (non-Javadoc)         * @see org.archive.io.EnhancedCharSequence#substring(int, int)         */        public String substring(int offset, int len) {            StringBuffer ret = new StringBuffer(len);            // Add to offset start-of-content offset to get us over HTTP header            // if present.            offset += this.contentOffset;            if (offset < this.prefixBuffer.length) {                // Need something from the prefix buffer.                int from = offset;                // To the end of the buffer                int count = this.prefixBuffer.length - from;                if (offset + len < this.prefixBuffer.length) {                    count = len; // length falls within the buffer.                } else {                    // Will need more then is in the prefixBuffer.                    offset = this.prefixBuffer.length + 1;                    len = len - count;                }                // Since we are dealing with a byte buffer we'll have to use                // a String and then wrap up in a StringBuffer to concat with                // the backing file. TODO: This can probably be optimized.                //                // Also, force an 8-bit encoding.  Default jvm encoding is                // usually -- us context -- 7 bit ascii.  If we don't force                // 8-bit, characters above 127 are considered rubbish.                
try {                    ret.append(new String(this.prefixBuffer,from,count,                        DEFAULT_SINGLE_BYTE_ENCODING));                }                catch (UnsupportedEncodingException e) {                    logger.severe("Failed encoding string: " + e.getMessage());                }            }            if (offset >= this.prefixBuffer.length) {                // TODO: Maybe better performance can be gained by reading                // blocks from files.                int to = offset + len;                for(int i = offset ; i < to ; i++) {                    ret.append(charAt(i - this.contentOffset));                }            }            return ret.toString();        }                public String toString() {            return substring(0, length());        }    }    /**     * Provides a (Replay)CharSequence view on recorded streams (a prefix     * buffer and overflow backing file) that can handle streams of multibyte     * characters.     *     * If possible, use {@link ByteReplayCharSequence}.  It performs better even     * for the single byte case (Decoding is an expensive process).     *     * <p>Call close on this class when done so can clean up resources.     *     * <p>Implementation currently works by checking to see if content to read     * all fits the in-memory buffer.  If so, we decode into a CharBuffer and     * keep this around for CharSequence operations.  This CharBuffer is     * discarded on close.     *     * <p>If content length is greater than in-memory buffer, we decode the     * buffer plus backing file into a new file named for the backing file w/     * a suffix of the encoding we write the file as. We then run w/ a     * memory-mapped CharBuffer against this file to implement CharSequence.     * Reasons for this implemenation are that CharSequence wants to return the     * length of the CharSequence.     
*
     *
     * <p>Obvious optimizations would keep around decodings whether the
     * in-memory decoded buffer or the file of decodings written to disk but the
     * general usage pattern processing URIs is that the decoding is used by one
     * processor only.  Also of note, files usually fit into the in-memory
     * buffer.
     *
     * <p>We might also be able to keep up 3 windows that moved across the file
     * decoding a window at a time trying to keep one of the buffers just in
     * front of the regex processing returning it a length that would be only
     * the length of current position to end of current block or else the length
     * could be got by multipling the backing files length by the decoders'
     * estimate of average character size.  This would save us writing out the
     * decoded file.  We'd have to do the latter for files that are
     * > Integer.MAX_VALUE.
     *
     * @author stack
     * @version $Revision: 1.39 $, $Date: 2006/06/01 05:58:37 $
     */
    private class MultiByteReplayCharSequence implements ReplayCharSequence {
        /**
         * Name of the encoding we use writing out the concatenated decoded
         * prefix buffer and decoded backing file.
         *
         * <p>This constant is also used as the suffix for the file that holds
         * the decodings: that file is named for the backing file with this
         * encoding name appended.
         *
         * <p>See <a href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
         */
        private static final String WRITE_ENCODING = "UTF-16BE";

        /**
         * CharBuffer of decoded content.
         *
         * Content of this buffer is unicode.  It is assigned in the
         * constructor from the result of <code>decode</code>.
         */
        private CharBuffer content = null;

        /**
         * File that has decoded content.
         *
         * Keep it around so we can remove it on close.
         */
        private File decodedFile = null;

        /**
         * Constructor.
*         * @param buffer In-memory buffer of recordings prefix.  We read from         * here first and will only go to the backing file if <code>size</code>         * requested is greater than <code>buffer.length</code>.         * @param size Total size of stream to replay in bytes.  Used to find         * EOS. This is total length of content including HTTP headers if         * present.         * @param responseBodyStart Where the response body starts in bytes.         * Used to skip over the HTTP headers if present.         * @param backingFilename Path to backing file with content in excess of         * whats in <code>buffer</code>.         * @param encoding Encoding to use reading the passed prefix buffer and         * backing file.  For now, should be java canonical name for the         * encoding. (If null is passed, we will default to         * ByteReplayCharSequence).         *         * @throws IOException         */        private MultiByteReplayCharSequence(byte[] buffer, long size,                long responseBodyStart, String backingFilename, String encoding)            throws IOException {            super();            if (encoding == null) {                throw new NullPointerException("Character encoding is null.");            }            this.content = decode(buffer, backingFilename, size,                responseBodyStart, encoding);         }        /**         * Decode passed buffer and backing file into a CharBuffer.         *         * This method writes a new file made of the decoded concatenation of         * the in-memory prefix buffer and the backing file.  Returns a         * charSequence view onto this new file.         *         * @param buffer In-memory buffer of recordings prefix.  We read from         * here first and will only go to the backing file if <code>size</code>         * requested is greater than <code>buffer.length</code>.         * @param size Total size of stream to replay in bytes.  Used to find         * EOS. 
         * This is total length of content including HTTP headers if
         * present.
         * @param responseBodyStart Where the response body starts in bytes.
         * Used to skip over the HTTP headers if present.
         * @param backingFilename Path to backing file with content in excess of
         * what is in <code>buffer</code>.
         * @param encoding Encoding to use reading the passed prefix buffer and
         * backing file.  For now, should be java canonical name for the
         * encoding. (If null is passed, we will default to
         * ByteReplayCharSequence).
         *

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -