📄 replaycharsequencefactory.java
字号:
/* ReplayCharSequenceFactory * * Created on Mar 8, 2004 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.io;import java.io.BufferedWriter;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStreamWriter;import java.io.RandomAccessFile;import java.io.UnsupportedEncodingException;import java.io.Writer;import java.nio.ByteBuffer;import java.nio.CharBuffer;import java.nio.channels.FileChannel;import java.nio.charset.Charset;import java.nio.charset.CharsetDecoder;import java.nio.charset.CoderResult;import java.nio.charset.CodingErrorAction;import java.nio.charset.IllegalCharsetNameException;import java.nio.charset.UnsupportedCharsetException;import java.util.logging.Level;import java.util.logging.Logger;import org.archive.util.DevUtils;/** * Factory that returns a ReplayCharSequence view on to a recording stream. * * This factory encapsulates the decision-making figuring which * ReplayCharSequence to return whether the single byte or multibyte handling * ReplayCharSequence implementations. Get instance of this factory * using {@link #getInstance()} and then call * {@link #getReplayCharSequence(byte [], long, long, String, String)}. * * @author stack * @version $Revision: 1.39 $, $Date: 2006/06/01 05:58:37 $ */public class ReplayCharSequenceFactory { /** * Logger. * * Logger used by this factory and by the ReplayCharSequence's returned. */ protected static Logger logger = Logger.getLogger("org.archive.io.ReplayCharSequenceFactory"); /** * Singleton instance of this factory. */ private static final ReplayCharSequenceFactory factory = new ReplayCharSequenceFactory(); /** * Private constructor. * * Private ensures only one singleton instance. */ private ReplayCharSequenceFactory() { super(); } /** * @return Instance of the singleton ReplayCharSequenceFactory. */ public static ReplayCharSequenceFactory getInstance() { return ReplayCharSequenceFactory.factory; } /** * Return appropriate ReplayCharSequence switching off passed encoding. * * We look at the encoding and try to figure whether to pass back a * byte-orientated ReplayCharSequence or a character-orientated * ReplayCharStream. * * @param buffer In-memory buffer of recordings prefix. We read from here * first and will only go to the backing file if <code>size</code> requested * is greater than <code>buffer.length</code>. * @param size Total size of stream to replay in bytes. Used to find EOS. * This is total length of content including HTTP headers if present. * @param responseBodyStart Where the response body starts in bytes. Used to * skip over the HTTP headers if present. * @param backingFilename Full path to backing file with content in excess * of whats in <code>buffer</code>. * @param encoding Encoding to use reading the passed prefix buffer and * backing file. For now, should be java canonical name for the encoding. * (If null is passed, we will default to ByteReplayCharSequence). * * @return A ReplayCharSequence. * * @throws IOException Problems accessing backing file or writing new file * of the decoded content. */ public ReplayCharSequence getReplayCharSequence(byte[] buffer, long size, long responseBodyStart, String backingFilename, String encoding) throws IOException { checkParameters(buffer, size, responseBodyStart); ReplayCharSequence rcs = null; if (isMultibyteEncoding(encoding)) { rcs = new MultiByteReplayCharSequence(buffer, size, responseBodyStart, backingFilename, encoding); } else { rcs = new ByteReplayCharSequence(buffer, size, responseBodyStart, backingFilename); } return rcs; } /** * Make decision as to whether encoding warrants single-byte replay char * sequence or multi-byte. * * @param encoding Encoding to use reading the passed prefix buffer and * backing file. For now, should be java canonical name for the encoding. * (If null is passed, we will default to ByteReplayCharSequence). * * @return True if multibyte encoding. */ private boolean isMultibyteEncoding(String encoding) { boolean isMultibyte = false; final Charset cs; try { if (encoding != null && encoding.length() > 0) { cs = Charset.forName(encoding); if(cs.canEncode()) { isMultibyte = cs.newEncoder().maxBytesPerChar() > 1; } else { isMultibyte = false; logger.info("Encoding not fully supported: " + encoding + ". Defaulting to single byte."); } } } catch (IllegalCharsetNameException e) { // Unsupported encoding. Default to singlebyte. isMultibyte = false; logger.info("Illegal encoding name: " + encoding + ". Defaulting to single byte."); } catch (UnsupportedCharsetException e) { // Unsupported encoding. Default to singlebyte. isMultibyte = false; logger.info("Unsupported encoding " + encoding + ". Defaulting to single byte."); } logger.fine("Encoding " + encoding + " is multibyte: " + ((isMultibyte) ? Boolean.TRUE : Boolean.FALSE)); return isMultibyte; } /** * Test passed arguments. * * @param buffer In-memory buffer of recordings prefix. We read from here * first and will only go to the backing file if <code>size</code> requested * is greater than <code>buffer.length</code>. * @param size Total size of stream to replay in bytes. Used to find EOS. * This is total length of content including HTTP headers if present. * @param responseBodyStart Where the response body starts in bytes. Used to * skip over the HTTP headers if present. * * @throws IllegalArgumentException Thrown if passed an illegal argument. */ protected void checkParameters(byte[] buffer, long size, long responseBodyStart) throws IllegalArgumentException { if (responseBodyStart > size) { throw new IllegalArgumentException("Illegal response body offset" + " of " + responseBodyStart + " whereas size is only " + size); } if (responseBodyStart > Integer.MAX_VALUE) { // A value of this size will mess up math below. throw new IllegalArgumentException("Response body start " + " of " + responseBodyStart + " > Integer.MAX_VALUE."); } if (responseBodyStart > buffer.length) { logger.log(Level.WARNING, "Unexpected response body offset " + responseBodyStart + ",\n" + "beyond the first buffer of length "+buffer.length+".\n" + "Thread: "+ Thread.currentThread().getName() + "\n"); } if ((size - responseBodyStart) > Integer.MAX_VALUE) { throw new IllegalArgumentException("Length is bigger than we can" + " handle: " + (size - responseBodyStart)); } } /** * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix * buffer and overflow backing file). * * Treats the byte stream as 8-bit. * * <p>Uses a wraparound rolling buffer of the last windowSize bytes read * from disk in memory; as long as the 'random access' of a CharSequence * user stays within this window, access should remain fairly efficient. * (So design any regexps pointed at these CharSequences to work within * that range!) * * <p>When rereading of a location is necessary, the whole window is * recentered around the location requested. (TODO: More research * into whether this is the best strategy.) * * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one * to wrap the passed prefix buffer and the second, a memory-mapped * ByteBuffer view into the backing file -- was consistently slower: ~10%. * My tests did the following. Made a buffer filled w/ regular content. * This buffer was used as the prefix buffer. The buffer content was * written MULTIPLER times to a backing file. I then did accesses w/ the * following pattern: Skip forward 32 bytes, then back 16 bytes, and then * read forward from byte 16-32. Repeat. Though I varied the size of the * buffer to the size of the backing file,from 3-10, the difference of 10% * or so seemed to persist. Same if I tried to favor get() over get(index). * I used a profiler, JMP, to study times taken (St.Ack did above comment). * * <p>TODO determine in memory mapped files is better way to do this; * probably not -- they don't offer the level of control over * total memory used that this approach does. * * @author Gordon Mohr * @version $Revision: 1.39 $, $Date: 2006/06/01 05:58:37 $ */ private class ByteReplayCharSequence implements ReplayCharSequence { /** * Buffer that holds the first bit of content. * * Once this is exhausted we go to the backing file. */ private byte[] prefixBuffer; /** * Total length of character stream to replay minus the HTTP headers * if present. * * Used to find EOS. */ protected int length; /** * Absolute length of the stream. * * Includes HTTP headers. Needed doing calc. in the below figuring * how much to load into buffer. */ private int absoluteLength = -1; /** * Buffer window on to backing file. */ private byte[] wraparoundBuffer; /** * Absolute index into underlying bytestream where wrap starts. */ private int wrapOrigin; /** * Index in wraparoundBuffer that corresponds to wrapOrigin */ private int wrapOffset; /** * Name of backing file we go to when we've exhausted content from the * prefix buffer. */ private String backingFilename; /** * Random access to the backing file. */ private RandomAccessFile raFile; /** * Offset into prefix buffer at which content beings. */ private int contentOffset; /** * 8-bit encoding used reading single bytes from buffer and * stream. */ private static final String DEFAULT_SINGLE_BYTE_ENCODING = "ISO-8859-1"; /** * Constructor. * * @param buffer In-memory buffer of recordings prefix. We read from * here first and will only go to the backing file if <code>size</code> * requested is greater than <code>buffer.length</code>. * @param size Total size of stream to replay in bytes. Used to find * EOS. This is total length of content including HTTP headers if * present. * @param responseBodyStart Where the response body starts in bytes. * Used to skip over the HTTP headers if present. * @param backingFilename Path to backing file with content in excess of * whats in <code>buffer</code>. * * @throws IOException */ private ByteReplayCharSequence(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { this.length = (int)(size - responseBodyStart); this.absoluteLength = (int)size; this.prefixBuffer = buffer; this.contentOffset = (int)responseBodyStart; // If amount to read is > than what is in our prefix buffer, then // open the backing file. if (size > buffer.length) { this.backingFilename = backingFilename; this.raFile = new RandomAccessFile(backingFilename, "r"); this.wraparoundBuffer = new byte[this.prefixBuffer.length]; this.wrapOrigin = this.prefixBuffer.length; this.wrapOffset = 0; loadBuffer();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -