⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 writerpoolmember.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* $Id: WriterPoolMember.java 5032 2007-04-02 22:02:14Z gojomo $ * * Created on July 21st, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.text.DecimalFormat;import java.text.NumberFormat;import java.util.Iterator;import java.util.List;import java.util.concurrent.atomic.AtomicInteger;import java.util.logging.Logger;import java.util.zip.GZIPOutputStream;import org.archive.util.ArchiveUtils;import org.archive.util.IoUtils;import org.archive.util.TimestampSerialno;/** * Member of {@link WriterPool}. * Implements rotating off files, file naming with some guarantee of * uniqueness, and position in file. Subclass to pick up functionality for a * particular Writer type. * @author stack * @version $Date: 2007-04-02 22:02:14 +0000 (Mon, 02 Apr 2007) $ $Revision: 5032 $ */public abstract class WriterPoolMember implements ArchiveFileConstants {    private final Logger logger = Logger.getLogger(this.getClass().getName());        public static final String UTF8 = "UTF-8";        /**     * Default file prefix.     
*
     * Stands for Internet Archive Heritrix.
     */
    public static final String DEFAULT_PREFIX = "IAH";

    /**
     * Value to interpolate with actual hostname.
     */
    public static final String HOSTNAME_VARIABLE = "${HOSTNAME}";

    /**
     * Default for file suffix: the {@link #HOSTNAME_VARIABLE} placeholder.
     */
    public static final String DEFAULT_SUFFIX = HOSTNAME_VARIABLE;

    /**
     * Reference to file we're currently writing.
     * Assigned by {@code createFile(File)}; {@code checkSize()} reads its
     * length to decide whether to rotate.
     */
    private File f = null;

    /**
     * Output stream for file.
     * While this is null {@code checkSize()} treats the member as having no
     * open file and creates one.
     */
    private OutputStream out = null;

    /**
     * File output stream.
     * This is needed so can get at channel to find current position in file.
     */
    private FileOutputStream fos;

    /**
     * When true, {@code createFile()} appends a '.' followed by
     * COMPRESSED_FILE_EXTENSION to the generated file name.
     */
    private final boolean compressed;

    /**
     * Candidate output directories; {@code createFile()} hands this list to
     * {@code getNextDirectory(List)} to pick where the next file goes.
     */
    private List<File> writeDirs = null;

    /**
     * Leading part of every generated file name (followed by '-').
     */
    private String prefix = DEFAULT_PREFIX;

    /**
     * Trailing part of the generated base name. Only added (preceded by '-')
     * when it is neither null nor empty.
     */
    private String suffix = DEFAULT_SUFFIX;

    /**
     * Size threshold checked by {@code checkSize()}: once the current file's
     * length exceeds it a new file is created. A value of -1 disables the
     * check.
     */
    private final long maxSize;

    /**
     * File extension placed after the base name, preceded by '.'.
     */
    private final String extension;

    /**
     * Creation date for the current file.
     * Set by {@link #createFile()}; holds a placeholder until then.
     */
	private String createTimestamp = "UNSET!!!";

    /**
     * A running sequence used making unique file names.
     */
    final private AtomicInteger serialNo;

    /**
     * Directories round-robin index.
     * Static, so a single index is shared by every instance of this class.
     */
    private static int roundRobinIndex = 0;

    /**
     * NumberFormat instance for formatting serial number.
     *
     * Pads serial number with zeros (pattern "00000": at least five digits).
     */
    private static NumberFormat serialNoFormatter = new DecimalFormat("00000");

    /**
     * Constructor.
     * Takes a stream. Use with caution. There is no upperbound check on size.
     * Will just keep writing.
     *
     * @param serialNo  used to create unique filename sequences
     * @param out Where to write.
     * @param file File the <code>out</code> is connected to.
     * @param cmprs Compress the content written.
     * @param a14DigitDate If null, we'll write current time.
* @throws IOException     */    protected WriterPoolMember(AtomicInteger serialNo,             final OutputStream out, final File file,            final boolean cmprs, String a14DigitDate)    throws IOException {        this(serialNo, null, null, cmprs, -1, null);        this.out = out;        this.f = file;    }        /**     * Constructor.     *     * @param serialNo  used to create unique filename sequences     * @param dirs Where to drop files.     * @param prefix File prefix to use.     * @param cmprs Compress the records written.      * @param maxSize Maximum size for ARC files written.     * @param extension Extension to give file.     */    public WriterPoolMember(AtomicInteger serialNo,             final List<File> dirs, final String prefix,             final boolean cmprs, final long maxSize, final String extension) {        this(serialNo, dirs, prefix, "", cmprs, maxSize, extension);    }                /**     * Constructor.     *     * @param serialNo  used to create unique filename sequences     * @param dirs Where to drop files.     * @param prefix File prefix to use.     * @param cmprs Compress the records written.      * @param maxSize Maximum size for ARC files written.     * @param suffix File tail to use.  If null, unused.     * @param extension Extension to give file.     */    public WriterPoolMember(AtomicInteger serialNo,            final List<File> dirs, final String prefix,             final String suffix, final boolean cmprs,            final long maxSize, final String extension) {        this.suffix = suffix;        this.prefix = prefix;        this.maxSize = maxSize;        this.writeDirs = dirs;        this.compressed = cmprs;        this.extension = extension;        this.serialNo = serialNo;    }	/**	 * Call this method just before/after any significant write.	 *	 * Call at the end of the writing of a record or just before we start	 * writing a new record.  
Will close current file and open a new file	 * if file size has passed out maxSize.	 * 	 * <p>Creates and opens a file if none already open.  One use of this method	 * then is after construction, call this method to add the metadata, then	 * call {@link #getPosition()} to find offset of first record.	 *	 * @exception IOException	 */    public void checkSize() throws IOException {        if (this.out == null ||                (this.maxSize != -1 && (this.f.length() > this.maxSize))) {            createFile();        }    }    /**     * Create a new file.     * Rotates off the current Writer and creates a new in its place     * to take subsequent writes.  Usually called from {@link #checkSize()}.     * @return Name of file created.     * @throws IOException     */    protected String createFile() throws IOException {        TimestampSerialno tsn = getTimestampSerialNo();        String name = this.prefix + '-' + getUniqueBasename(tsn) +            ((this.suffix == null || this.suffix.length() <= 0)?                "": "-" + this.suffix) + '.' + this.extension  +            ((this.compressed)? '.' + COMPRESSED_FILE_EXTENSION: "") +            OCCUPIED_SUFFIX;        this.createTimestamp = tsn.getTimestamp();        File dir = getNextDirectory(this.writeDirs);        return createFile(new File(dir, name));    }        protected String createFile(final File file) throws IOException {    	close();        this.f = file;        this.fos = new FileOutputStream(this.f);        this.out = new FastBufferedOutputStream(this.fos);        logger.info("Opened " + this.f.getAbsolutePath());        return this.f.getName();    }        /**     * @param dirs List of File objects that point at directories.     * @return Find next directory to write an arc too.  If more     * than one, it tries to round-robin through each in turn.     
* @throws IOException     */    protected File getNextDirectory(List<File> dirs)    throws IOException {        if (WriterPoolMember.roundRobinIndex >= dirs.size()) {            WriterPoolMember.roundRobinIndex = 0;        }        File d = null;        try {            d = checkWriteable((File)dirs.                get(WriterPoolMember.roundRobinIndex));        } catch (IndexOutOfBoundsException e) {            // Dirs list might be altered underneath us.            // If so, we get this exception -- just keep on going.        }        if (d == null && dirs.size() > 1) {            for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {                d = checkWriteable((File)i.next());            }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -