⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 experimentalwarcwriter.java

📁 这是个爬虫和lucece相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*  $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $ * * Created on July 27th, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io.warc;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.URI;import java.net.URISyntaxException;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.concurrent.atomic.AtomicInteger;import org.archive.io.UTF8Bytes;import org.archive.io.WriterPoolMember;import org.archive.uid.GeneratorFactory;import org.archive.util.ArchiveUtils;import org.archive.util.anvl.ANVLRecord;/** * <b>Experimental</b> WARC implementation. * * <p>Assumption is that the caller is managing access to this * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance * at any one time. *  * <p>While being written, WARCs have a '.open' suffix appended. * * @author stack * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $ */public class ExperimentalWARCWriter extends WriterPoolMemberimplements WARCConstants {    /**     * Buffer to reuse writing streams.     */    private final byte [] readbuffer = new byte[16 * 1024];        /**     * NEWLINE as bytes.     */    public static byte [] CRLF_BYTES;    static {        try {            CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);        } catch(Exception e) {            e.printStackTrace();        }    };        /**     * Metadata.     * TODO: Exploit writing warcinfo record.  Currently unused.     */    private final List fileMetadata;            /**     * Shutdown Constructor     * Has default access so can make instance to test utility methods.     */    ExperimentalWARCWriter() {        this(null, null, "", "", true, -1, null);    }        /**     * Constructor.     * Takes a stream. Use with caution. There is no upperbound check on size.     * Will just keep writing.  Only pass Streams that are bounded.      * @param serialNo  used to generate unique file name sequences     * @param out Where to write.     * @param f File the <code>out</code> is connected to.     * @param cmprs Compress the content written.     * @param a14DigitDate If null, we'll write current time.     * @throws IOException     */    public ExperimentalWARCWriter(final AtomicInteger serialNo,    		final OutputStream out, final File f,    		final boolean cmprs, final String a14DigitDate,            final List warcinfoData)    throws IOException {        super(serialNo, out, f, cmprs, a14DigitDate);        this.fileMetadata = warcinfoData;    }                /**     * Constructor.     *     * @param dirs Where to drop files.     * @param prefix File prefix to use.     * @param cmprs Compress the records written.      * @param maxSize Maximum size for ARC files written.     * @param suffix File tail to use.  If null, unused.     * @param warcinfoData File metadata for warcinfo record.     */    public ExperimentalWARCWriter(final AtomicInteger serialNo,    		final List<File> dirs, final String prefix,             final String suffix, final boolean cmprs,            final long maxSize, final List warcinfoData) {        super(serialNo, dirs, prefix, suffix, cmprs, maxSize,        	WARC_FILE_EXTENSION);        this.fileMetadata = warcinfoData;    }        @Override    protected String createFile(File file) throws IOException {    	String filename = super.createFile(file);    	writeWarcinfoRecord(filename);        return filename;    }        protected void baseCharacterCheck(final char c, final String parameter)    throws IOException {        // TODO: Too strict?  UNICODE control characters?        if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {            throw new IOException("Contains illegal character 0x" +                Integer.toHexString(c) + ": " + parameter);        }    }        protected String checkHeaderValue(final String value)    throws IOException {        for (int i = 0; i < value.length(); i++) {        	final char c = value.charAt(i);        	baseCharacterCheck(c, value);        	if (Character.isWhitespace(c)) {                throw new IOException("Contains disallowed white space 0x" +                    Integer.toHexString(c) + ": " + value);        	}        }        return value;    }        protected String checkHeaderLineMimetypeParameter(final String parameter)    throws IOException {    	StringBuilder sb = new StringBuilder(parameter.length());    	boolean wasWhitespace = false;        for (int i = 0; i < parameter.length(); i++) {        	char c = parameter.charAt(i);        	if (Character.isWhitespace(c)) {        		// Map all to ' ' and collapse multiples into one.        		// TODO: Make sure white space occurs in legal location --        		// before parameter or inside quoted-string.        		if (wasWhitespace) {        			continue;        		}        		wasWhitespace = true;        		c = ' ';        	} else {        		wasWhitespace = false;        		baseCharacterCheck(c, parameter);        	}        	sb.append(c);        }                return sb.toString();    }    protected String createRecordHeader(final String type,    		final String url, final String create14DigitDate,    		final String mimetype, final URI recordId,    		final ANVLRecord xtraHeaders, final long contentLength)    throws IOException {    	final StringBuilder sb =    		new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);    	sb.append(WARC_ID).append(CRLF);        sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type).            append(CRLF);        // Do not write a subject-uri if not one present.        if (url != null && url.length() > 0) {            sb.append(HEADER_KEY_URI).append(COLON_SPACE).                append(checkHeaderValue(url)).append(CRLF);        }        sb.append(HEADER_KEY_DATE).append(COLON_SPACE).            append(create14DigitDate).append(CRLF);        if (xtraHeaders != null) {            for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) {                sb.append(i.next()).append(CRLF);            }        }        // TODO: Is MIME Version needed.        sb.append(MIME_VERSION).append(CRLF);        sb.append(CONTENT_ID).append(COLON_SPACE).append('<').            append(recordId.toString()).append('>').append(CRLF);        if (contentLength > 0) {            sb.append(CONTENT_TYPE).append(COLON_SPACE).append(                checkHeaderLineMimetypeParameter(mimetype)).append(CRLF);        }        sb.append(CONTENT_LENGTH).append(COLON_SPACE).            append(Long.toString(contentLength)).append(CRLF);    	    	return sb.toString();    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -