📄 experimentalwarcwriter.java
字号:
/* $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $ * * Created on July 27th, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.io.warc;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.URI;import java.net.URISyntaxException;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.concurrent.atomic.AtomicInteger;import org.archive.io.UTF8Bytes;import org.archive.io.WriterPoolMember;import org.archive.uid.GeneratorFactory;import org.archive.util.ArchiveUtils;import org.archive.util.anvl.ANVLRecord;/** * <b>Experimental</b> WARC implementation. * * <p>Assumption is that the caller is managing access to this * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance * at any one time. * * <p>While being written, WARCs have a '.open' suffix appended. * * @author stack * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $ */public class ExperimentalWARCWriter extends WriterPoolMemberimplements WARCConstants { /** * Buffer to reuse writing streams. */ private final byte [] readbuffer = new byte[16 * 1024]; /** * NEWLINE as bytes. */ public static byte [] CRLF_BYTES; static { try { CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING); } catch(Exception e) { e.printStackTrace(); } }; /** * Metadata. * TODO: Exploit writing warcinfo record. Currently unused. */ private final List fileMetadata; /** * Shutdown Constructor * Has default access so can make instance to test utility methods. */ ExperimentalWARCWriter() { this(null, null, "", "", true, -1, null); } /** * Constructor. * Takes a stream. Use with caution. There is no upperbound check on size. * Will just keep writing. Only pass Streams that are bounded. * @param serialNo used to generate unique file name sequences * @param out Where to write. * @param f File the <code>out</code> is connected to. * @param cmprs Compress the content written. * @param a14DigitDate If null, we'll write current time. * @throws IOException */ public ExperimentalWARCWriter(final AtomicInteger serialNo, final OutputStream out, final File f, final boolean cmprs, final String a14DigitDate, final List warcinfoData) throws IOException { super(serialNo, out, f, cmprs, a14DigitDate); this.fileMetadata = warcinfoData; } /** * Constructor. * * @param dirs Where to drop files. * @param prefix File prefix to use. * @param cmprs Compress the records written. * @param maxSize Maximum size for ARC files written. * @param suffix File tail to use. If null, unused. * @param warcinfoData File metadata for warcinfo record. */ public ExperimentalWARCWriter(final AtomicInteger serialNo, final List<File> dirs, final String prefix, final String suffix, final boolean cmprs, final long maxSize, final List warcinfoData) { super(serialNo, dirs, prefix, suffix, cmprs, maxSize, WARC_FILE_EXTENSION); this.fileMetadata = warcinfoData; } @Override protected String createFile(File file) throws IOException { String filename = super.createFile(file); writeWarcinfoRecord(filename); return filename; } protected void baseCharacterCheck(final char c, final String parameter) throws IOException { // TODO: Too strict? UNICODE control characters? if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) { throw new IOException("Contains illegal character 0x" + Integer.toHexString(c) + ": " + parameter); } } protected String checkHeaderValue(final String value) throws IOException { for (int i = 0; i < value.length(); i++) { final char c = value.charAt(i); baseCharacterCheck(c, value); if (Character.isWhitespace(c)) { throw new IOException("Contains disallowed white space 0x" + Integer.toHexString(c) + ": " + value); } } return value; } protected String checkHeaderLineMimetypeParameter(final String parameter) throws IOException { StringBuilder sb = new StringBuilder(parameter.length()); boolean wasWhitespace = false; for (int i = 0; i < parameter.length(); i++) { char c = parameter.charAt(i); if (Character.isWhitespace(c)) { // Map all to ' ' and collapse multiples into one. // TODO: Make sure white space occurs in legal location -- // before parameter or inside quoted-string. if (wasWhitespace) { continue; } wasWhitespace = true; c = ' '; } else { wasWhitespace = false; baseCharacterCheck(c, parameter); } sb.append(c); } return sb.toString(); } protected String createRecordHeader(final String type, final String url, final String create14DigitDate, final String mimetype, final URI recordId, final ANVLRecord xtraHeaders, final long contentLength) throws IOException { final StringBuilder sb = new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/); sb.append(WARC_ID).append(CRLF); sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type). append(CRLF); // Do not write a subject-uri if not one present. if (url != null && url.length() > 0) { sb.append(HEADER_KEY_URI).append(COLON_SPACE). append(checkHeaderValue(url)).append(CRLF); } sb.append(HEADER_KEY_DATE).append(COLON_SPACE). append(create14DigitDate).append(CRLF); if (xtraHeaders != null) { for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) { sb.append(i.next()).append(CRLF); } } // TODO: Is MIME Version needed. sb.append(MIME_VERSION).append(CRLF); sb.append(CONTENT_ID).append(COLON_SPACE).append('<'). append(recordId.toString()).append('>').append(CRLF); if (contentLength > 0) { sb.append(CONTENT_TYPE).append(COLON_SPACE).append( checkHeaderLineMimetypeParameter(mimetype)).append(CRLF); } sb.append(CONTENT_LENGTH).append(COLON_SPACE). append(Long.toString(contentLength)).append(CRLF); return sb.toString(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -