⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 experimentalwarcwritertest.java

📁 这是个爬虫，和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* * ExperimentalWARCWriterTest * * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $ * * Created on July 27th, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io.warc.v10;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileNotFoundException;import java.io.IOException;import java.net.URI;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Iterator;import java.util.List;import java.util.concurrent.atomic.AtomicInteger;import org.archive.io.ArchiveRecord;import org.archive.io.ArchiveRecordHeader;import org.archive.io.UTF8Bytes;import org.archive.io.WriterPoolMember;import org.archive.io.warc.WARCConstants;import org.archive.uid.GeneratorFactory;import org.archive.util.ArchiveUtils;import org.archive.util.TmpDirTestCase;import org.archive.util.anvl.ANVLRecord;/** * Test Writer and Reader. * @author stack * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$ */public class ExperimentalWARCWriterTestextends TmpDirTestCase implements WARCConstants {    private static final AtomicInteger SERIAL_NO = new AtomicInteger();        /**     * Prefix to use for ARC files made by JUNIT.     
*/    private static final String PREFIX = "IAH";        private static final String SOME_URL = "http://www.archive.org/test/";        public void testCheckHeaderLineValue() throws Exception {        ExperimentalWARCWriter writer = new ExperimentalWARCWriter();        writer.checkHeaderLineParameters("one");        IOException exception = null;        try {            writer.checkHeaderLineParameters("with space");        } catch(IOException e) {            exception = e;        }       assertNotNull(exception);       exception = null;       try {           writer.checkHeaderLineParameters("with\0x0000controlcharacter");       } catch(IOException e) {           exception = e;       }      assertNotNull(exception);    }    public void testMimetypes() throws IOException {        ExperimentalWARCWriter writer = new ExperimentalWARCWriter();        writer.checkHeaderLineMimetypeParameter("text/xml");        writer.checkHeaderLineMimetypeParameter("text/xml+rdf");        writer.checkHeaderLineMimetypeParameter(        	"text/plain; charset=SHIFT-JIS");        System.out.println(writer.checkHeaderLineMimetypeParameter(    		"multipart/mixed; \r\n        boundary=\"simple boundary\""));    }        public void testWriteRecord() throws IOException {    	File [] files = {getTmpDir()};            	// Write uncompressed.        ExperimentalWARCWriter writer =        	new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files),        			this.getClass().getName(), "suffix", false, -1, null);        writeFile(writer);                // Write compressed.        
writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files),        		this.getClass().getName(), "suffix", true, -1, null);        writeFile(writer);    }        private void writeFile(final ExperimentalWARCWriter writer)    throws IOException {        try {            writeWarcinfoRecord(writer);            writeBasicRecords(writer);        } finally {            writer.close();            writer.getFile().delete();        }    }        private void writeWarcinfoRecord(ExperimentalWARCWriter writer)    throws IOException {    	ANVLRecord meta = new ANVLRecord();    	meta.addLabelValue("size", "1G");    	meta.addLabelValue("operator", "igor");    	byte [] bytes = meta.getUTF8Bytes();    	writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,    		new ByteArrayInputStream(bytes), bytes.length);	}	protected void writeBasicRecords(final ExperimentalWARCWriter writer)    throws IOException {    	ANVLRecord headerFields = new ANVLRecord();    	headerFields.addLabelValue("x", "y");    	headerFields.addLabelValue("a", "b");    	    	URI rid = null;    	try {    		rid = GeneratorFactory.getFactory().    			getQualifiedRecordID(TYPE, METADATA);    	} catch (URISyntaxException e) {    		// Convert to IOE so can let it out.    		throw new IOException(e.getMessage());    	}    	final String content = "Any old content.";    	for (int i = 0; i < 10; i++) {    		String body = i + ". " + content;    		byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);    		writer.writeRecord(METADATA, "http://www.archive.org/",    			ArchiveUtils.get14DigitDate(), "no/type",    			rid, headerFields, new ByteArrayInputStream(bodyBytes),    			(long)bodyBytes.length);    	}    }    /**     * @return Generic HTML Content.     */    protected static String getContent() {        return getContent(null);    }        /**     * @return Generic HTML Content with mention of passed <code>indexStr</code>     * in title and body.     
*/    protected static String getContent(String indexStr) {        String page = (indexStr != null)? "Page #" + indexStr: "Some Page";        return "HTTP/1.1 200 OK\r\n" +        "Content-Type: text/html\r\n\r\n" +        "<html><head><title>" + page +        "</title></head>" +        "<body>" + page +        "</body></html>";    }    /**     * Write random HTML Record.     * @param w Where to write.     * @param index An index to put into content.     * @return Length of record written.     * @throws IOException     */    protected int writeRandomHTTPRecord(ExperimentalWARCWriter w, int index)    throws IOException {        ByteArrayOutputStream baos = new ByteArrayOutputStream();        String indexStr = Integer.toString(index);        byte[] record = (getContent(indexStr)).getBytes();        int recordLength = record.length;        baos.write(record);        // Add named fields for ip, checksum, and relate the metadata        // and request to the resource field.        ANVLRecord r = new ANVLRecord(1);        r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");        w.writeResourceRecord(            "http://www.one.net/id=" + indexStr,            ArchiveUtils.get14DigitDate(),            "text/html; charset=UTF-8",            r,            new ByteArrayInputStream(baos.toByteArray()),            recordLength);        return recordLength;    }    /**     * Fill a WARC with HTML Records.     * @param baseName WARC basename.     * @param compress Whether to compress or not.     * @param maxSize Maximum WARC size.     * @param recordCount How many records.     * @return The written file.     
* @throws IOException     */    private File writeRecords(String baseName, boolean compress,        int maxSize, int recordCount)    throws IOException {        cleanUpOldFiles(baseName);        File [] files = {getTmpDir()};        ExperimentalWARCWriter w = new ExperimentalWARCWriter(SERIAL_NO,            Arrays.asList(files), baseName + '-' + PREFIX, "", compress,            maxSize, null);        assertNotNull(w);        for (int i = 0; i < recordCount; i++) {            writeRandomHTTPRecord(w, i);        }        w.close();        assertTrue("Doesn't exist: " +  w.getFile().getAbsolutePath(),             w.getFile().exists());        return w.getFile();    }    /**     * Run validation of passed file.     * @param f File to validate.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -