⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 arcwritertest.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* ARCWriterTest * * $Id: ARCWriterTest.java 5029 2007-03-29 23:53:50Z gojomo $ * * Created on Dec 31, 2003. * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.io.arc;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileNotFoundException;import java.io.IOException;import java.io.OutputStream;import java.io.PrintStream;import java.util.Arrays;import java.util.Date;import java.util.Iterator;import java.util.List;import java.util.concurrent.atomic.AtomicInteger;import org.archive.io.ArchiveRecord;import org.archive.io.ReplayInputStream;import org.archive.io.WriterPoolMember;import org.archive.util.ArchiveUtils;import org.archive.util.FileUtils;import org.archive.util.TmpDirTestCase;/** * Test ARCWriter class. * * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/ * ARCWriter.  Then it validates what was written w/ ARCReader. * * @author stack */public class ARCWriterTestextends TmpDirTestCase implements ARCConstants {    /**     * Prefix to use for ARC files made by JUNIT.     
*/    private static final String PREFIX =        /* TODO DEFAULT_ARC_FILE_PREFIX*/ "IAH";        private static final String SOME_URL = "http://www.archive.org/test/";        private static final AtomicInteger SERIAL_NO = new AtomicInteger();    /*     * @see TestCase#setUp()     */    protected void setUp() throws Exception {        super.setUp();    }    /*     * @see TestCase#tearDown()     */    protected void tearDown() throws Exception {        super.tearDown();    }        protected static String getContent() {        return getContent(null);    }        protected static String getContent(String indexStr) {        String page = (indexStr != null)? "Page #" + indexStr: "Some Page";        return "HTTP/1.1 200 OK\r\n" +        "Content-Type: text/html\r\n\r\n" +        "<html><head><title>" + page +        "</title></head>" +        "<body>" + page +        "</body></html>";    }    protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)    throws IOException {        String indexStr = Integer.toString(index);        ByteArrayOutputStream baos = new ByteArrayOutputStream();        // Start the record with an arbitrary 14-digit date per RFC2540        String now = ArchiveUtils.get14DigitDate();        int recordLength = 0;        byte[] record = (getContent(indexStr)).getBytes();        recordLength += record.length;        baos.write(record);        // Add the newline between records back in        baos.write("\n".getBytes());        recordLength += 1;        arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",            "0.1.2.3", Long.parseLong(now), recordLength, baos);        return recordLength;    }    private File writeRecords(String baseName, boolean compress,        long maxSize, int recordCount)    throws IOException {        cleanUpOldFiles(baseName);        File [] files = {getTmpDir()};        ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files),            baseName + '-' + PREFIX, compress, maxSize);      
  assertNotNull(arcWriter);        for (int i = 0; i < recordCount; i++) {            writeRandomHTTPRecord(arcWriter, i);        }        arcWriter.close();        assertTrue("Doesn't exist: " +                arcWriter.getFile().getAbsolutePath(),             arcWriter.getFile().exists());        return arcWriter.getFile();    }    private void validate(File arcFile, int recordCount)    throws FileNotFoundException, IOException {        ARCReader reader = ARCReaderFactory.get(arcFile);        assertNotNull(reader);        List metaDatas = null;        if (recordCount == -1) {            metaDatas = reader.validate();        } else {            metaDatas = reader.validate(recordCount);        }        reader.close();        // Now, run through each of the records doing absolute get going from        // the end to start.  Reopen the arc so no context between this test        // and the previous.        reader = ARCReaderFactory.get(arcFile);        for (int i = metaDatas.size() - 1; i >= 0; i--) {            ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);            ArchiveRecord r = reader.get(meta.getOffset());            String mimeType = r.getHeader().getMimetype();            assertTrue("Record is bogus",                mimeType != null && mimeType.length() > 0);        }        reader.close();        assertTrue("Metadatas not equal", metaDatas.size() == recordCount);        for (Iterator i = metaDatas.iterator(); i.hasNext();) {                ARCRecordMetaData r = (ARCRecordMetaData)i.next();                assertTrue("Record is empty", r.getLength() > 0);        }    }    public void testCheckARCFileSize()    throws IOException {        runCheckARCFileSizeTest("checkARCFileSize", false);    }    public void testCheckARCFileSizeCompressed()    throws IOException {        runCheckARCFileSizeTest("checkARCFileSize", true);    }    public void testWriteRecord() throws IOException {        final int recordCount = 2;        File arcFile = 
writeRecords("writeRecord", false,                DEFAULT_MAX_ARC_FILE_SIZE, recordCount);        validate(arcFile, recordCount  + 1); // Header record.    }        public void testRandomAccess() throws IOException {        final int recordCount = 3;        File arcFile = writeRecords("writeRecord", true,            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);        ARCReader reader = ARCReaderFactory.get(arcFile);        // Get to second record.  Get its offset for later use.        boolean readFirst = false;        String url = null;        long offset = -1;        long totalRecords = 0;        boolean readSecond = false;        for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {            ARCRecord ar = (ARCRecord)i.next();            if (!readFirst) {                readFirst = true;                continue;            }            if (!readSecond) {                url = ar.getMetaData().getUrl();                offset = ar.getMetaData().getOffset();                readSecond = true;            }        }                reader = ARCReaderFactory.get(arcFile, offset);        ArchiveRecord ar = reader.get();        assertEquals(ar.getHeader().getUrl(), url);        ar.close();                // Get reader again.  
See how iterator works with offset        reader = ARCReaderFactory.get(arcFile, offset);        int count = 0;        for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {            count++;        }        reader.close();        assertEquals(totalRecords - 1, count);    }    public void testWriteRecordCompressed() throws IOException {        final int recordCount = 2;        File arcFile = writeRecords("writeRecordCompressed", true,                DEFAULT_MAX_ARC_FILE_SIZE, recordCount);        validate(arcFile, recordCount + 1 /*Header record*/);    }        private void runCheckARCFileSizeTest(String baseName, boolean compress)    throws FileNotFoundException, IOException  {        writeRecords(baseName, compress, 1024, 15);        // Now validate all files just created.        File [] files = FileUtils.getFilesWithPrefix(getTmpDir(), PREFIX);        for (int i = 0; i < files.length; i++) {            validate(files[i], -1);        }    }        protected ARCWriter createARCWriter(String NAME, boolean compress) {        File [] files = {getTmpDir()};        return new ARCWriter(SERIAL_NO, Arrays.asList(files), NAME,            compress, DEFAULT_MAX_ARC_FILE_SIZE);    }        protected static ByteArrayOutputStream getBaos(String str)    throws IOException {        ByteArrayOutputStream baos = new ByteArrayOutputStream();        baos.write(str.getBytes());        return baos;    }        protected static void writeRecord(ARCWriter writer, String url,        String type, int len, ByteArrayOutputStream baos)    throws IOException {        writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,            baos);    }        protected int iterateRecords(ARCReader r)    throws IOException {        int count = 0;        for (Iterator i = r.iterator(); i.hasNext();) {            ARCRecord rec = (ARCRecord)i.next();            rec.close();            if (count != 0) {                assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -