📄 kw3writerprocessor.java
字号:
/* Created on 2006-okt-03** Copyright (C) 2006 National Library of Sweden.** This program is free software; you can redistribute it and/or* modify it under the terms of the GNU Lesser General Public License* as published by the Free Software Foundation; either version 2* of the License, or (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the* GNU Lesser General Public License for more details.** You should have received a copy of the GNU Lesser General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/package org.archive.crawler.writer;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStream;import java.net.InetAddress;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.AttributeNotFoundException;import javax.management.MBeanException;import javax.management.ReflectionException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlHost;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.framework.Processor;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayInputStream;import org.archive.crawler.writer.Kw3Constants;/** * Processor module that writes the results of successful fetches to * files on disk. These files are MIME-files of the type used by the * Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/]. * * Each URI gets written to its own file and has a path consisting of: * <ul> * <li> A dir named with the first two chars of the website's md5. </li> * <li> A dir named after the website. </li> * <li> 'current' - a dir indicating that this is the directory being written * to by the ongoing crawl. </li> * <li> A file on the format <md5 of url>.<fetchtime in seconds> </li> * </ul> * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837' * * The MIME-file itself consists of three parts: * <ul> * <li> 1. ArchiveInfo - Metadata about the file and its content. </li> * <li> 2. Header - The HTTP response header. </li> * <li> 3. Content - The HTTP response content, plus content-type. </li> * </ul> * * @author oskar */public class Kw3WriterProcessor extends Processor implements CoreAttributeConstants, Kw3Constants { private static final long serialVersionUID = 7171448068924684594L; private static String COLON = ":"; private static String WS = " "; private static String LF = "\n"; /** * Logger. */ private static final Logger logger = Logger.getLogger(Kw3WriterProcessor.class.getName()); /** * Key to use asking settings for arc path value. */ public static final String ATTR_PATH ="path"; /** * Default path. */ private static final String DEFAULT_PATH = "arcs"; /** * Key to use asking settings for max size value. */ public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes"; /** * Default max file size. */ public static final int DEFAULT_MAX_FILE_SIZE = 10000000; /** * Key to use asking settings if chmod should be execuated . */ public static final String ATTR_CHMOD = "chmod"; /** * Key to use asking settings for the new chmod value. */ public static final String ATTR_CHMOD_VALUE = "chmod-value"; /** * Default value for permissions. */ public static final String DEFAULT_CHMOD_VALUE = "777"; /** * Key for the maximum ARC bytes to write attribute. */ public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write"; /** * Key for the collection attribute. */ public static final String ATTR_COLLECTION = "collection"; /** * Default value for collection. */ public static final String DEFAULT_COLLECTION_VALUE = "kw3"; /** * Key for the harvester attribute. */ public static final String ATTR_HARVESTER = "harvester"; /** * Default value for harvester. */ public static final String DEFAULT_HARVESTER_VALUE = "heritrix"; private static String BOUNDARY_START = "KulturArw3_"; /* * Private members for settings */ private File arcsDir; private boolean chmod; private String chmodValue; private int maxSize; private String collection; private String harvester; /** * @param name Name of this processor. */ public Kw3WriterProcessor(String name) { super(name, "Kw3Writer processor. " + "A writer that writes files in the MIME format of The " + "Swedish National Library. See this class's javadoc for" + "format exposition."); Type e; e = addElementToDefinition(new SimpleType(ATTR_PATH, "Top-level directory for archive files.", DEFAULT_PATH)); e.setOverrideable(false); e = addElementToDefinition(new SimpleType(ATTR_COLLECTION, "Name of collection.", DEFAULT_COLLECTION_VALUE)); e.setOverrideable(false); e = addElementToDefinition(new SimpleType(ATTR_HARVESTER, "Name of the harvester that is used for the web harvesting.", DEFAULT_HARVESTER_VALUE)); e.setOverrideable(false); e = addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES, "Max size of each file", new Integer(DEFAULT_MAX_FILE_SIZE))); e.setOverrideable(false); e = addElementToDefinition(new SimpleType(ATTR_CHMOD, "Should permissions be changed for the newly created dirs", new Boolean(true))); e.setOverrideable(false); e = addElementToDefinition(new SimpleType(ATTR_CHMOD_VALUE, "What should the permissions be set to." + " Given as three octal digits, as to the UNIX 'chmod' command." + " Ex. 777 for all permissions to everyone.", DEFAULT_CHMOD_VALUE)); e.setOverrideable(false); } protected void initialTasks () { try { String arcsDirPath = (String) getAttribute(ATTR_PATH); this.arcsDir = new File(arcsDirPath); if (!this.arcsDir.isAbsolute()) this.arcsDir = new File(getController().getDisk(), arcsDirPath); this.collection = (String) getAttribute(ATTR_COLLECTION); this.harvester = (String) getAttribute(ATTR_HARVESTER); this.chmod = (Boolean) getAttribute(ATTR_CHMOD); this.chmodValue = (String) getAttribute(ATTR_CHMOD_VALUE); this.maxSize = (Integer) getAttribute(ATTR_MAX_SIZE_BYTES); } catch (AttributeNotFoundException e) { logger.log(Level.WARNING, "attribute error", e); } catch (MBeanException e) { logger.log(Level.WARNING, "attribute error", e); } catch (ReflectionException e) { logger.log(Level.WARNING, "attribute error", e); } } protected void innerProcess(CrawlURI curi) { // Only successful fetches are written. if (!curi.isSuccess()) return; // Only http and https schemes are supported. String scheme = curi.getUURI().getScheme().toLowerCase(); if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) return; // Write the MIME-file try { writeMimeFile(curi); } catch (IOException e) { logger.log(Level.WARNING, "i/o error", e);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -