📄 xmlsettingshandler.java
字号:
/* XMLSettingsHandler
 *
 * $Id: XMLSettingsHandler.java 4662 2006-09-25 23:45:21Z paul_jack $
 *
 * Created on Dec 18, 2003
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.settings;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

/** A SettingsHandler which uses XML files as persistent storage.
 *
 * @author John Erik Halse
 */
public class XMLSettingsHandler extends SettingsHandler {
    private static final Logger logger = Logger.getLogger(
        "org.archive.crawler.settings.XMLSettingsHandler");

    // XML element name constants
    protected static final String XML_SCHEMA = "heritrix_settings.xsd";
    protected static final String XML_ROOT_ORDER = "crawl-order";
    protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
    protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
    protected static final String XML_ELEMENT_CONTROLLER = "controller";
    protected static final String XML_ELEMENT_META = "meta";
    protected static final String XML_ELEMENT_NAME = "name";
    protected static final String XML_ELEMENT_DESCRIPTION = "description";
    protected static final String XML_ELEMENT_OPERATOR = "operator";
    protected static final String XML_ELEMENT_ORGANIZATION = "organization";
    protected static final String XML_ELEMENT_AUDIENCE = "audience";
    protected static final String XML_ELEMENT_DATE = "date";
    protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
    protected static final String XML_ELEMENT_REFINEMENT = "refinement";
    protected static final String XML_ELEMENT_REFERENCE = "reference";
    protected static final String XML_ELEMENT_LIMITS = "limits";
    protected static final String XML_ELEMENT_TIMESPAN = "timespan";
    protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
    protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
    protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
    protected static final String XML_ELEMENT_OBJECT = "object";
    protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
    protected static final String XML_ATTRIBUTE_NAME = "name";
    protected static final String XML_ATTRIBUTE_CLASS = "class";
    protected static final String XML_ATTRIBUTE_FROM = "from";
    protected static final String XML_ATTRIBUTE_TO = "to";

    /** Absolute location of the order file this handler persists to. */
    private File orderFile;

    /** Base name (without suffix) of per-scope settings files. */
    private final static String settingsFilename = "settings";
    /** File suffix shared by settings, refinement and backup files. */
    private final static String settingsFilenameSuffix = "xml";
    /** Name of the sub-directory that holds refinement files. */
    private final static String REFINEMENT_DIR = "_refinements";

    /** Create a new XMLSettingsHandler object.
     *
     * @param orderFile where the order file is located. It is stored in its
     *                  absolute form.
     * @throws InvalidAttributeValueException
     */
    public XMLSettingsHandler(File orderFile)
            throws InvalidAttributeValueException {
        super();
        this.orderFile = orderFile.getAbsoluteFile();
    }

    /** Initialize the SettingsHandler.
     *
     * This method builds the settings data structure and initializes it with
     * settings from the order file given to the constructor.
     */
    public void initialize() {
        super.initialize();
    }

    /**
     * Initialize the SettingsHandler from a source.
     *
     * This method builds the settings data structure and initializes it with
     * settings from the order file given as a parameter. The intended use is
     * to create a new order file based on a default (template) order file.
     *
     * @param source the order file to initialize from.
     */
    public void initialize(File source) {
        File tmpOrderFile = orderFile;
        orderFile = source.getAbsoluteFile();
        try {
            this.initialize();
        } finally {
            // Always point back at the real order file, even when
            // initialization from the template fails.
            orderFile = tmpOrderFile;
        }
    }

    /** Look up the settings directory configured on the crawl order.
     *
     * The value of the {@link CrawlOrder#ATTR_SETTINGS_DIRECTORY} attribute is
     * handed to <code>getPathRelativeToWorkingDirectory</code>. If the
     * attribute cannot be read the failure is logged and <code>null</code> is
     * handed on instead.
     *
     * @return the settings directory as resolved by
     *         <code>getPathRelativeToWorkingDirectory</code>.
     */
    private File getSettingsDirectory() {
        String settingsDirectoryName = null;
        try {
            settingsDirectoryName = (String) getOrder().getAttribute(
                CrawlOrder.ATTR_SETTINGS_DIRECTORY);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed reading attribute "
                + CrawlOrder.ATTR_SETTINGS_DIRECTORY, e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Failed reading attribute "
                + CrawlOrder.ATTR_SETTINGS_DIRECTORY, e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Failed reading attribute "
                + CrawlOrder.ATTR_SETTINGS_DIRECTORY, e);
        }

        return getPathRelativeToWorkingDirectory(settingsDirectoryName);
    }

    /** Resolves the filename for a settings object into a file path.
     *
     * Settings without a scope map to the order file, or, for a refinement,
     * to a file named after the settings object inside the refinement
     * directory. Settings with a scope map to a directory path built from the
     * dot-separated scope elements in reverse order (so that
     * <code>a.b.c</code> becomes <code>c/b/a</code>) below the settings
     * directory.
     *
     * This method only computes the path; it does not create any
     * directories. {@link #writeSettingsObject(CrawlerSettings, File)}
     * creates the parent directory before writing.
     *
     * @param settings the settings object to get file path for.
     * @return the file path for this settings object.
     */
    protected final File settingsToFilename(CrawlerSettings settings) {
        File file;

        if (settings.getScope() == null || settings.getScope().equals("")) {
            if (settings.isRefinement()) {
                file = new File(getSettingsDirectory(), File.separatorChar
                    + REFINEMENT_DIR + File.separatorChar
                    + settings.getName() + '.' + settingsFilenameSuffix);
            } else {
                file = orderFile;
            }
        } else {
            String elements[] = settings.getScope().split("\\.");
            if (elements.length == 0) {
                // Scope consisted of dots only.
                return orderFile;
            }

            // Reverse the scope elements into a directory path.
            StringBuffer path = new StringBuffer();
            for (int i = elements.length - 1; i > 0; i--) {
                path.append(elements[i]);
                path.append(File.separatorChar);
            }
            path.append(elements[0]);

            if (settings.isRefinement()) {
                file = new File(getSettingsDirectory(), path.toString()
                    + File.separatorChar + REFINEMENT_DIR
                    + File.separatorChar + settings.getName() + '.'
                    + settingsFilenameSuffix);
            } else {
                file = new File(getSettingsDirectory(), path.toString()
                    + File.separatorChar + settingsFilename + "."
                    + settingsFilenameSuffix);
            }
        }
        return file;
    }

    /** Write a CrawlerSettings object to the file resolved by
     * {@link #settingsToFilename(CrawlerSettings)}.
     *
     * @param settings the settings object to be serialized.
     */
    public final void writeSettingsObject(CrawlerSettings settings) {
        File filename = settingsToFilename(settings);
        writeSettingsObject(settings, filename);
    }

    /** Write a CrawlerSettings object to a specified file.
     *
     * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
     * except that it uses the submitted File object instead of trying to
     * resolve where the file should be written.
     *
     * The parent directory is created if needed. When the order has a
     * controller and the target file already exists, the existing file is
     * first copied to a backup whose name carries the 14-digit date of the
     * settings' last save. Failures are logged, not propagated.
     *
     * @param settings the settings object to be serialized.
     * @param filename the file to which the settings object should be written.
     */
    public final void writeSettingsObject(
            CrawlerSettings settings, File filename) {
        logger.fine("Writing " + filename.getAbsolutePath());
        // A bare relative file name has no parent; nothing to create then.
        File parent = filename.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        try {
            long lastSaved = 0L;
            File backup = null;
            if (getOrder().getController() != null && filename.exists()) {
                // The crawler is running and file exists - make backup first.
                String name = filename.getName();
                lastSaved = settings.getLastSavedTime().getTime();
                // Tolerate file names without an extension instead of
                // failing (and thereby skipping the save altogether).
                int dot = name.lastIndexOf('.');
                String stem = (dot < 0) ? name : name.substring(0, dot);
                name = stem + '_'
                    + ArchiveUtils.get14DigitDate(lastSaved) + "."
                    + settingsFilenameSuffix;
                backup = new File(parent, name);
                FileUtils.copyFiles(filename, backup);
            }

            BufferedOutputStream out = new BufferedOutputStream(
                new FileOutputStream(filename));
            try {
                StreamResult result = new StreamResult(out);
                Transformer transformer =
                    TransformerFactory.newInstance().newTransformer();
                Source source = new CrawlSettingsSAXSource(settings);
                transformer.transform(source, result);
            } finally {
                // The stream is ours, so close it here to flush buffered
                // output and release the file handle.
                out.close();
            }

            // Hack to get rid of unnecessary backup files.
            // What happens is that the WUI often saves settings files
            // several times during a settings change. This code removes the
            // last backup file if it's no more than 2 minutes old.
            if (backup != null
                    && lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
                backup.delete();
            }
        } catch (Exception e) {
            // Best effort: report the failure but do not propagate it.
            logger.log(Level.SEVERE,
                "Failed writing " + filename.getAbsolutePath(), e);
        }
    }

    /** Read the CrawlerSettings object from a specific file.
     *
     * @param settings the settings object to be updated with data from the
     *                 persistent storage.
     * @param f the file to read from.
     * @return the updated settings object or null if there was no data for this
     *         in the persistent storage.
     */
    protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
            File f) {
        CrawlerSettings result = null;
        try {
            InputStream is = null;
            if (!f.exists()) {
                // Perhaps the file we're looking for is on the CLASSPATH.
                // DON'T look on the CLASSPATH for 'settings.xml' files. The
                // look for 'settings.xml' files happens frequently. Not looking
                // on classpath for 'settings.xml' is an optimization based on
                // ASSUMPTION that there will never be a 'settings.xml' saved
                // on classpath.
                if (!f.getName().startsWith(settingsFilename)) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -