⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 surtprefixeddeciderule.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
字号:
/* SurtPrefixedDecideRule
*
* $Id: SurtPrefixedDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
*
* Created on Apr 5, 2005
*
* Copyright (C) 2005 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.crawler.deciderules;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.scope.SeedListener;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.util.SurtPrefixSet;

/**
 * Rule applies configured decision to any URIs that, when
 * expressed in SURT form, begin with one of the prefixes
 * in the configured set.
 *
 * The set can be filled with SURT prefixes implied or
 * listed in the seeds file, or another external file.
 *
 * The "also-check-via" option to implement "one hop off"
 * scoping derives from a contribution by Shifra Raffel
 * of the California Digital Library.
 *
 * @author gojomo
 */
public class SurtPrefixedDecideRule extends PredicatedDecideRule
        implements SeedListener {

    private static final long serialVersionUID = 2075790126085405015L;
    //private static final Logger logger =
    //    Logger.getLogger(SurtPrefixedDecideRule.class.getName());

    /** Setting name: file from which SURT prefixes are read. */
    public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
    /** Setting name: whether the seeds file also contributes prefixes. */
    public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
        "seeds-as-surt-prefixes";
    /** Setting name: file to which the prefixes in use are written. */
    public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";

    // Shared constant rather than a fresh instance, matching the other
    // Boolean defaults declared below.
    private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
        Boolean.TRUE;

    /**
     * Whether every config change should trigger a
     * rebuilding of the prefix set.
     */
    public static final String
        ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
    public static final Boolean
        DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;

    /**
     * Whether the 'via' of CrawlURIs should also be checked
     * to see if it is prefixed by the set of SURT prefixes
     */
    public static final String
        ATTR_ALSO_CHECK_VIA = "also-check-via";
    public static final Boolean
        DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;

    /**
     * Prefix set currently in use; null until first built by
     * {@link #readPrefixes()}.
     */
    protected SurtPrefixSet surtPrefixes = null;

    /**
     * Usual constructor. Registers the description and the settings
     * (source file, seeds-as-prefixes, dump file, also-check-via,
     * rebuild-on-reconfig) with their defaults.
     *
     * @param name name passed through to the superclass constructor
     */
    public SurtPrefixedDecideRule(String name) {
        super(name);
        setDescription("SurtPrefixedDecideRule. Makes the configured decision "
                + "for any URI which, when expressed in SURT form, begins "
                + "with any of the established prefixes (from either seeds "
                + "specification or an external file).");
        addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
                "Source file from which to infer SURT prefixes. Any URLs " +
                "in file will be converted to the implied SURT prefix, and " +
                "literal SURT prefixes may be listed on lines beginning " +
                "with a '+' character.",
                ""));
        addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
                "Should seeds also be interpreted as SURT prefixes.",
                DEFAULT_SEEDS_AS_SURT_PREFIXES));
        Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
                "Dump file to save SURT prefixes actually used: " +
                "Useful debugging SURTs.", ""));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
                "Whether to also make the configured decision if a " +
                "URI's 'via' URI (the URI from which it was discovered) " +
                "in SURT form begins with any of the established prefixes. " +
                "For example, can be used to ACCEPT URIs that are 'one hop " +
                "off' URIs fitting the SURT prefixes. Default is false.",
                DEFAULT_ALSO_CHECK_VIA));
        t.setOverrideable(false);
        t.setExpertSetting(true);
        // Note the trailing space after "large": without it the two
        // fragments run together as "largesource" in the rendered text.
        t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
                "Whether to rebuild the internal structures from source " +
                "files (including seeds if appropriate) every time any " +
                "configuration change occurs. If true, " +
                "rule is rebuilt from sources even when (for example) " +
                "unrelated new domain overrides are set. Rereading large " +
                "source files can take a long time.",
                DEFAULT_REBUILD_ON_RECONFIG));
        t.setOverrideable(false);
        t.setExpertSetting(true);
    }

    /**
     * Evaluate whether given object's URI is covered by the SURT prefix set.
     * When the object is a CandidateURI and the also-check-via setting is
     * true, the object's 'via' is evaluated first and a match there is
     * sufficient.
     *
     * @param object Item to evaluate.
     * @return true if item, as SURT form URI, is prefixed by an item in the set
     */
    protected boolean evaluate(Object object) {
        if ( (object instanceof CandidateURI) &&
                ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
                    .booleanValue()) {
            if(evaluate(((CandidateURI)object).getVia())) {
                return true;
            }
        }
        String candidateSurt;
        candidateSurt = SurtPrefixSet.getCandidateSurt(object);
        if (candidateSurt == null) {
            // no SURT form could be derived, so nothing can match
            return false;
        }
        return getPrefixes().containsPrefixOf(candidateSurt);
    }

    /**
     * Synchronized get of prefix set to use; builds the set on first use.
     *
     * @return SurtPrefixSet to use for check
     */
    private synchronized SurtPrefixSet getPrefixes() {
        if (surtPrefixes == null) {
            readPrefixes();
        }
        return surtPrefixes;
    }

    /**
     * Build the prefix set from its configured sources, then write it to
     * the configured dump file (if any).
     */
    protected void readPrefixes() {
        buildSurtPrefixSet();
        dumpSurtPrefixSet();
    }

    /**
     * Dump the current prefixes in use to configured dump file (if any).
     * An empty dump-file setting disables the dump.
     *
     * @throws RuntimeException wrapping any IOException raised while
     *         writing the dump file
     */
    protected void dumpSurtPrefixSet() {
        // dump surts to file, if appropriate
        String dumpPath = (String)getUncheckedAttribute(null,
            ATTR_SURTS_DUMP_FILE);
        if (dumpPath.length() > 0) {
            File dump = resolveAgainstDisk(dumpPath);
            try {
                FileWriter fw = new FileWriter(dump);
                try {
                    surtPrefixes.exportTo(fw);
                } finally {
                    fw.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Construct the set of prefixes to use, from the configured source
     * file (if any) and, when seeds-as-surt-prefixes is true, from the
     * seed list (which may include both URIs and '+'-prefixed directives).
     * The finished set replaces {@link #surtPrefixes} only after every
     * source has been read.
     *
     * @throws RuntimeException wrapping any IOException raised while
     *         reading a source
     */
    protected void buildSurtPrefixSet() {
        SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();

        // read SURTs from file, if appropriate
        String sourcePath = (String)getUncheckedAttribute(null,
                ATTR_SURTS_SOURCE_FILE);
        if (sourcePath.length() > 0) {
            importPrefixesFrom(resolveAgainstDisk(sourcePath),
                    newSurtPrefixes);
        }

        // interpret seeds as surts, if appropriate
        boolean deduceFromSeeds = ((Boolean)getUncheckedAttribute(null,
                ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
        if(deduceFromSeeds) {
            importPrefixesFrom(getSeedfile(), newSurtPrefixes);
        }

        surtPrefixes = newSurtPrefixes;
    }

    /**
     * Turn a configured path into a File, resolving a non-absolute path
     * against the directory returned by the controller's getDisk().
     *
     * @param path configured path, absolute or relative
     * @return File for the given path
     */
    private File resolveAgainstDisk(String path) {
        File file = new File(path);
        if (!file.isAbsolute()) {
            file = new File(getSettingsHandler().getOrder()
                .getController().getDisk(), path);
        }
        return file;
    }

    /**
     * Read the given file into the target set via
     * SurtPrefixSet.importFromMixed, always closing the reader.
     *
     * @param source file to read
     * @param target set receiving the prefixes
     * @throws RuntimeException wrapping any IOException raised while
     *         opening, reading or closing the file
     */
    private void importPrefixesFrom(File source, SurtPrefixSet target) {
        try {
            FileReader reader = new FileReader(source);
            try {
                target.importFromMixed(reader, true);
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    /**
     * Re-read prefixes after an update, if the rebuild-on-reconfig
     * setting is true.
     *
     * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
     */
    public synchronized void kickUpdate() {
        super.kickUpdate();
        if (((Boolean) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
                .booleanValue()) {
            readPrefixes();
        }
        // TODO: make conditional on file having actually changed,
        // perhaps by remembering mod-time
    }

    /**
     * Dig through everything to get the crawl-global seeds file.
     * Add self as listener while at it.
     *
     * @return Seed list file
     */
    protected File getSeedfile() {
        CrawlScope scope =
            getSettingsHandler().getOrder().getController().getScope();
        scope.addSeedListener(this);
        return scope.getSeedfile();
    }

    /**
     * Seed-listener callback: adds the prefix implied by the new seed.
     * Works on a clone and then swaps the reference, so the set being
     * replaced is never modified in place.
     *
     * NOTE(review): dereferences {@link #surtPrefixes} directly, so this
     * presumably relies on the set having been built before the first
     * callback arrives -- confirm against the listener registration path.
     *
     * @param curi newly added seed
     */
    public synchronized void addedSeed(final CandidateURI curi) {
        SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
        newSurtPrefixes.add(prefixFrom(curi.toString()));
        surtPrefixes = newSurtPrefixes;
    }

    /**
     * Derive the SURT prefix for a plain URI string; delegates to
     * SurtPrefixSet.prefixFromPlain.
     *
     * @param uri plain URI string
     * @return prefix as returned by SurtPrefixSet.prefixFromPlain
     */
    protected String prefixFrom(String uri) {
        return SurtPrefixSet.prefixFromPlain(uri);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -