⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 supplementarylinksscoper.java

📁 高性能分词算法
💻 JAVA
字号:
/* SupplementaryLinksScoper *  * $Id: SupplementaryLinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $ * * Created on Oct 2, 2003 *  * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * */package org.archive.crawler.postprocessor;import java.util.Collection;import java.util.HashSet;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.AttributeNotFoundException;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.deciderules.DecideRule;import org.archive.crawler.deciderules.DecideRuleSequence;import org.archive.crawler.framework.Filter;import org.archive.crawler.framework.Scoper;import org.archive.crawler.settings.MapType;/** * Run CandidateURI links carried in the passed CrawlURI through a filter * and 'handle' rejections. * Used to do supplementary processing of links after they've been scope * processed and ruled 'in-scope' by LinkScoper.  An example of * 'supplementary processing' would check that a Link is intended for * this host to crawl in a multimachine crawl setting. Configure filters to * rule on links.  Default handler writes rejected URLs to disk.  Subclass * to handle rejected URLs otherwise. * @author stack */public class SupplementaryLinksScoper extends Scoper {    private static final long serialVersionUID = -775819977752790418L;    private static Logger LOGGER =        Logger.getLogger(SupplementaryLinksScoper.class.getName());        public static final String ATTR_LINKS_DECIDE_RULES = "link-rules";    /**     * @param name Name of this filter.     */    public SupplementaryLinksScoper(String name) {        super(name, "SupplementaryLinksScoper. Use to do supplementary " +            "processing of in-scope links.  Will run each link through " +            "configured filters.  Must be run after LinkScoper and " +            "before FrontierScheduler. " +            "Optionally logs rejected links (Enable " +            ATTR_OVERRIDE_LOGGER_ENABLED + " and set logger level " +            "at INFO or above).");                addElementToDefinition(                new DecideRuleSequence(ATTR_LINKS_DECIDE_RULES,                    "DecideRules which if their final decision on a link is " +                    "REJECT, cause the link to be ruled out-of-scope, even " +                    "if it had previously been accepted by the main scope."));    }    protected void innerProcess(final CrawlURI curi) {        // If prerequisites or no links, nothing to be done in here.        if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {            return;        }                Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();        for (CandidateURI cauri: curi.getOutCandidates()) {            if (isInScope(cauri)) {                inScopeLinks.add(cauri);            }        }        // Replace current links collection w/ inscopeLinks.  May be        // an empty collection.        curi.replaceOutlinks(inScopeLinks);    }        protected boolean isInScope(CandidateURI caUri) {        // TODO: Fix filters so work on CandidateURI.        CrawlURI curi = (caUri instanceof CrawlURI)?            (CrawlURI)caUri:            new CrawlURI(caUri.getUURI());        boolean result = false;        if (rulesAccept(getLinkRules(curi), curi)) {            result = true;            if (LOGGER.isLoggable(Level.FINER)) {                LOGGER.finer("Accepted: " + caUri);            }        } else {            outOfScope(caUri);        }        return result;    }        protected DecideRule getLinkRules(Object o) {        try {            return (DecideRule)getAttribute(o, ATTR_LINKS_DECIDE_RULES);        } catch (AttributeNotFoundException e) {            throw new RuntimeException(e);        }    }        /**     * Called when a CandidateUri is ruled out of scope.     * @param caUri CandidateURI that is out of scope.     */    protected void outOfScope(CandidateURI caUri) {        if (!LOGGER.isLoggable(Level.INFO)) {            return;        }        LOGGER.info(caUri.getUURI().toString());    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -