⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 matcheslistregexpdeciderule.java

📁 这是个爬虫和lucece相结合最好了
💻 JAVA
字号:
/* MatchesListRegExpDecideRule *  * $Id: MatchesListRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $ *  * Created on 30.5.2005 * * Copyright (C) 2005 Kristinn Sigurdsson *  * This file is part of the Heritrix web crawler (crawler.archive.org). *  * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. *  * Heritrix is distributed in the hope that it will be useful,  * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. *  * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.deciderules;import java.util.Iterator;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.AttributeNotFoundException;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.StringList;import org.archive.util.TextUtils;/** * Rule applies configured decision to any CrawlURIs whose String URI * matches the supplied regexps. * <p> * The list of regular expressions can be considered logically AND or OR. * * @author Kristinn Sigurdsson *  * @see MatchesRegExpDecideRule */public class MatchesListRegExpDecideRule extends PredicatedDecideRule {    private static final long serialVersionUID = 3011579758573454930L;    private static final Logger logger =        Logger.getLogger(MatchesListRegExpDecideRule.class.getName());        public static final String ATTR_REGEXP_LIST = "regexp-list";    public static final String ATTR_LIST_LOGIC= "list-logic";        public static final String DEFAULT_LIST_LOGIC = "OR";    public static final String[] LEGAL_LIST_LOGIC = {"OR","AND"};    /**     * Usual constructor.      * @param name     */    public MatchesListRegExpDecideRule(String name) {        super(name);        setDescription("MatchesListRegExpDecideRule. Applies the configured " +            "decision to URIs matching the supplied regular expressions.\n" +            "The list of regular expressions can be considered logically AND " +            "or OR.");        addElementToDefinition(                new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +                    "expressions be considered as logically AND or OR when " +                    "matching.",                     DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));        addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +             "regular expressions to evalute against the URI."));    }    /**     * Evaluate whether given object's string version     * matches configured regexps     *      * @param o     * @return true if regexps are matched     */    protected boolean evaluate(Object o) {        try {            List regexps = getRegexp(o);            if(regexps.size()==0){                return false;            }            String str = o.toString();            Iterator it = regexps.iterator();                        boolean listLogicOR = isListLogicOR(o);            // Result is initialized so that if OR based the default assumption is            // false (find no matches) but if AND based the default assumption is            // true (finds no non-matches)            boolean result = listLogicOR == false;                        while(it.hasNext()){                String regexp = (String)it.next();                boolean matches = TextUtils.matches(regexp, str);                if (logger.isLoggable(Level.FINER)) {                    logger.finer("Tested '" + str + "' match with regex '" +                        regexp + " and result was " + matches);                }                                if(matches){                    if(listLogicOR){                        // OR based and we just got a match, done!                        result = true;                        break;                    }                } else {                    if(listLogicOR == false){                        // AND based and we just found a non-match, done!                        result = false;                        break;                    }                }            }                        if (logger.isLoggable(Level.FINE) && result){                logger.fine("Matched: " + str);            }                        return result;        } catch (ClassCastException e) {            // if not CrawlURI, always disregard            return false;         }    }        /**      * Get the regular expressions list to match the URI against.     *     * @param o the object for which the regular expression should be     *          matched against.     * @return the regular expression to match against.     */    protected List getRegexp(Object o) {        try {            return (StringList) getAttribute(o, ATTR_REGEXP_LIST);        } catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());            // Basically the filter is inactive if this occurs            // (The caller should be returning false when regexp is null).            return null;          }    }        protected boolean isListLogicOR(Object o){        String logic = DEFAULT_LIST_LOGIC;        try {            logic = (String) getAttribute(o, ATTR_LIST_LOGIC);        } catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());        }        return logic.equals("OR") ? true : false;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -