📄 regexurlnormalizer.java

📁 nutch0.8源码
💻 JAVA
字号:
/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.nutch.net;import java.net.URL;import java.net.MalformedURLException;import java.io.IOException;import java.util.List;import java.util.ArrayList;import java.util.Iterator;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.util.NutchConfiguration;import javax.xml.parsers.*;import org.w3c.dom.*;import org.apache.oro.text.regex.*;/** Allows users to do regex substitutions on all/any URLs that are encountered, which * is useful for stripping session IDs from URLs. * * <p>This class must be specified as the URL normalizer to be used in <tt>nutch-site.xml</tt> * or <tt>nutch-default.xml</tt>.  To do this specify the <tt>urlnormalizer.class</tt> property to * have the value:  <tt>org.apache.nutch.net.RegexUrlNormalizer</tt>.  The <tt>urlnormalizer.regex.file</tt> * property should also be set to the file name of an xml file which should contain the patterns * and substitutions to be done on encountered URLs.</p> * * @author Luke Baker */public class RegexUrlNormalizer extends BasicUrlNormalizer  implements UrlNormalizer {    /** Class which holds a compiled pattern and its corresponding substition string. */    private static class Rule {      public Perl5Pattern pattern;      public String substitution;	    }        private List rules;    private PatternMatcher matcher = new Perl5Matcher();        /** The default constructor which is called from UrlNormalizerFactory (normalizerClass.newInstance()) in method: getNormalizer()**/    public RegexUrlNormalizer()  {}        /** Constructor which can be passed the file name, so it doesn't look in the configuration files for it. */    public RegexUrlNormalizer(String filename)      throws IOException, MalformedPatternException {      //URL url= Configuration.get().getResource(filename);      rules = readConfigurationFile(filename);    }            /** This function does the replacements by iterating through all the regex patterns.      * It accepts a string url as input and returns the altered string. */    public synchronized String regexNormalize(String urlString) {      Iterator i=rules.iterator();      while(i.hasNext()) {        Rule r=(Rule) i.next();        urlString = Util.substitute(matcher, r.pattern,           new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution      }      return urlString;    }       /** Normalizes any URLs by calling super.basicNormalize()      * and regexSub(). This is the function that gets called      * elsewhere in Nutch. */    public synchronized String normalize(String urlString)      throws MalformedURLException {        urlString = super.normalize(urlString); // run basicNormalize first to ready for regexNormalize        urlString = regexNormalize(urlString);        urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL        return urlString;  }        /** Reads the configuration file and populates a List of Rules. */  private List readConfigurationFile(String filename)    throws IOException, MalformedPatternException {    Perl5Compiler compiler=new Perl5Compiler();    List rules=new ArrayList();    try {            if (LOG.isInfoEnabled()) { LOG.info("loading " + filename); }      // borrowed heavily from code in Configuration.java      Document doc =        DocumentBuilderFactory.newInstance().newDocumentBuilder()        .parse(filename);      Element root = doc.getDocumentElement();      if ((!"regex-normalize".equals(root.getTagName())) &&          (LOG.isFatalEnabled())) {        LOG.fatal("bad conf file: top-level element not <regex-normalize>");      }      NodeList regexes = root.getChildNodes();      for (int i = 0; i < regexes.getLength(); i++) {        Node regexNode = regexes.item(i);        if (!(regexNode instanceof Element))          continue;        Element regex = (Element)regexNode;        if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) {          LOG.warn("bad conf file: element not <regex>");        }        NodeList fields = regex.getChildNodes();        String patternValue = null;        String subValue = null;        for (int j = 0; j < fields.getLength(); j++) {          Node fieldNode = fields.item(j);          if (!(fieldNode instanceof Element))            continue;          Element field = (Element)fieldNode;          if ("pattern".equals(field.getTagName()) && field.hasChildNodes())            patternValue = ((Text)field.getFirstChild()).getData();          if ("substitution".equals(field.getTagName()) && field.hasChildNodes())            subValue = ((Text)field.getFirstChild()).getData();          if (!field.hasChildNodes())            subValue = "";        }        if (patternValue != null && subValue != null) {          Rule rule=new Rule();          rule.pattern=(Perl5Pattern) compiler.compile(patternValue);          rule.substitution=subValue;          rules.add(rule);        }      }            } catch (Exception e) {      if (LOG.isFatalEnabled()) {        LOG.fatal("error parsing " + filename +" conf file: " + e);      }    }    return rules;  }    public void setConf(Configuration conf) {    super.setConf(conf);    // the default constructor was called    if (this.rules == null) {      String filename = getConf().get("urlnormalizer.regex.file");      URL url = getConf().getResource(filename);      try {        this.rules = readConfigurationFile(url.toString());      } catch (IOException e) {        // TODO mb@media-style.com: throw Exception? Because broken api.        throw new RuntimeException(e.getMessage(), e);      } catch (MalformedPatternException e) {        // TODO mb@media-style.com: throw Exception? Because broken api.        throw new RuntimeException(e.getMessage(), e);      }    }  }      /** Spits out patterns and substitutions that are in the configuration file. */  public static void main(String args[])    throws MalformedPatternException, IOException {      RegexUrlNormalizer normalizer = new RegexUrlNormalizer();      normalizer.setConf(NutchConfiguration.create());      Iterator i=normalizer.rules.iterator();      while(i.hasNext()) {        Rule r=(Rule) i.next();        System.out.print(r.pattern.getPattern() + "  ");        System.out.println(r.substitution);      }    }  }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -