📄 regexurlnormalizer.java
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
// import java.net.URI;
// import java.net.URISyntaxException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;
import net.nutch.util.LogFormatter;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.apache.oro.text.regex.*;
import net.nutch.util.*;
/** Allows users to do regex substitutions on all/any URLs that are encountered, which
* is useful for stripping session IDs from URLs.
*
* <p>This class must be specified as the URL normalizer to be used in <tt>nutch-site.xml</tt>
* or <tt>nutch-default.xml</tt>. To do this specify the <tt>urlnormalizer.class</tt> property to
* have the value: <tt>net.nutch.net.RegexUrlNormalizer</tt>. The <tt>urlnormalizer.regex.file</tt>
* property should also be set to the file name of an xml file which should contain the patterns
* and substitutions to be done on encountered URLs.</p>
*
* @author Luke Baker
*/
public class RegexUrlNormalizer extends BasicUrlNormalizer
  implements UrlNormalizer {

  /** Name of the configuration property holding the rules file name. */
  private static final String REGEX_FILE_PROPERTY = "urlnormalizer.regex.file";

  /** Class which holds a compiled pattern and its corresponding substitution string. */
  private static class Rule {
    public Perl5Pattern pattern;
    public String substitution;
  }

  /** The {@link Rule}s, in the order they appear in the configuration file. */
  private final List rules;

  /** Shared matcher; every method that uses it is synchronized. */
  private final PatternMatcher matcher = new Perl5Matcher();

  /** Default constructor which reads the rules file named by the
   * <tt>urlnormalizer.regex.file</tt> property. The file is looked up as a
   * resource through the class loader, so it should be in the CLASSPATH.
   *
   * @throws IOException if the property is not set or the file cannot be
   *         found as a class loader resource
   */
  public RegexUrlNormalizer() throws IOException, MalformedPatternException {
    String filename = NutchConf.get(REGEX_FILE_PROPERTY);
    if (filename == null)
      throw new IOException("property " + REGEX_FILE_PROPERTY + " is not set");

    URL url = NutchConf.class.getClassLoader().getResource(filename);
    // getResource() returns null rather than throwing when nothing is found;
    // fail with a message naming the file instead of a NullPointerException.
    if (url == null)
      throw new IOException("regex normalizer conf file not found in CLASSPATH: "
                            + filename);

    rules = readConfigurationFile(url.toString());
  }

  /** Constructor which can be passed the file name, so it doesn't look in the
   * configuration files for it. The name is handed directly to the XML parser
   * and is not resolved against the CLASSPATH.
   *
   * @param filename location of the xml rules file
   */
  public RegexUrlNormalizer(String filename)
    throws IOException, MalformedPatternException {
    rules = readConfigurationFile(filename);
  }

  /** Does the replacements by applying every rule, in order, to the string.
   * Each rule replaces all matches of its pattern, and the output of one rule
   * is the input of the next.
   *
   * @param urlString the url to rewrite
   * @return the url after all substitutions have been applied
   */
  public synchronized String regexNormalize(String urlString) {
    Iterator i = rules.iterator();
    while (i.hasNext()) {
      Rule r = (Rule) i.next();
      urlString = Util.substitute(matcher, r.pattern,
                                  new Perl5Substitution(r.substitution),
                                  urlString, Util.SUBSTITUTE_ALL);
    }
    return urlString;
  }

  /** Normalizes a URL by calling <tt>super.normalize()</tt>, then
   * {@link #regexNormalize(String)}, then <tt>super.normalize()</tt> again.
   *
   * @param urlString the url to normalize
   * @return the normalized url
   * @throws MalformedURLException as declared by <tt>super.normalize()</tt>
   */
  public synchronized String normalize(String urlString)
    throws MalformedURLException {
    urlString = super.normalize(urlString); // ready the url for regexNormalize
    urlString = regexNormalize(urlString);
    urlString = super.normalize(urlString); // make sure the regexes didn't break the url
    return urlString;
  }

  /** Reads the configuration file and populates a List of Rules.
   *
   * <p>The expected layout is a <tt>&lt;regex-normalize&gt;</tt> root whose
   * <tt>&lt;regex&gt;</tt> children each hold a <tt>&lt;pattern&gt;</tt> and a
   * <tt>&lt;substitution&gt;</tt> element. An empty <tt>&lt;substitution&gt;</tt>
   * means "replace with the empty string". Entries lacking either a non-empty
   * pattern or a substitution element are skipped.</p>
   *
   * <p>Reading is best-effort: any exception raised while parsing the file or
   * compiling a pattern is logged and ends the reading, and the rules collected
   * up to that point are returned.</p>
   *
   * @param filename location of the xml rules file, passed to the XML parser
   * @return the rules read, possibly empty
   */
  private static List readConfigurationFile(String filename)
    throws IOException, MalformedPatternException {

    Perl5Compiler compiler = new Perl5Compiler();
    List rules = new ArrayList();
    try {

      LOG.info("loading " + filename);
      // borrowed heavily from code in NutchConf.java
      Document doc =
        DocumentBuilderFactory.newInstance().newDocumentBuilder()
        .parse(filename);
      Element root = doc.getDocumentElement();
      if (!"regex-normalize".equals(root.getTagName()))
        LOG.severe("bad conf file: top-level element not <regex-normalize>");

      NodeList regexes = root.getChildNodes();
      for (int i = 0; i < regexes.getLength(); i++) {
        Node regexNode = regexes.item(i);
        if (!(regexNode instanceof Element))
          continue;                       // skip whitespace text and comments
        Element regex = (Element) regexNode;
        if (!"regex".equals(regex.getTagName()))
          LOG.warning("bad conf file: element not <regex>");

        NodeList fields = regex.getChildNodes();
        String patternValue = null;
        String subValue = null;
        for (int j = 0; j < fields.getLength(); j++) {
          Node fieldNode = fields.item(j);
          if (!(fieldNode instanceof Element))
            continue;
          Element field = (Element) fieldNode;
          String tag = field.getTagName();

          if ("pattern".equals(tag)) {
            if (field.hasChildNodes())
              patternValue = ((Text) field.getFirstChild()).getData();
          } else if ("substitution".equals(tag)) {
            // Only an empty <substitution> stands for the empty string; other
            // empty elements must not overwrite a substitution already read.
            subValue = field.hasChildNodes()
              ? ((Text) field.getFirstChild()).getData()
              : "";
          }
        }

        if (patternValue != null && subValue != null) {
          Rule rule = new Rule();
          rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
          rule.substitution = subValue;
          rules.add(rule);
        }
      }

    } catch (Exception e) {
      // Deliberately best-effort: report the problem and keep what was read.
      LOG.severe("error parsing " + filename +" conf file: " + e);
    }
    return rules;
  }

  /** Spits out patterns and substitutions that are in the configuration file. */
  public static void main(String args[])
    throws MalformedPatternException, IOException {
    RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
    Iterator i = normalizer.rules.iterator();
    while (i.hasNext()) {
      Rule r = (Rule) i.next();
      System.out.print(r.pattern.getPattern() + " ");
      System.out.println(r.substitution);
    }
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -