📄 basicurlnormalizer.java
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import java.net.MalformedURLException;
import java.net.URL;
// import java.net.URI;
// import java.net.URISyntaxException;
import java.util.Locale;
import java.util.logging.Logger;
import net.nutch.util.LogFormatter;
import org.apache.oro.text.regex.*;
/**
 * Converts URLs to a normal form.
 *
 * <p>For <code>http</code> and <code>ftp</code> URLs the host is lowercased,
 * an explicitly given default port is dropped, an empty file becomes "/",
 * the fragment ("ref") is removed, and unnecessary "/xx/../" and leading
 * "/../" path segments are collapsed.  URLs of other protocols are only
 * rebuilt when their protocol was not written in the form reported by
 * {@link URL#getProtocol()}.
 */
public class BasicUrlNormalizer implements UrlNormalizer {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");

  /** Hands every calling thread its own <code>Perl5Matcher</code>. */
  private ThreadLocal matchers = new ThreadLocal() {
      protected Object initialValue() {
        return new Perl5Matcher();
      }
    };

  /** Replaces the first "/xx/../" (xx is not made only of dots) by "/". */
  private final Rule relativePathRule;

  /** Replaces a leading "/../" by "/". */
  private final Rule leadingRelativePathRule;

  /**
   * Compiles the two path-rewriting rules.
   *
   * @throws RuntimeException if one of the built-in patterns fails to
   *         compile; the <code>MalformedPatternException</code> is kept as
   *         the cause
   */
  public BasicUrlNormalizer() {
    // the compiler is only needed while the rules are being built
    Perl5Compiler compiler = new Perl5Compiler();
    try {
      // this pattern tries to find spots like "/xx/../" in the url, which
      // could be replaced by "/".  xx consists of chars different from "/"
      // (slash) and needs to have at least one char different from "."
      relativePathRule = new Rule(
        (Perl5Pattern) compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
                                        Perl5Compiler.READ_ONLY_MASK),
        new Perl5Substitution("/"));

      // this pattern tries to find spots like a leading "/../" in the url,
      // which could be replaced by "/"
      leadingRelativePathRule = new Rule(
        (Perl5Pattern) compiler.compile("^(/\\.\\./)+",
                                        Perl5Compiler.READ_ONLY_MASK),
        new Perl5Substitution("/"));
    } catch (MalformedPatternException e) {
      // the cause travels with the exception, so nothing is printed here
      throw new RuntimeException(
        "Cannot compile the URL normalization patterns", e);
    }
  }

  /**
   * Returns the normal form of <code>urlString</code>.
   *
   * <p>Surrounding whitespace is removed first; a string that is empty
   * after trimming is returned as the empty string.  Only the path part of
   * the URL is searched for "/../" segments; a query string (everything
   * from the first "?" on) is carried over unchanged.
   *
   * @param urlString the URL to normalize
   * @return the normalized URL, or the trimmed input when nothing had to be
   *         changed
   * @throws MalformedURLException if the trimmed, non-empty string cannot
   *         be parsed by {@link URL}
   */
  public String normalize(String urlString)
    throws MalformedURLException {

    // trim before the emptiness test so that whitespace-only input is
    // treated like the empty string instead of failing in new URL("")
    urlString = urlString.trim();
    if ("".equals(urlString)) {                   // permit empty
      return urlString;
    }

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    // protocol was rewritten (e.g. lowercased) by URL
    boolean changed = !urlString.startsWith(protocol);

    if ("http".equals(protocol) || "ftp".equals(protocol)) {

      if (host != null) {
        // fixed locale: with the default locale a Turkish JVM would turn
        // "I" into a dotless i and corrupt the host name
        String lowerHost = host.toLowerCase(Locale.ENGLISH);
        if (!host.equals(lowerHost)) {
          host = lowerHost;
          changed = true;
        }
      }

      if (port == url.getDefaultPort()) {         // uses default port
        port = -1;                                // so don't specify it
        changed = true;
      }

      if (file == null || "".equals(file)) {      // add a slash
        file = "/";
        changed = true;
      }

      if (url.getRef() != null) {                 // remove the ref
        changed = true;
      }

      // check for unnecessary use of "/../" -- in the path only.
      // getFile() also contains the query, and a "/../" inside a query
      // value must neither be rewritten nor swallow the path before it.
      int queryStart = file.indexOf('?');
      String path = file;
      String query = "";
      if (queryStart >= 0) {
        path = file.substring(0, queryStart);
        query = file.substring(queryStart);
      }
      String cleanPath = substituteUnnecessaryRelativePaths(path);
      if (!path.equals(cleanPath)) {
        file = cleanPath + query;
        changed = true;
      }
    }

    if (changed) {
      // NOTE(review): the rebuilt URL is made only of protocol, host, port
      // and file, so any user-info part of the original is not carried
      // over -- confirm that this is intended.
      urlString = new URL(protocol, host, port, file).toString();
    }

    return urlString;
  }

  /**
   * Collapses "/xx/../" segments and leading "/../" segments of
   * <code>path</code>.
   *
   * <p>The substitutions are applied one occurrence at a time and repeated
   * until a pass leaves the length unchanged, so that nested constellations
   * are resolved as well.  For example "/aa/bb/../../cc/../foo.html" is
   * reduced in these steps:
   * <pre>
   *   "/aa/bb/../../cc/../foo.html"
   *   "/aa/../cc/../foo.html"
   *   "/cc/../foo.html"
   *   "/foo.html"
   * </pre>
   *
   * <p>A leading "/../" is replaced by "/" because it is rather a sign of a
   * badly configured web server than of a wanted link.
   *
   * @param path the path to clean up
   * @return the path without the unnecessary relative segments
   */
  private String substituteUnnecessaryRelativePaths(String path) {
    Perl5Matcher matcher = (Perl5Matcher) matchers.get();

    String current = path;
    int previousLength;
    do {
      previousLength = current.length();

      // substitute first occurrence of "/xx/../" by "/"
      current = Util.substitute
        (matcher, relativePathRule.pattern,
         relativePathRule.substitution, current, 1);

      // remove leading "/../"
      current = Util.substitute
        (matcher, leadingRelativePathRule.pattern,
         leadingRelativePathRule.substitution, current, 1);

      // both rules can only shorten the string, so an unchanged length
      // means that neither of them matched in this pass
    } while (current.length() != previousLength);

    return current;
  }

  /**
   * Class which holds a compiled pattern and its corresponding substitution
   * string.
   */
  private static class Rule {
    public final Perl5Pattern pattern;
    public final Perl5Substitution substitution;

    Rule(Perl5Pattern pattern, Perl5Substitution substitution) {
      this.pattern = pattern;
      this.substitution = substitution;
    }
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -