📄 urltodownload.java
字号:
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/URLToDownload.java,v 1.3 2006/02/16 06:57:00 zhangdi Exp $
package cn.yicha.subject.spider;
import java.util.Set;
import java.net.URL;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import org.apache.log4j.Category;
import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringParser;
import cn.yicha.subject.spdier.url.SpUrl;
/**
 * One unit of crawl work: the URL to fetch together with the context it was
 * found in (referer, link depth, service ID) and the filter pattern that
 * decides whether the URL may be downloaded.
 *
 * <p>The class is {@link java.io.Serializable} and declares no
 * {@code serialVersionUID}; avoid adding or removing non-private members
 * unless a change of the default serialized form is intended.
 */
public class URLToDownload implements java.io.Serializable
{
    /** URL to download; replaced by {@link #applyProperUrl()} for Monternet addresses. */
    private URL url;
    /** URL of the page that linked to {@link #url}; may be null. */
    private final URL referer;
    /** Link depth supplied by the creator of this object. */
    private final int depth;
    /**
     * For Monternet search this is the SP service identifier; for free
     * web-page search it is the second-level domain.
     */
    private String serviceID;
    // NOTE(review): presumably "before subscription" - confirm with callers.
    private boolean isBeforeSubs = true;
    /**
     * The original URL if it did not change, otherwise the URL after
     * redirection. Within this class it is only assigned by
     * {@link #setEndURL(URL)} and is null until then.
     */
    private URL endUrl;
    /** URL filter pattern string, see {@link #isSatisfiedUrl()}. */
    private String filterPattern;
    private final static Category _logClass = Category.getInstance(URLToDownload.class);

    static
    {
        // Project logging helper, invoked once when this class is loaded.
        Log4j.init();
    }

    /**
     * Creates an entry with no referer and no filter pattern;
     * {@code isBeforeSubs} is set to true.
     */
    public URLToDownload(URL url, int depth, String serviceID)
    {
        this(url, null, depth, serviceID, null, true);
    }

    /**
     * Creates an entry with no referer and no filter pattern.
     *
     * @param subsStatus initial value of the {@code isBeforeSubs} flag
     */
    public URLToDownload(URL url, int depth, String serviceID, boolean subsStatus)
    {
        this(url, null, depth, serviceID, null, subsStatus);
    }

    /**
     * Creates an entry with no referer; {@code isBeforeSubs} is set to true.
     *
     * @param filterPattern filter pattern string, see {@link #isSatisfiedUrl()}
     */
    public URLToDownload(URL url, int depth, String serviceID, String filterPattern)
    {
        this(url, null, depth, serviceID, filterPattern, true);
    }

    /**
     * Creates a fully specified entry.
     *
     * @param url           URL to download
     * @param referer       URL of the referring page, may be null
     * @param depth         link depth of {@code url}
     * @param serviceID     SP service identifier or second-level domain
     * @param filterPattern filter pattern string, may be null (then
     *                      {@link #isSatisfiedUrl()} always returns false)
     * @param subsStatus    initial value of the {@code isBeforeSubs} flag
     */
    public URLToDownload(URL url, URL referer, int depth, String serviceID, String filterPattern, boolean subsStatus)
    {
        this.url = url;
        this.referer = referer;
        this.depth = depth;
        this.serviceID = serviceID;
        this.filterPattern = filterPattern;
        this.isBeforeSubs = subsStatus;
    }

    /** @return the URL to download */
    public URL getURL()
    {
        return url;
    }

    /** @return the referring URL, or null if none was given */
    public URL getReferer()
    {
        return referer;
    }

    /** @return the value last passed to {@link #setEndURL(URL)}, or null */
    public URL getEndURL()
    {
        return endUrl;
    }

    /** @param endUrl the final URL (the original one, or the redirect target) */
    public void setEndURL(URL endUrl)
    {
        this.endUrl = endUrl;
    }

    /** @return the link depth given at construction */
    public int getDepth()
    {
        return depth;
    }

    /** @return the SP service identifier or second-level domain */
    public String getServiceID()
    {
        return serviceID;
    }

    public void setServiceID(String serviceID)
    {
        this.serviceID = serviceID;
    }

    /** @return the current {@code isBeforeSubs} flag */
    public boolean getIsBeforeSubs()
    {
        return isBeforeSubs;
    }

    public void setIsBeforeSubs(boolean isBeforeSubs)
    {
        this.isBeforeSubs = isBeforeSubs;
    }

    /** @return the filter pattern string, or null if none is set */
    public String getFilterPattern()
    {
        return filterPattern;
    }

    public void setFilterPattern(String filterPattern)
    {
        this.filterPattern = filterPattern;
    }

    /** @return the URL, referer and depth in a single human-readable line */
    public String toString()
    {
        return url + ", referer " + referer + ", depth " + depth;
    }

    /**
     * @return true if the lower-cased external form of the URL contains
     *         ".htm" anywhere after its first character
     */
    public boolean isHtml()
    {
        String str = url.toExternalForm().toLowerCase();
        return str.indexOf(".htm") > 0;
    }

    /**
     * Matches a URL against a simple wildcard pattern.
     *
     * <p>The pattern is split on '*'; the resulting literal segments must all
     * occur in the URL, in order and without overlapping. The match is not
     * anchored: text before the first segment and after the last one is
     * ignored. A pattern consisting only of '*' characters has no segments
     * and therefore matches every URL.
     *
     * <p>Example: the pattern
     * {@code www.*yes.com/*)/game/a.jsp?lxt=*&id=} matches
     * {@code http://www.joyes.com/(aadsfdafa)/game/a.jsp?lxt=ads&id=888}.
     *
     * @param pattern wildcard pattern; null or empty never matches
     * @param url     text to test; null never matches
     * @return true if every segment of the pattern is found in order
     */
    public static boolean isMatchUrl(String pattern, String url) {
        if (pattern == null || url == null || pattern.length() == 0) {
            return false;
        }
        String[] subPatterns = pattern.split("\\*");
        // Index in url from which the next segment has to be found.
        int from = 0;
        for (int i = 0; i < subPatterns.length; i++) {
            String sp = subPatterns[i];
            int pos = url.indexOf(sp, from);
            if (pos < 0) {
                return false;
            }
            from = pos + sp.length();
        }
        return true;
    }

    /**
     * Checks whether the URL satisfies the configured filter pattern.
     *
     * <p>The filter is a ';'-separated list of wildcard patterns (see
     * {@link #isMatchUrl(String, String)}), compared case-insensitively.
     * A pattern starting with '!' is an exclusion pattern. The rules are:
     * <ol>
     * <li>a URL matching any exclusion pattern is rejected;</li>
     * <li>a URL matching at least one inclusion pattern and no exclusion
     *     pattern is accepted;</li>
     * <li>everything else is rejected.</li>
     * </ol>
     * Empty list entries (for example from ";;" or a leading ';') are ignored.
     *
     * @return true if the URL is accepted; always false when no filter
     *         pattern is set
     */
    public boolean isSatisfiedUrl()
    {
        final char _NOT_MATCH_PREFIX = '!';
        String filter = getFilterPattern();
        // Without a filter nothing matches.
        if (filter == null) {
            return false;
        }
        String urlAddr = url.toExternalForm().toLowerCase();
        String[] patterns = filter.split(";");
        boolean haveMatched = false;
        for (int i = 0; i < patterns.length; i++)
        {
            String pattern = patterns[i].toLowerCase();
            // split() yields "" for an empty filter or consecutive ';';
            // charAt(0) below would throw on such an entry.
            if (pattern.length() == 0) {
                continue;
            }
            if (pattern.charAt(0) == _NOT_MATCH_PREFIX)
            {
                // Exclusion pattern: a hit rejects the URL immediately.
                if (isMatchUrl(pattern.substring(1), urlAddr)) {
                    return false;
                }
            }
            else if (isMatchUrl(pattern, urlAddr)) {
                // Inclusion pattern: remember the hit, later exclusions may still reject.
                haveMatched = true;
            }
        }
        return haveMatched;
    }

    /**
     * Checks whether the URL is a legal address that may be downloaded:
     * it must satisfy the filter pattern and must not match the built-in
     * pattern of addresses that need not be visited.
     *
     * @param downloadMonternet selects which built-in Monternet pattern is
     *        applied; the pattern semantics are defined by
     *        {@code StringParser.matchPattern}
     * @return true if the URL may be downloaded
     */
    public boolean isValidUrl(boolean downloadMonternet)
    {
        // The URL has to pass the configured filter first.
        if (!isSatisfiedUrl()) {
            return false;
        }
        // Then check the default pattern of addresses that need not be visited.
        String pattern;
        if (downloadMonternet) {
            pattern = "wap.monternet.com/*\\s*$";
        }
        else {
            pattern = "wap.monternet.com";
        }
        if (StringParser.matchPattern(getURL().toExternalForm(), pattern)) {
            return false;
        }
        return true;
    }

    /**
     * Checks whether the URL is a legal address that may be downloaded.
     *
     * @return the result of {@link #isSatisfiedUrl()}
     */
    public boolean isValidUrl()
    {
        return isSatisfiedUrl();
    }

    /**
     * Currently accepts every anchor: the extension checks below are
     * commented out, so all three parameters are ignored.
     *
     * @return always true
     */
    public boolean isValidAnchor(Set ringExtensions, Set gameExtensions, Set invalidExtensions)
    {
        /*String urlAddr = url.toExternalForm();
        // Reject URLs whose suffix is not a multimedia format but which
        // contain a multimedia format string
        String[] invalidSuffix = (String[]) invalidExtensions.toArray(new String[0]);
        for (int m=0; m < invalidSuffix.length; m++)
        {
            String suffix = "." + invalidSuffix[m];
            if (urlAddr.toLowerCase().indexOf(suffix) >= 0) {
                _logClass.info("invalid anchor --> " + urlAddr);
                return false;
            }
        }
        String urlSuffix = getUrlSuffix(urlAddr.toLowerCase());
        // Check game suffixes
        String[] extensions = (String[]) gameExtensions.toArray(new String[0]);
        for (int i=0; i < extensions.length; i++) {
            String suffix = "." + extensions[i];
            if (urlSuffix.indexOf(suffix.toLowerCase()) >= 0) {
                _logClass.info("[game anchor] --> " + urlAddr);
                return false;
            }
        }
        // Check ring-tone suffixes
        extensions = (String[]) ringExtensions.toArray(new String[0]);
        for (int i=0; i < extensions.length; i++) {
            String suffix = "." + extensions[i];
            if (urlSuffix.indexOf(suffix.toLowerCase()) >= 0) {
                _logClass.info("[ring anchor] --> " + urlAddr);
                return false;
            }
        }*/
        return true;
    }

    /**
     * Extracts the suffix of the target resource from an absolute URL.
     *
     * <p>Only referenced from the disabled code in
     * {@link #isValidAnchor(Set, Set, Set)}.
     *
     * @param urlAbs absolute URL, possibly URL-encoded
     * @return the text from the last '.' of the last path segment up to an
     *         optional '#' (leading dot included); "wml" (no dot) when the
     *         URL has no '/' after the "http://" prefix or the last segment
     *         has no '.'
     */
    private String getUrlSuffix(String urlAbs)
    {
        final String _DEFAULT_SUFFIX = "wml";
        final String _HTTP_PREFIX = "http://";
        String suffix = _DEFAULT_SUFFIX;
        try {
            urlAbs = URLDecoder.decode(urlAbs);
        }
        catch (Exception ex) {
            // Best effort: continue with the undecoded text.
            _logClass.info("exception url --> " + urlAbs, ex);
        }
        // Drop a trailing "/".
        if (urlAbs.endsWith("/")) {
            urlAbs = urlAbs.substring(0, urlAbs.length() - 1);
        }
        // Keep only the part before '?' and before ';'.
        int pos1 = urlAbs.indexOf("?");
        if (pos1 >= 0) {
            urlAbs = urlAbs.substring(0, pos1);
        }
        pos1 = urlAbs.indexOf(";");
        if (pos1 >= 0) {
            urlAbs = urlAbs.substring(0, pos1);
        }
        // Strip the http:// prefix so its slashes are not taken for path separators.
        if (urlAbs.startsWith(_HTTP_PREFIX)) {
            urlAbs = urlAbs.substring(_HTTP_PREFIX.length());
        }
        // Keep the last path segment; without any "/" fall back to the default suffix.
        int pos2 = urlAbs.lastIndexOf("/");
        if (pos2 < 0) {
            return suffix;
        }
        urlAbs = urlAbs.substring(pos2);
        int dotPos = urlAbs.lastIndexOf(".");
        if (dotPos >= 0) {
            suffix = urlAbs.substring(dotPos);
        }
        // Cut off '#' and everything after it.
        pos2 = suffix.indexOf("#");
        if (pos2 >= 0) {
            suffix = suffix.substring(0, pos2);
        }
        return suffix;
    }

    /**
     * If the URL is a Monternet address, replaces it with the address
     * produced by {@code SpUrl.transSpUrl}. If the translated text is not a
     * valid URL the error is logged and the current URL is kept.
     */
    public void applyProperUrl()
    {
        String properUrl = getURL().toExternalForm();
        _logClass.debug("apply proper url --> " + properUrl);
        if (!SpUrl.isMonternetUrl(properUrl)) {
            return;
        }
        properUrl = SpUrl.transSpUrl(properUrl);
        _logClass.info("trans url --> " + properUrl);
        try {
            this.url = new URL(properUrl);
        }
        catch (MalformedURLException ex) {
            _logClass.error("malformed trans url --> " + properUrl, ex);
        }
    }

    /**
     * Extracts the service ID contained in the URL.
     *
     * @return the value of {@code SpUrl.getServiceIDFromUrl} when the URL is
     *         a Monternet address, otherwise null (not an empty string)
     */
    public String fetchServiceIDFromUrl()
    {
        String sourceUrl = getURL().toExternalForm();
        if (SpUrl.isMonternetUrl(sourceUrl)) {
            return SpUrl.getServiceIDFromUrl(sourceUrl);
        }
        return null;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -