📄 link.java
字号:
/* * WebSphinx web-crawling toolkit * * Copyright (c) 1998-2002 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */package websphinx;import java.util.Enumeration;import java.io.File;import java.net.URL;import java.net.MalformedURLException;import rcm.util.Prioritized;/** * Link to a Web page. * * @author Rob Miller * @see Page */public class Link extends Element implements Prioritized { protected URL url; private String directory; private String filename; private String query; private String ref; private Page page; private int depth; private String text = ""; // stored text of link anchor private int status = LinkEvent.NONE; private float priority; private DownloadParameters dp; // timeouts, etc. to use when downloading this link /** * Make a Link from a start tag and end tag and a base URL (for relative references). * The tags must be on the same page. * @param startTag Start tag of element * @param endTag End tag of element * @param base Base URL used for relative references */ public Link (Tag startTag, Tag endTag, URL base) throws MalformedURLException { super (startTag, endTag); url = urlFromHref (startTag, base); depth = startTag.getSource().getDepth() + 1; } /** * Make a Link from a URL. */ public Link (URL url) { super (new Tag (new Page (""), 0, 0, "", true), null); this.url = url; depth = 0; } /** * Make a Link from a File. */ public Link (File file) throws MalformedURLException { this (FileToURL (file)); } /** * Make a Link from a string URL. * @exception java.net.MalformedURLException if the URL is invalid */ public Link (String href) throws MalformedURLException { this (new URL (href)); depth = 0; } /** * Eliminate all references to page content. */ public void discardContent () { parent = null; child = null; sibling = null; } /** * Disconnect this link from its downloaded page (throwing away the page). */ public void disconnect () { page = null; status = LinkEvent.NONE; } /** * Get depth of link in crawl. * @return depth of link from root (depth of roots is 0) */ public int getDepth () { return depth; } /** * Get the URL. * @return the URL of the link */ public URL getURL () { return url; } /** * Get the network protocol of the link, like "ftp" or "http". * @return the protocol portion of the link's URL */ public String getProtocol () { return getURL().getProtocol (); } /** * Get the hostname of the link, like "www.cs.cmu.edu". * @return the hostname portion of the link's URL */ public String getHost () { return getURL().getHost (); } /** * Get the port number of the link. * @return the port number of the link's URL, or -1 if no port number * is explicitly specified in the URL */ public int getPort () { return getURL().getPort (); } /** * Get the filename part of the link, which includes the pathname * and query but not the anchor reference. * Equivalent to getURL().getFile(). * @return the filename portion of the link's URL */ public String getFile () { return getURL().getFile (); } /** * Get the directory part of the link, like "/home/dir/". * Always starts and ends with '/'. * @return the directory portion of the link's URL */ public String getDirectory () { if (directory == null) parseURL (); return directory; } /** * Get the filename part of the link, like "index.html". * Never contains '/'; may be the empty string. * @return the filename portion of the link's URL */ public String getFilename () { if (filename == null) parseURL (); return filename; } /** * Get the query part of the link. * Either starts with a '?', or is empty. * @return the query portion of the link's URL */ public String getQuery () { if (query == null) parseURL (); return query; } /** * Get the anchor reference of the link, like "#ref". * Either starts with '#', or is empty. * @return the anchor reference portion of the link's URL */ public String getRef () { if (ref == null) parseURL (); return ref; } /** * Get the URL of a page, omitting any anchor reference (like #ref). * @return the URL sans anchor reference */ public URL getPageURL () { return getPageURL (getURL()); } /** * Get the URL of a page, omitting any anchor reference (like #ref). * @return the URL sans anchor reference */ public static URL getPageURL (URL url) { String href = url.toExternalForm (); int i = href.indexOf ('#'); try { return (i != -1) ? new URL(href.substring (0, i)) : url; } catch (MalformedURLException e) { return url; } } /** * Get the URL of a Web service, omitting any query or anchor reference. * @return the URL sans query and anchor reference */ public URL getServiceURL () { return getServiceURL (getURL()); } /** * Get the URL of a Web service, omitting any query or anchor reference. * @return the URL sans query and anchor reference */ public static URL getServiceURL (URL url) { String href = url.toExternalForm (); int i = href.indexOf ('?'); try { return (i != -1 && url.getProtocol().equals ("http")) ? new URL(href.substring (0, i)) : getPageURL(url); } catch (MalformedURLException e) { return url; } } /** * Get the URL of a page's directory. * @return the URL sans filename, query and anchor reference */ public URL getDirectoryURL () { return getDirectoryURL (getURL()); } /** * Get the URL of a page's directory. * @return the URL sans filename, query and anchor reference */ public static URL getDirectoryURL (URL url) { String file = url.getFile(); int qmark = file.indexOf ('?'); if (qmark == -1 || !url.getProtocol().equals ("http")) qmark = file.length(); // find pivotal separator (between directory and filename) int pivot = file.lastIndexOf ('/', Math.max(qmark-1, 0)); try { if (pivot == -1) return new URL (url, "/"); else if (pivot == file.length()-1) return url; else return new URL (url, file.substring (0, pivot+1)); } catch (MalformedURLException e) { return url; } } /** * Get the URL of a page's parent directory. * @return the URL sans filename, query and anchor reference */ public URL getParentURL () { return getParentURL (getURL()); } /** * Get the URL of a page's parent directory. * @return the URL sans filename, query and anchor reference */ public static URL getParentURL (URL url) { URL dirURL = getDirectoryURL (url); if (!dirURL.equals (url)) return dirURL; String dir = dirURL.getFile (); int lastSlash = dir.length()-1; if (lastSlash == 0) return dirURL; int penultSlash = dir.lastIndexOf ('/', lastSlash-1); if (penultSlash == -1) return dirURL; try {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -