📄 link.java
字号:
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University * * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;import java.util.Enumeration;import java.io.File;import java.net.URL;import java.net.MalformedURLException;import websphinx.util.Prioritized;/** * Link to a Web page. * * @author Rob Miller * @see Page */public class Link extends Element implements Prioritized { protected URL url; private String directory; private String filename; private String query; private String ref; private Page page; private int depth; private String text = ""; // stored text of link anchor private int status = LinkEvent.NONE; private float priority; private DownloadParameters dp; // timeouts, etc. to use when downloading this link /** * Make a Link from a start tag and end tag and a base URL (for relative references). * The tags must be on the same page. * @param startTag Start tag of element * @param endTag End tag of element * @param base Base URL used for relative references */ public Link (Tag startTag, Tag endTag, URL base) throws MalformedURLException { super (startTag, endTag); url = urlFromHref (startTag, base); depth = startTag.getSource().getDepth() + 1; } /** * Make a Link from a URL. */ public Link (URL url) { super (new Tag (new Page (""), 0, 0, "", true), null); this.url = url; depth = 0; } /** * Make a Link from a File. */ public Link (File file) throws MalformedURLException { this (FileToURL (file)); } /** * Make a Link from a string URL. * @exception java.net.MalformedURLException if the URL is invalid */ public Link (String href) throws MalformedURLException { this (new URL (href)); depth = 0; } /** * Eliminate all references to page content. */ public void discardContent () { parent = null; child = null; sibling = null; } /** * Disconnect this link from its downloaded page (throwing away the page). */ public void disconnect () { page = null; status = LinkEvent.NONE; } /** * Get depth of link in crawl. * @return depth of link from root (depth of roots is 0) */ public int getDepth () { return depth; } /** * Get the URL. * @return the URL of the link */ public URL getURL () { return url; } /** * Get the network protocol of the link, like "ftp" or "http". * @return the protocol portion of the link's URL */ public String getProtocol () { return getURL().getProtocol (); } /** * Get the hostname of the link, like "www.cs.cmu.edu". * @return the hostname portion of the link's URL */ public String getHost () { return getURL().getHost (); } /** * Get the port number of the link. * @return the port number of the link's URL, or -1 if no port number * is explicitly specified in the URL */ public int getPort () { return getURL().getPort (); } /** * Get the information part of the link, like * "/home/dir/index.html?query". Equivalent to getURL().getFile(). * @return the filename portion of the link's URL */ public String getFile () { return getURL().getFile (); } /** * Get the directory part of the link, like "/home/dir/". * Always starts and ends with '/'. * @return the directory portion of the link's URL */ public String getDirectory () { if (directory == null) parseURL (); return directory; } /** * Get the filename part of the link, like "index.html". * Never contains '/'; may be the empty string. * @return the filename portion of the link's URL */ public String getFilename () { if (filename == null) parseURL (); return filename; } /** * Get the query part of the link, like "?query". * Either starts with a '?', or is empty. * @return the query portion of the link's URL */ public String getQuery () { if (query == null) parseURL (); return query; } /** * Get the anchor reference of the link, like "#ref". * Either starts with '#', or is empty. * @return the anchor reference portion of the link's URL */ public String getRef () { if (ref == null) parseURL (); return ref; } /** * Get the URL of a page, omitting any anchor reference (like #ref). * @return the URL sans anchor reference */ public URL getPageURL () { return getPageURL (getURL()); } /** * Get the URL of a page, omitting any anchor reference (like #ref). * @return the URL sans anchor reference */ public static URL getPageURL (URL url) { String href = url.toExternalForm (); int i = href.indexOf ('#'); try { return (i != -1) ? new URL(href.substring (0, i)) : url; } catch (MalformedURLException e) { return url; } } /** * Get the URL of a Web service, omitting any query or anchor reference. * @return the URL sans query and anchor reference */ public URL getServiceURL () { return getServiceURL (getURL()); } /** * Get the URL of a Web service, omitting any query or anchor reference. * @return the URL sans query and anchor reference */ public static URL getServiceURL (URL url) { String href = url.toExternalForm (); int i = href.indexOf ('?'); try { return (i != -1 && url.getProtocol().equals ("http")) ? new URL(href.substring (0, i)) : getPageURL(url); } catch (MalformedURLException e) { return url; } } /** * Get the URL of a page's directory. * @return the URL sans filename, query and anchor reference */ public URL getDirectoryURL () { return getDirectoryURL (getURL()); } /** * Get the URL of a page's directory. * @return the URL sans filename, query and anchor reference */ public static URL getDirectoryURL (URL url) { String file = url.getFile(); int qmark = file.indexOf ('?'); if (qmark == -1 || !url.getProtocol().equals ("http")) qmark = file.length(); // find pivotal separator (between directory and filename) int pivot = file.lastIndexOf ('/', Math.max(qmark-1, 0)); try { if (pivot == -1) return new URL (url, "/"); else if (pivot == file.length()-1) return url; else return new URL (url, file.substring (0, pivot+1)); } catch (MalformedURLException e) { return url; } } /** * Get the URL of a page's parent directory. * @return the URL sans filename, query and anchor reference */ public URL getParentURL () { return getParentURL (getURL()); } /** * Get the URL of a page's parent directory. * @return the URL sans filename, query and anchor reference */ public static URL getParentURL (URL url) { URL dirURL = getDirectoryURL (url); if (!dirURL.equals (url)) return dirURL; String dir = dirURL.getFile (); int lastSlash = dir.length()-1; if (lastSlash == 0) return dirURL; int penultSlash = dir.lastIndexOf ('/', lastSlash-1); if (penultSlash == -1) return dirURL; try { return new URL (url, dir.substring (0, penultSlash+1)); } catch (MalformedURLException e) { return dirURL; } } // computes relative HREF for URL <there> when the current location // is URL <here> public static String relativeTo (URL here, URL there) { if (here == null) return there.toString();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -