📄 link.java

📁 java写的crawler
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * WebSphinx web-crawling toolkit * * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */package websphinx;import java.util.Enumeration;import java.io.File;import java.net.URL;import java.net.MalformedURLException;import rcm.util.Prioritized;/** * Link to a Web page. *  * @author Rob Miller * @see Page */public class Link extends Element implements Prioritized {    protected URL url;        private String directory;    private String filename;    private String query;    private String ref;    private Page page;    private int depth;    private String text = "";   // stored text of link anchor     private int status = LinkEvent.NONE;    private float priority;    private DownloadParameters dp;          // timeouts, etc. to use when downloading this link    /**     * Make a Link from a start tag and end tag and a base URL (for relative references).       * The tags must be on the same page.     * @param startTag Start tag of element     * @param endTag End tag of element     * @param base Base URL used for relative references     */    public Link (Tag startTag, Tag endTag, URL base) throws MalformedURLException {        super (startTag, endTag);        url = urlFromHref (startTag, base);        depth = startTag.getSource().getDepth() + 1;    }    /**     * Make a Link from a URL.     */    public Link (URL url) {        super (new Tag (new Page (""), 0, 0, "", true), null);        this.url = url;        depth = 0;    }    /**     * Make a Link from a File.     */    public Link (File file) throws MalformedURLException {        this (FileToURL (file));    }    /**     * Make a Link from a string URL.     * @exception java.net.MalformedURLException if the URL is invalid     */    public Link (String href) throws MalformedURLException {        this (new URL (href));        depth = 0;    }    /**     * Eliminate all references to page content.     */    public void discardContent () {        parent = null;        child = null;        sibling = null;    }            /**     * Disconnect this link from its downloaded page (throwing away the page).     */    public void disconnect () {        page = null;        status = LinkEvent.NONE;    }            /**     * Get depth of link in crawl.     * @return depth of link from root (depth of roots is 0)     */    public int getDepth () {        return depth;    }        /**     * Get the URL.     * @return the URL of the link     */     public URL getURL () {        return url;    }    /**     * Get the network protocol of the link, like "ftp" or "http".     * @return the protocol portion of the link's URL     */    public String getProtocol () {        return getURL().getProtocol ();    }    /**     * Get the hostname of the link, like "www.cs.cmu.edu".     * @return the hostname portion of the link's URL     */    public String getHost () {        return getURL().getHost ();    }    /**     * Get the port number of the link.     * @return the port number of the link's URL, or -1 if no port number     * is explicitly specified in the URL     */    public int getPort () {        return getURL().getPort ();    }    /**     * Get the filename part of the link, which includes the pathname     * and query but not the anchor reference.     * Equivalent to getURL().getFile().     * @return the filename portion of the link's URL      */    public String getFile () {        return getURL().getFile ();    }    /**     * Get the directory part of the link, like "/home/dir/".     * Always starts and ends with '/'.     * @return the directory portion of the link's URL     */    public String getDirectory () {        if (directory == null)            parseURL ();        return directory;    }    /**     * Get the filename part of the link, like "index.html".     * Never contains '/'; may be the empty string.     * @return the filename portion of the link's URL     */    public String getFilename () {        if (filename == null)            parseURL ();        return filename;    }    /**     * Get the query part of the link.     * Either starts with a '?', or is empty.     * @return the query portion of the link's URL     */    public String getQuery () {        if (query == null)            parseURL ();        return query;    }    /**     * Get the anchor reference of the link, like "#ref".     * Either starts with '#', or is empty.     * @return the anchor reference portion of the link's URL     */    public String getRef () {        if (ref == null)            parseURL ();        return ref;    }    /**     * Get the URL of a page, omitting any anchor reference (like #ref).     * @return the URL sans anchor reference     */    public URL getPageURL () {        return getPageURL (getURL());    }    /**     * Get the URL of a page, omitting any anchor reference (like #ref).     * @return the URL sans anchor reference     */    public static URL getPageURL (URL url) {        String href = url.toExternalForm ();        int i = href.indexOf ('#');        try {            return (i != -1) ? new URL(href.substring (0, i)) : url;        } catch (MalformedURLException e) {            return url;        }    }    /**     * Get the URL of a Web service, omitting any query or anchor reference.     * @return the URL sans query and anchor reference     */    public URL getServiceURL () {        return getServiceURL (getURL());    }            /**     * Get the URL of a Web service, omitting any query or anchor reference.     * @return the URL sans query and anchor reference     */    public static URL getServiceURL (URL url) {        String href = url.toExternalForm ();        int i = href.indexOf ('?');        try {            return (i != -1 && url.getProtocol().equals ("http"))                 ? new URL(href.substring (0, i))                 : getPageURL(url);        } catch (MalformedURLException e) {            return url;        }    }    /**     * Get the URL of a page's directory.     * @return the URL sans filename, query and anchor reference     */    public URL getDirectoryURL () {        return getDirectoryURL (getURL());    }            /**     * Get the URL of a page's directory.     * @return the URL sans filename, query and anchor reference     */    public static URL getDirectoryURL (URL url) {        String file = url.getFile();        int qmark = file.indexOf ('?');        if (qmark == -1 || !url.getProtocol().equals ("http"))            qmark = file.length();        // find pivotal separator (between directory and filename)        int pivot = file.lastIndexOf ('/', Math.max(qmark-1, 0));        try {            if (pivot == -1)                return new URL (url, "/");            else if (pivot == file.length()-1)                return url;            else                return new URL (url, file.substring (0, pivot+1));        } catch (MalformedURLException e) {            return url;        }    }    /**     * Get the URL of a page's parent directory.     * @return the URL sans filename, query and anchor reference     */    public URL getParentURL () {        return getParentURL (getURL());    }            /**     * Get the URL of a page's parent directory.     * @return the URL sans filename, query and anchor reference     */    public static URL getParentURL (URL url) {        URL dirURL = getDirectoryURL (url);        if (!dirURL.equals (url))            return dirURL;        String dir = dirURL.getFile ();        int lastSlash = dir.length()-1;        if (lastSlash == 0)            return dirURL;                    int penultSlash = dir.lastIndexOf ('/', lastSlash-1);        if (penultSlash == -1)            return dirURL;        try {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -