📄 link.java

📁 java写的crawler
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
            return new URL (url, dir.substring (0, penultSlash+1));        } catch (MalformedURLException e) {            return dirURL;        }    }        // computes relative HREF for URL <there> when the current location    // is URL <here>    public static String relativeTo (URL here, URL there) {        if (here == null)            return there.toString();        //System.err.println ("From: " + here);        //System.err.println ("To:   " + there);        if (here.getProtocol().equals (there.getProtocol())            && here.getHost().equals (there.getHost ())            && here.getPort() == there.getPort ()) {            String fn = relativeTo (here.getFile (),                                    there.getFile ());            String ref = there.getRef ();            return (ref != null) ? fn+ref : fn;        }        else {          //System.err.println ("Use: " + there);            return there.toString ();        }    }    // computes relative HREF for URL <there> when the current location    // is URL <here>    public static String relativeTo (URL here, String there) {        if (here == null)            return there;      try {        return relativeTo (here, new URL (here, there));      } catch (MalformedURLException e) {        return there;      }    }    // computes relative HREF for filename <there> when the current location    // is filename <here>    private static String relativeTo (String here, String there) {        StringBuffer result = new StringBuffer ();        int lcp = 0;        while (true) {            int i = here.indexOf ('/', lcp);            int j = there.indexOf ('/', lcp);            if (i == -1 || i != j || !here.regionMatches (lcp, there, lcp, i-lcp))                break;            lcp = i+1;        }        // assert: first lcp characters of here and there are identical        //         and (lcp==0 or here[lcp-1] == '/')        // here[0..lcp-1] is the common ancestor directory of here and there        // count hops up from here to the common ancestor        for (int i = here.indexOf ('/', lcp);             i != -1;             i = here.indexOf ('/', i+1)) {            result.append ("..");            result.append ('/');        }        // append path down from common ancestor to there        result.append (there.substring (lcp));        //System.out.println ("Use:   " + result);        //System.out.println ();        return result.toString ();    }    /**     * Convert a local filename to a URL.     * For example, if the filename is "C:\FOO\BAR\BAZ",     * the resulting URL is "file:/C:/FOO/BAR/BAZ".     * @param file File to convert     * @return URL corresponding to file     */    public static URL FileToURL (File file) throws MalformedURLException {        return new URL ("file:" + toURLDelimiters (file.getAbsolutePath ()));    }        /**     * Convert a file: URL to a filename appropriate to the     * current system platform.  For example, on MS Windows,     * if the URL is "file:/FOO/BAR/BAZ", the resulting     * filename is "\FOO\BAR\BAZ".     * @param url URL to convert     * @return File corresponding to url     * @exception MalformedURLException if url is not a     * file: URL.     */    public static File URLToFile (URL url) throws MalformedURLException {        if (!url.getProtocol().equals ("file"))            throw new MalformedURLException ();                    String path = url.getFile ();        path = path.replace ('/', File.separatorChar);        // for MSWindows: change pathnames of the        // form /X:/ to X:/        if (path.length () > 3            && path.charAt (0) == File.separatorChar            && path.charAt(2) == ':'            && path.charAt (3) == File.separatorChar)            path = path.substring (1);                    return new File (path);    }        public static String toURLDelimiters (String path) {        path = path.replace ('\\', '/');        if (!path.startsWith ("/"))            path = "/" + path;                   return path;    }    /**     * Get the downloaded page to which the link points.     * @return the Page object, or null if the page hasn't been downloaded.     */    public Page getPage () {        return page;    }    /**     * Set the page corresponding to this link.     * @param page Page to which this link points     */    public void setPage (Page page) {        this.page = page;    }    /**     * Use the HTTP GET method to download this link.     */    public static final int GET = 0;    /**     * Use the HTTP POST method to access this link.     */    public static final int POST = 1;    /**     * Get the method used to access this link.     * @return GET or POST.     */     public int getMethod () {        return GET;    }    /**     * Convert the link's URL to a String     * @return the URL represented as a string     */    public String toURL () {        return getURL().toExternalForm ();    }    /**     * Generate a human-readable description of the link.     * @return a description of the link, in the form "[url]".     */    public String toDescription () {        return (text.length() > 0 ? text + " " : "") + "[" + getURL() + "]";    }    /**     * Convert the region to tagless text.     * @return a string consisting of the text in the page contained by this region     */    public String toText () {        return text;    }        /**     * Set the tagless-text representation of this region.     * @param text a string consisting of the text in the page contained by this region     */    public void setText (String text) {        this.text = text;    }    private void parseURL () {        String protocol = getProtocol();        String file = getFile();                int qmark = file.indexOf ('?');        if (qmark == -1 || !protocol.equals ("http")) {            query = "";            qmark = file.length();        }        else {            query = file.substring (qmark+1);            file = file.substring (0, qmark);        }            int slash = file.lastIndexOf ('/', Math.max(qmark-1, 0));        if (slash == -1) {            directory = "";            filename = file;        }        else {            directory = file.substring (0, slash+1);            filename = file.substring (slash+1);        }        ref = getURL().getRef ();        if (ref == null)            ref = "";    }    /**     * Construct the URL for a link element, from its start tag and a base URL (for relative references).     * @param tag Start tag of link, such as &lt;A HREF="/foo/index.html"&gt;.     * @param base Base URL used for relative references     * @return URL to which the link points     */    protected URL urlFromHref (Tag tag, URL base) throws MalformedURLException {        // element is a link -- make an instance of Link.        String hrefAttr = getHrefAttributeName (tag);        String href = tag.getHTMLAttribute (hrefAttr);        if (tag.tagName == Tag.APPLET) {            String codebase = tag.getHTMLAttribute ("codebase");            if (codebase != null)                base = new URL (base, codebase);        }        return new URL (base, href);    }    /**     * Copy the link's start tag, replacing the URL.  Note that the name of the attribute containing the URL     * varies from tag to tag: sometimes it is called HREF, sometimes SRC, sometimes CODE, etc.     * This method changes the appropriate attribute for this tag.     * @param newHref New URL or relative reference; e.g. "http://www.cs.cmu.edu/" or "/foo/index.html".     * @return copy of this link's start tag with its URL attribute replaced.  The copy is      * a region of a fresh page containing only the tag.     */    public Tag replaceHref (String newHref) {        Tag tag = startTag;                if (tag.getTagName() == Tag.APPLET) {            int i = newHref.lastIndexOf ('/');            if (i != -1) {                tag = startTag.replaceHTMLAttribute ("codebase", newHref.substring (0, i+1));                newHref = newHref.substring (i+1);            }        }        String hrefAttrName = getHrefAttributeName (tag);        if (hrefAttrName == null)            return tag;        return tag.replaceHTMLAttribute (hrefAttrName, newHref);    }        private static String getHrefAttributeName (Tag tag) {        return (String)HTMLParser.linktag.get (tag.getTagName ());    }    /**     * Get the status of the link.  Possible values are defined in LinkEvent.     * @return last event that happened to this link     */    public int getStatus () {        return status;    }    /**     * Set the status of the link.  Possible values are defined in LinkEvent.     * @param event the event that just happened to this link     */    public void setStatus (int event) {        status = event;    }        /**     * Get the priority of the link in the crawl.     */    public float getPriority () {        return priority;    }    /**     * Set the priority of the link in the crawl.     */    public void setPriority (float priority) {        this.priority = priority;    }        /**     * Get the download parameters used for this link.  Default is null.     */    public DownloadParameters getDownloadParameters  () {        return dp;    }    /**     * Set the download parameters used for this link.     */    public void setDownloadParameters (DownloadParameters dp) {        this.dp = dp;    }    /*     * Testing     *       public static void main (String[] args) throws Exception {    if (args[0].equals ("file"))      System.out.println (Link.FileToURL (new File (args[1])));    else if (args[0].equals ("url"))      System.out.println (Link.URLToFile (new URL (args[1])));  }     *     *     */}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -