
page.java

A web crawler program written in Java

Language: JAVA
Page 1 of 2
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import java.net.URL;
import java.net.URLConnection;
//#ifdef JDK1.1
import java.net.HttpURLConnection;
//#endif JDK1.1
import java.io.IOException;
import java.io.InputStream;
import websphinx.util.Str;

/**
 * A Web page.  Although a Page can represent any MIME type, it mainly
 * supports HTML pages, which are automatically parsed.  The parsing produces
 * a list of tags, a list of words, an HTML parse tree, and a list of links.
 */
public class Page extends Region {

    // Permanent content
    Link origin;
    long lastModified = 0;
    long expiration = 0;
    String contentType;
    String contentEncoding;
    int responseCode = -1;
    String responseMessage = null;
    URL base;
    String title;
    Link[] links;
    int contentLock;
        // If page was downloaded from Net, represents number of
        // callers who want to keep the content.
        // If page was created from a string, set to -1.

    // Discardable content (thrown away when contentLock falls to 0)
    String content;
    Region[] tokens;
    Text[] words;
    Tag[] tags;
    Element[] elements;
    Element root;
    String canonicalTags;

    /**
     * Make a Page by downloading and parsing a Link.
     * @param link Link to download
     */
    public Page (Link link) throws IOException {
        this (link, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param parser HTML parser to use
     */
    public Page (Link link, HTMLParser parser) throws IOException {
        super (null, 0, 0);
        source = this;
        origin = link;
        base = getURL ();
        download (parser);
        link.setPage (this);
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     */
    public Page (URL url, String html) {
        this (url, html, new HTMLParser ());
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     * @param parser HTML parser to use
     */
    public Page (URL url, String html, HTMLParser parser) {
        super (null, 0, html.length());
        source = this;
        base = url;
        this.content = html;
        contentLock = -1;
        parse (parser);
    }

    /**
     * Make a Page from a string of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param content HTML content of the page
     */
    public Page (String content) {
        super (null, 0, content.length());
        // FIX: don't think base==null will work
        source = this;
        this.content = content;
        contentLock = -1;
    }

    //
    // Downloading
    //

    // This code generates SecurityExceptions in Netscape 4.0,
    // and it doesn't seem to be necessary anyway: redirects are followed
    // by Netscape and JDK by default, despite the fact that the JDK
    // docs claim that setFollowRedirects() defaults to false.
    //static {
    //    try {
    //        HttpURLConnection.setFollowRedirects (true);
    //    } catch (Throwable t) { }
    //}

    /**
     * Download the page.  The downloaded page is parsed
     * if its MIME type is HTML or unspecified.
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public void download (HTMLParser parser) throws IOException {
        URLConnection conn =
            SecurityPolicy.getPolicy ().openConnection (origin);

        // fetch and store final redirected URL and response headers
        base = conn.getURL ();
        lastModified = conn.getLastModified ();
        expiration = conn.getExpiration ();
        contentType = conn.getContentType ();
        contentEncoding = conn.getContentEncoding ();

//#ifdef JDK1.1
        // get HTTP response codes
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpconn = (HttpURLConnection)conn;
            responseCode = httpconn.getResponseCode ();
            responseMessage = httpconn.getResponseMessage ();
            if (responseMessage == null)
                responseMessage = "unknown error";
            if (responseCode >= 300)
                // HTTP failure
                throw new IOException (responseCode + " " + responseMessage);
        }
//#endif JDK1.1

        //System.err.println ("Original URL: " + origin.getURL());
        //System.err.println ("Final URL: " + conn.getURL());

        // download and parse the response
        InputStream in = conn.getInputStream ();
        if (contentType == null
              || contentType.equals ("text/html")
              || contentType.equals ("content/unknown"))
            parser.parse (this, in);
        else
            parser.dontParse (this, in);
        in.close ();
        contentLock = 1;
    }

    void downloadSafely () {
        try {
            download (new HTMLParser ());
        } catch (Throwable e) {
        }
    }

    //
    // Parsing
    //

    /**
     * Parse the page.  Assumes the page has already been downloaded.
     * @param parser HTML parser to use
     */
    public void parse (HTMLParser parser) {
        if (!hasContent())
            downloadSafely ();
        try {
            parser.parse (this, content);
        } catch (IOException e) {
            throw new RuntimeException (e.toString());
        }
    }

    /**
     * Test whether page has been parsed.  Pages are parsed during
     * download only if their MIME type is HTML or unspecified.
     * @return true if page was parsed, false if not
     */
    public boolean isParsed () {
        return tokens != null;
    }

    /**
     * Test whether page is HTML.
     * @return true if page is HTML, false if not
     */
    public boolean isHTML () {
        return root != null;
    }

    private static final String GIF_CODE = "GIF8";
    private static final String JPG_CODE = "\377\330\377\340\0\020JFIF";

    /**
     * Test whether page is a GIF or JPEG image.
     * @return true if page is a GIF or JPEG image, false if not
     */
    public boolean isImage () {
        return content.startsWith (GIF_CODE) || content.startsWith (JPG_CODE);
    }

    //
    // Content management
    //

    /**
     * Lock the page's content (to prevent it from being discarded).
     * This method increments a lock counter, representing all the
     * callers interested in preserving the content.  The lock
     * counter is set to 1 when the page is initially downloaded.
     */
    public void keepContent () {
        if (contentLock > 0)
            ++contentLock;
    }

    /**
     * Unlock the page's content (allowing it to be garbage-collected, to
     * save space during a Web crawl).  This method decrements a lock counter.
     * If the counter falls to 0 (meaning no callers are interested in the
     * content), the content is released.  At least the following
     * fields are discarded: content, tokens, tags, words, elements, and
     * root.  After the content has been discarded, calling getContent()
     * (or getTokens(), getTags(), etc.) will force the page to be downloaded
     * again.  Hopefully the download will come from the cache, however.
     * <P> Links are not considered part of the content, and are not subject
     * to discarding by this method.  Also, if the page was created from a
     * string (rather than by downloading), its content is not subject to
     * discarding (since there would be no way to recover it).
     */
    public void discardContent () {
        if (contentLock == 0)    // already discarded
            return;

        if (--contentLock > 0)   // somebody else still has a lock on the content
            return;

        if (origin == null)
            return;     // without an origin, we'd have no way to recover this page

        //System.err.println ("discarding content of " + toDescription());
        content = null;
        tokens = null;
        tags = null;
        words = null;
        elements = null;
        root = null;
        canonicalTags = null;

        // keep links, but isolate them from the element tree
        if (links != null) {
            for (int i=0; i<links.length; ++i)
                if (links[i] instanceof Link)
                    ((Link)links[i]).discardContent ();
        }

        // FIX: debugging only: disconnect this page from its parent
        //origin.page = null;
        //origin = null;

        contentLock = 0;
    }

    /**
     * Test if page content is available.
     * @return true if content is downloaded and available, false if content
     * has not been downloaded or has been discarded
     */
    public final boolean hasContent () {
        return contentLock != 0;
    }

    //
    // Page accessors
    //

    /**
     * Get depth of page in crawl.
     * @return depth of page from root (depth of page is same as depth of its
     * originating link)
     */
    public int getDepth () {
        return origin != null ? origin.getDepth () : 0;
    }

    /**
     * Get the Link that points to this page.
     * @return the Link object that was used to download this page
     */
    public Link getOrigin () {
        return origin;
    }

    /**
     * Get the base URL, relative to which the page's links were interpreted.
     * The base URL defaults to the URL of the Link that was used to download
     * the page.  If any redirects occur while downloading the page, the final
     * location becomes the new base URL.  Lastly, if a &lt;BASE&gt; element is
     * found in the page, that becomes the new base URL.
     * @return the page's base URL
     */
    public URL getBase () {
        return base;
    }

    /**
     * Get the URL.
     * @return the URL of the link that was used to download this page
     */
    public URL getURL () {
        return origin != null ? origin.getURL() : null;
    }
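The listing shows two ways to construct a Page: from a Link, which downloads the page over the network, or from a URL plus an HTML string, which parses in place with no network access. Below is a minimal sketch of the second path, using only the API visible on this page; the demo class name PageDemo and the example URL are invented for illustration, and the expected outputs assume HTMLParser populates the token list and parse tree as the class comment describes.

import java.net.URL;
import websphinx.Page;

public class PageDemo {
    public static void main (String[] args) throws Exception {
        // Build a Page from a base URL and a literal HTML string.
        // The Page(URL, String) constructor parses immediately and sets
        // contentLock to -1, marking the content as non-discardable.
        URL base = new URL ("http://www.example.com/");
        Page page = new Page (base,
            "<html><head><title>Demo</title></head>"
            + "<body><a href=\"next.html\">next</a></body></html>");

        System.out.println ("parsed: " + page.isParsed ());  // expect true
        System.out.println ("html:   " + page.isHTML ());    // expect true
        System.out.println ("base:   " + page.getBase ());   // expect http://www.example.com/

        // discardContent() leaves the content in place here: the page has
        // no origin Link, so there would be no way to re-download it.
        page.discardContent ();
        System.out.println ("content kept: " + page.hasContent ()); // expect true
    }
}

For pages that were downloaded through a Link, contentLock behaves like a reference count: keepContent() increments it, each discardContent() decrements it, and when it reaches zero the parsed fields (content, tokens, tags, words, elements, root) are released while the extracted links survive for further crawling.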
