
📄 Page.java

📁 Crawler written in Java
💻 JAVA
📖 Page 1 of 2
/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import java.net.URL;
import java.net.URLConnection;
//#ifdef JDK1.1
import java.net.HttpURLConnection;
//#endif JDK1.1
import java.io.IOException;
import java.io.InputStream;
import rcm.util.Str;

/**
 * A Web page.  Although a Page can represent any MIME type, it mainly
 * supports HTML pages, which are automatically parsed.  The parsing produces
 * a list of tags, a list of words, an HTML parse tree, and a list of links.
 */
public class Page extends Region {

    // typical page length, to optimize downloads
    static final int TYPICAL_LENGTH = 20240;

    // Permanent content
    Link origin;
    long lastModified = 0;
    long expiration = 0;
    String contentType;
    String contentEncoding;
    int responseCode = -1;
    String responseMessage = null;
    URL base;
    String title;
    Link[] links;

    int contentLock;
        // If page was downloaded from Net, represents number of
        //    callers who want to keep the content.
        // If page was created from a string, set to -1.

    // Discardable content (thrown away when contentLock falls to 0)
    byte[] contentBytes;
    String content;
    Region[] tokens;
    Text[] words;
    Tag[] tags;
    Element[] elements;
    Element root;
    String canonicalTags;

    /**
     * Make a Page by downloading and parsing a Link.
     * @param link Link to download
     */
    public Page (Link link) throws IOException {
        this (link, DownloadParameters.NO_LIMITS, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     */
    public Page (Link link, DownloadParameters dp) throws IOException {
        this (link, dp, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     * @param parser HTML parser to use
     */
    public Page (Link link, DownloadParameters dp, HTMLParser parser) throws IOException {
        super (null, 0, 0);
        source = this;
        origin = link;
        base = getURL ();
        download (dp, parser);
        link.setPage (this);
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     */
    public Page (URL url, String html) {
        this (url, html, new HTMLParser ());
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     * @param parser HTML parser to use
     */
    public Page (URL url, String html, HTMLParser parser) {
        super (null, 0, html.length());
        source = this;
        base = url;
        this.content = html;
        this.contentBytes = html.getBytes ();
        contentLock = -1;
        parse (parser);
    }

    /**
     * Make a Page from a string of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
     * @param content HTML content of the page
     */
    public Page (String content) {
        super (null, 0, content.length());
        // FIX: don't think base==null will work
        source = this;
        this.content = content;
        this.contentBytes = content.getBytes ();
        contentLock = -1;
    }

    /**
     * Make a Page from a byte array of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
     * @param content byte content of the page
     */
    public Page (byte[] content) {
        super (null, 0, content.length);
        // FIX: don't think base==null will work
        source = this;
        this.contentBytes = new byte[content.length];
        System.arraycopy (content, 0, this.contentBytes, 0, content.length);
        this.content = new String (content);
        contentLock = -1;
    }

    //
    // Downloading
    //

    // This code generates SecurityExceptions in Netscape 4.0,
    // and it doesn't seem to be necessary anyway: redirects are followed
    // by Netscape and JDK by default, despite the fact that the JDK
    // docs claim that setFollowRedirects() defaults to false
    //static {
      //try {
      //  HttpURLConnection.setFollowRedirects (true);
      //} catch (Throwable t) { }
    //}

    /*
     * Download the page.  The downloaded page is parsed
     * if its MIME type is HTML or unspecified.
     * @param dp Download parameters to use
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public void download (DownloadParameters dp, HTMLParser parser) throws IOException {
        URLConnection conn =
            Access.getAccess ().openConnection (origin);

        // fetch and store final redirected URL and response headers
        InputStream in = conn.getInputStream ();
        base = conn.getURL ();
        lastModified = conn.getLastModified ();
        expiration = conn.getExpiration ();
        contentType = conn.getContentType ();
        contentEncoding = conn.getContentEncoding ();

//#ifdef JDK1.1
        // get HTTP response codes
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpconn = (HttpURLConnection)conn;
            responseCode = httpconn.getResponseCode ();
            responseMessage = httpconn.getResponseMessage ();
            if (responseMessage == null)
                responseMessage = "unknown error";
            if (responseCode >= 300)
                // HTTP failure
                throw new IOException (responseCode + " " + responseMessage);
        }
//#endif JDK1.1

//     System.err.println ("Original URL: " + origin.getURL());
//     System.err.println ("Final URL: " + conn.getURL());

        // download content
        int maxKB = dp.getMaxPageSize ();
        int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE;
        int expectedLength = conn.getContentLength ();
        if (expectedLength > maxBytes)
            throw new IOException ("Page greater than "
                                   + maxBytes + " bytes");
        if (expectedLength == -1)
            expectedLength = TYPICAL_LENGTH;
        byte[] buf = new byte[expectedLength];
        int n;
        int total = 0;
        while ((n = in.read (buf, total, buf.length - total)) != -1) {
            total += n;
            if (total > maxBytes)
                throw new IOException ("Page greater than "
                                       + maxBytes + " bytes");
            if (total == buf.length) {
                // try to read one more character
                int c = in.read ();
                if (c == -1)
                    break; // EOF, we're done
                else {
                    // need more space in array.  Double the array, but don't make
                    // it bigger than maxBytes.
                    byte[] newbuf = new byte[Math.min (buf.length * 2, maxBytes)];
                    System.arraycopy (buf, 0, newbuf, 0, buf.length);
                    buf = newbuf;
                    buf[total++] = (byte) c;
                }
            }
        }
        in.close ();

        if (total != buf.length) {
            // resize the array to be precisely total bytes long
            byte[] newbuf = new byte[total];
            System.arraycopy (buf, 0, newbuf, 0, total);
            buf = newbuf;
        }

        contentBytes = buf;
        content = new String (buf);
        start = 0;
        end = total;
        contentLock = 1;

        // parse the response
        if (contentType == null
            || contentType.startsWith ("text/html")
            || contentType.startsWith ("content/unknown"))
            parse (parser);
    }

    void downloadSafely () {
        try {
            download (new DownloadParameters (), new HTMLParser ());
        } catch (Throwable e) {
        }
    }

    //
    // Parsing
    //

    /**
     * Parse the page.  Assumes the page has already been downloaded.
     * @param parser HTML parser to use
     * @exception RuntimeException if an error occurs in parsing the page
     */
    public void parse (HTMLParser parser) {
        if (!hasContent())
            downloadSafely ();
        try {
            parser.parse (this);
        } catch (IOException e) {
            throw new RuntimeException (e.toString());
        }
    }

    /**
     * Test whether page has been parsed.  Pages are parsed during
     * download only if its MIME type is HTML or unspecified.
     * @return true if page was parsed, false if not
     */
    public boolean isParsed () {
        return tokens != null;
    }

    /**
     * Test whether page is HTML.
     * @return true if page is HTML.
     */
    public boolean isHTML () {
        return root != null;
    }

    /**
     * Test whether page is a GIF or JPEG image.
     * @return true if page is a GIF or JPEG image, false if not
     */
    public boolean isImage () {
        byte[] bytes = getContentBytes ();
        return startsWith (bytes, GIF_MAGIC) || startsWith (bytes, JPG_MAGIC);
    }

    private static final byte[] GIF_MAGIC = {
        (byte) 'G', (byte)'I', (byte)'F', (byte)'8'
    };
    private static final byte[] JPG_MAGIC = {
        (byte) 0377, (byte) 0330, (byte) 0377,
        (byte) 0340, (byte) 0, (byte) 020,
        (byte) 'J', (byte) 'F', (byte) 'I', (byte) 'F'
    };

    private boolean startsWith (byte[] bytes, byte[] prefix) {
        if (prefix.length > bytes.length)
            return false;
        for (int i = 0, n = prefix.length; i < n; ++i)
            if (bytes[i] != prefix[i])
                return false;
        return true;
    }

    //
    // Content management
    //

    /**
     * Lock the page's content (to prevent it from being discarded).
     * This method increments a lock counter, representing all the
     * callers interested in preserving the content.  The lock
     * counter is set to 1 when the page is initially downloaded.
     */
    public void keepContent () {
        if (contentLock > 0)
            ++contentLock;
    }

    /**
     * Unlock the page's content (allowing it to be garbage-collected, to
     * save space during a Web crawl).  This method decrements a lock counter.
     * If the counter falls to 0 (meaning no callers are interested in the
     * content), the content is released.  At least the following
     * fields are discarded: content, tokens, tags, words, elements, and
     * root.  After the content has been discarded, calling getContent()
     * (or getTokens(), getTags(), etc.) will force the page to be downloaded
     * again.  Hopefully the download will come from the cache, however.
     * <P> Links are not considered part of the content, and are not subject to
     * discarding by this method.  Also, if the page was created from a string
     * (rather than by downloading), its content is not subject to discarding
     * (since there would be no way to recover it).
     */
    public void discardContent () {
        if (contentLock == 0)    // already discarded
            return;
        if (--contentLock > 0)   // somebody else still has a lock on the content
            return;
        if (origin == null)
            return;     // without an origin, we'd have no way to recover this page

        //System.err.println ("discarding content of " + toDescription());
        contentBytes = null;
        content = null;
        tokens = null;
        tags = null;
        words = null;
        elements = null;
        root = null;
        canonicalTags = null;

        // keep links, but isolate them from the element tree
        if (links != null) {
            for (int i=0; i<links.length; ++i)
                if (links[i] instanceof Link)
                    ((Link)links[i]).discardContent ();
        }

        // FIX: debugging only: disconnect this page from its parent
        //origin.page = null;
        //origin = null;

        contentLock = 0;
    }

    // ... (listing continues on page 2)
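To make the lifecycle above concrete, here is a small, hypothetical usage sketch. It relies only on what this excerpt shows (the Page constructors, DownloadParameters.NO_LIMITS, isParsed(), isHTML(), keepContent(), discardContent()), plus one assumption that is not part of this listing: that Link offers a constructor taking a URL string. The URLs are placeholders and the network fetch is purely illustrative.

import java.io.IOException;
import java.net.URL;

import websphinx.DownloadParameters;
import websphinx.Link;
import websphinx.Page;

public class PageDemo {
    public static void main (String[] args) throws IOException {
        // 1. Build a Page from an in-memory HTML string; this constructor
        //    parses immediately and never touches the network.
        Page offline = new Page (new URL ("http://example.com/"),
                                 "<html><body><a href=\"a.html\">A</a></body></html>");
        System.out.println ("parsed: " + offline.isParsed ()
                            + ", html: " + offline.isHTML ());

        // 2. Download a live page through a Link (Link(String) is assumed,
        //    not shown in the excerpt above).
        Page live = new Page (new Link ("http://example.com/"),
                              DownloadParameters.NO_LIMITS);

        // 3. Content locking: download() sets the lock counter to 1,
        //    keepContent() increments it, discardContent() decrements it
        //    and frees the parse data once it reaches zero.
        live.keepContent ();     // counter: 1 -> 2
        live.discardContent ();  // counter: 2 -> 1, content kept
        live.discardContent ();  // counter: 1 -> 0, content released
    }
}

The lock-counter comments trace exactly the arithmetic performed by keepContent() and discardContent() in the listing; pages built from strings (contentLock == -1) are never discarded, since there is no origin link to re-download them from.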

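The incremental read loop inside download() is worth calling out on its own: it pre-sizes the buffer from Content-Length (falling back to TYPICAL_LENGTH when the header is missing), probes for EOF before doubling, and enforces the caller's size ceiling throughout. Below is a self-contained sketch of that idiom extracted into a hypothetical readBounded helper; the class and method names are illustrative and not part of WebSphinx, and the overflow check when the buffer is already at maxBytes is tightened slightly compared with the listing above.

import java.io.IOException;
import java.io.InputStream;

final class BoundedReader {
    /** Read an entire stream into a byte array, failing fast past maxBytes. */
    static byte[] readBounded (InputStream in, int expectedLength, int maxBytes)
            throws IOException {
        if (expectedLength > maxBytes)
            throw new IOException ("stream greater than " + maxBytes + " bytes");
        byte[] buf = new byte[expectedLength > 0 ? expectedLength : 8192];
        int total = 0;
        int n;
        while ((n = in.read (buf, total, buf.length - total)) != -1) {
            total += n;
            if (total > maxBytes)
                throw new IOException ("stream greater than " + maxBytes + " bytes");
            if (total == buf.length) {
                int c = in.read ();              // probe for EOF before growing
                if (c == -1)
                    break;
                if (buf.length >= maxBytes)      // one more byte would exceed the ceiling
                    throw new IOException ("stream greater than " + maxBytes + " bytes");
                byte[] bigger = new byte[Math.min (buf.length * 2, maxBytes)];
                System.arraycopy (buf, 0, bigger, 0, buf.length);
                buf = bigger;
                buf[total++] = (byte) c;
            }
        }
        if (total != buf.length) {               // trim to the exact size read
            byte[] exact = new byte[total];
            System.arraycopy (buf, 0, exact, 0, total);
            buf = exact;
        }
        return buf;
    }
}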