Page.java
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import java.net.URL;
import java.net.URLConnection;
//#ifdef JDK1.1
import java.net.HttpURLConnection;
//#endif JDK1.1
import java.io.IOException;
import java.io.InputStream;
import websphinx.util.Str;

/**
 * A Web page.  Although a Page can represent any MIME type, it mainly
 * supports HTML pages, which are automatically parsed.  The parsing produces
 * a list of tags, a list of words, an HTML parse tree, and a list of links.
 */
public class Page extends Region {

    // Permanent content
    Link origin;
    long lastModified = 0;
    long expiration = 0;
    String contentType;
    String contentEncoding;
    int responseCode = -1;
    String responseMessage = null;
    URL base;
    String title;
    Link[] links;

    int contentLock;
        // If page was downloaded from Net, represents number of
        // callers who want to keep the content.
        // If page was created from a string, set to -1.

    // Discardable content (thrown away when contentLock falls to 0)
    String content;
    Region[] tokens;
    Text[] words;
    Tag[] tags;
    Element[] elements;
    Element root;
    String canonicalTags;

    /**
     * Make a Page by downloading and parsing a Link.
     * @param link Link to download
     */
    public Page (Link link) throws IOException {
        this (link, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param parser HTML parser to use
     */
    public Page (Link link, HTMLParser parser) throws IOException {
        super (null, 0, 0);
        source = this;
        origin = link;
        base = getURL ();
        download (parser);
        link.setPage (this);
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     */
    public Page (URL url, String html) {
        this (url, html, new HTMLParser ());
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     * @param parser HTML parser to use
     */
    public Page (URL url, String html, HTMLParser parser) {
        super (null, 0, html.length());
        source = this;
        base = url;
        this.content = html;
        contentLock = -1;
        parse (parser);
    }

    /**
     * Make a Page from a string of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param content HTML content of the page
     */
    public Page (String content) {
        super (null, 0, content.length());
        // FIX: don't think base==null will work
        source = this;
        this.content = content;
        contentLock = -1;
    }

    //
    // Downloading
    //

    // This code generates SecurityExceptions in Netscape 4.0,
    // and it doesn't seem to be necessary anyway: redirects are followed
    // by Netscape and JDK by default, despite the fact that the JDK
    // docs claim that setFollowRedirects() defaults to false
    //static {
    //    try {
    //        HttpURLConnection.setFollowRedirects (true);
    //    } catch (Throwable t) { }
    //}
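    // --- Usage sketch (not part of the original source): the two main ways
    // to construct a Page, assuming Link offers a String constructor and the
    // URL is reachable.  A Page built from a string has no origin, so
    // getURL() and related accessors fail or return null, as documented in
    // the constructor comments above.  The example.com URL is illustrative.
    //
    //     Page fetched = new Page (new Link ("http://www.cs.cmu.edu/~rcm/websphinx/"));
    //
    //     URL base = new URL ("http://example.com/");
    //     Page inMemory = new Page (base, "<html><body>hello</body></html>");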
    /**
     * Download the page.  The downloaded page is parsed
     * if its MIME type is HTML or unspecified.
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public void download (HTMLParser parser) throws IOException {
        URLConnection conn =
            SecurityPolicy.getPolicy ().openConnection (origin);

        // fetch and store final redirected URL and response headers
        base = conn.getURL ();
        lastModified = conn.getLastModified ();
        expiration = conn.getExpiration ();
        contentType = conn.getContentType ();
        contentEncoding = conn.getContentEncoding ();

//#ifdef JDK1.1
        // get HTTP response codes
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpconn = (HttpURLConnection)conn;
            responseCode = httpconn.getResponseCode ();
            responseMessage = httpconn.getResponseMessage ();
            if (responseMessage == null)
                responseMessage = "unknown error";
            if (responseCode >= 300)
                // HTTP failure
                throw new IOException (responseCode + " " + responseMessage);
        }
//#endif JDK1.1

        //System.err.println ("Original URL: " + origin.getURL());
        //System.err.println ("Final URL: " + conn.getURL());

        // download and parse the response
        InputStream in = conn.getInputStream ();
        if (contentType == null
              || contentType.equals ("text/html")
              || contentType.equals ("content/unknown"))
            parser.parse (this, in);
        else
            parser.dontParse (this, in);
        in.close ();

        contentLock = 1;
    }

    void downloadSafely () {
        try {
            download (new HTMLParser ());
        } catch (Throwable e) { }
    }

    //
    // Parsing
    //

    /**
     * Parse the page.  Assumes the page has already been downloaded.
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public void parse (HTMLParser parser) {
        if (!hasContent())
            downloadSafely ();
        try {
            parser.parse (this, content);
        } catch (IOException e) {
            throw new RuntimeException (e.toString());
        }
    }

    /**
     * Test whether page has been parsed.  Pages are parsed during
     * download only if their MIME type is HTML or unspecified.
     * @return true if page was parsed, false if not
     */
    public boolean isParsed () {
        return tokens != null;
    }

    /**
     * Test whether page is HTML.
     * @return true if page is HTML, false if not
     */
    public boolean isHTML () {
        return root != null;
    }

    private static final String GIF_CODE = "GIF8";
    private static final String JPG_CODE = "\377\330\377\340\0\020JFIF";

    /**
     * Test whether page is a GIF or JPEG image.
     * @return true if page is a GIF or JPEG image, false if not
     */
    public boolean isImage () {
        return content.startsWith (GIF_CODE) || content.startsWith (JPG_CODE);
    }

    //
    // Content management
    //

    /**
     * Lock the page's content (to prevent it from being discarded).
     * This method increments a lock counter, representing all the
     * callers interested in preserving the content.  The lock
     * counter is set to 1 when the page is initially downloaded.
     */
    public void keepContent () {
        if (contentLock > 0)
            ++contentLock;
    }
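    // --- Usage sketch (not part of the original source): the lock-counter
    // protocol.  The download itself holds the initial lock of 1; a caller
    // that needs the content to outlive other callers' discardContent()
    // calls brackets its use with keepContent()/discardContent().  Pages
    // built from a string (contentLock == -1) are never discarded.
    //
    //     page.keepContent ();                 // counter: 1 -> 2
    //     String html = page.getContent ();    // safe to use meanwhile
    //     page.discardContent ();              // counter: 2 -> 1, content kept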
    /**
     * Unlock the page's content (allowing it to be garbage-collected, to
     * save space during a Web crawl).  This method decrements a lock counter.
     * If the counter falls to 0 (meaning no callers are interested in the
     * content), the content is released.  At least the following fields are
     * discarded: content, tokens, tags, words, elements, and root.  After the
     * content has been discarded, calling getContent() (or getTokens(),
     * getTags(), etc.) will force the page to be downloaded again.  Hopefully
     * the download will come from the cache, however.
     * <P> Links are not considered part of the content, and are not subject
     * to discarding by this method.  Also, if the page was created from a
     * string (rather than by downloading), its content is not subject to
     * discarding (since there would be no way to recover it).
     */
    public void discardContent () {
        if (contentLock == 0)       // already discarded
            return;
        if (--contentLock > 0)      // somebody else still has a lock on the content
            return;
        if (origin == null)
            return;     // without an origin, we'd have no way to recover this page

        //System.err.println ("discarding content of " + toDescription());
        content = null;
        tokens = null;
        tags = null;
        words = null;
        elements = null;
        root = null;
        canonicalTags = null;

        // keep links, but isolate them from the element tree
        if (links != null) {
            for (int i=0; i<links.length; ++i)
                if (links[i] instanceof Link)
                    ((Link)links[i]).discardContent ();
        }

        // FIX: debugging only: disconnect this page from its parent
        //origin.page = null;
        //origin = null;

        contentLock = 0;
    }

    /**
     * Test if page content is available.
     * @return true if content is downloaded and available, false if content
     * has not been downloaded or has been discarded.
     */
    public final boolean hasContent () {
        return contentLock != 0;
    }

    //
    // Page accessors
    //

    /**
     * Get depth of page in crawl.
     * @return depth of page from root (depth of page is same as depth of
     * its originating link)
     */
    public int getDepth () {
        return origin != null ? origin.getDepth () : 0;
    }

    /**
     * Get the Link that points to this page.
     * @return the Link object that was used to download this page.
     */
    public Link getOrigin () {
        return origin;
    }

    /**
     * Get the base URL, relative to which the page's links were interpreted.
     * The base URL defaults to the URL of the Link that was used to download
     * the page.  If any redirects occur while downloading the page, the
     * final location becomes the new base URL.  Lastly, if a &lt;BASE&gt;
     * element is found in the page, that becomes the new base URL.
     * @return the page's base URL.
     */
    public URL getBase () {
        return base;
    }

    /**
     * Get the URL.
     * @return the URL of the link that was used to download this page
     */
    public URL getURL () {
        return origin != null ? origin.getURL() : null;
    }
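    // --- Usage sketch (not part of the original source): inspecting a
    // downloaded page through the accessors above.  The values noted in the
    // trailing comments are illustrative, not guaranteed.
    //
    //     Page page = new Page (link);
    //     URL where = page.getURL ();    // the originating link's URL
    //     URL base  = page.getBase ();   // final URL after redirects/<BASE>
    //     int depth = page.getDepth ();  // same as the originating link's depth
    //     page.discardContent ();        // drop content; Link objects survive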