📄 Page.java
/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package websphinx;

import java.net.URL;
import java.net.URLConnection;
//#ifdef JDK1.1
import java.net.HttpURLConnection;
//#endif JDK1.1
import java.io.IOException;
import java.io.InputStream;
import rcm.util.Str;

/**
 * A Web page.  Although a Page can represent any MIME type, it mainly
 * supports HTML pages, which are automatically parsed.  The parsing produces
 * a list of tags, a list of words, an HTML parse tree, and a list of links.
 */
public class Page extends Region {

    // typical page length, to optimize downloads
    static final int TYPICAL_LENGTH = 20240;

    // Permanent content
    Link origin;
    long lastModified = 0;
    long expiration = 0;
    String contentType;
    String contentEncoding;
    int responseCode = -1;
    String responseMessage = null;
    URL base;
    String title;
    Link[] links;

    int contentLock;
        // If page was downloaded from the Net, represents the number of
        // callers who want to keep the content.
        // If page was created from a string, set to -1.

    // Discardable content (thrown away when contentLock falls to 0)
    byte[] contentBytes;
    String content;
    Region[] tokens;
    Text[] words;
    Tag[] tags;
    Element[] elements;
    Element root;
    String canonicalTags;

    /**
     * Make a Page by downloading and parsing a Link.
     * @param link Link to download
     */
    public Page (Link link) throws IOException {
        this (link, DownloadParameters.NO_LIMITS, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     */
    public Page (Link link, DownloadParameters dp) throws IOException {
        this (link, dp, new HTMLParser ());
    }

    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     * @param parser HTML parser to use
     */
    public Page (Link link, DownloadParameters dp, HTMLParser parser)
            throws IOException {
        super (null, 0, 0);
        source = this;
        origin = link;
        base = getURL ();
        download (dp, parser);
        link.setPage (this);
    }
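
    // A minimal usage sketch for the downloading constructors above.  It
    // assumes websphinx.Link can be constructed from a URL string (as
    // elsewhere in the toolkit); the URL itself is a hypothetical example:
    //
    //     Link link = new Link ("http://www.example.com/");  // may throw MalformedURLException
    //     Page page = new Page (link);       // downloads, and parses if HTML
    //     if (page.isHTML ())
    //         System.out.println ("parsed " + page.getURL ());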
    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     */
    public Page (URL url, String html) {
        this (url, html, new HTMLParser ());
    }

    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     * @param parser HTML parser to use
     */
    public Page (URL url, String html, HTMLParser parser) {
        super (null, 0, html.length());
        source = this;
        base = url;
        this.content = html;
        this.contentBytes = html.getBytes ();
        contentLock = -1;
        parse (parser);
    }

    /**
     * Make a Page from a string of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param content content of the page
     */
    public Page (String content) {
        super (null, 0, content.length());
        // FIX: don't think base==null will work
        source = this;
        this.content = content;
        this.contentBytes = content.getBytes ();
        contentLock = -1;
    }

    /**
     * Make a Page from a byte array of content.  The content is not parsed.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param content byte content of the page
     */
    public Page (byte[] content) {
        super (null, 0, content.length);
        // FIX: don't think base==null will work
        source = this;
        this.contentBytes = new byte[content.length];
        System.arraycopy (content, 0, this.contentBytes, 0, content.length);
        this.content = new String (content);
        contentLock = -1;
    }
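
    // A sketch of the in-memory constructors above, useful for testing a
    // crawler without network access.  The sample URL and HTML string are
    // hypothetical; note that contentLock == -1 marks the content as
    // non-discardable, since there is no origin link to re-download from:
    //
    //     URL base = new URL ("http://www.example.com/");  // may throw MalformedURLException
    //     Page page = new Page (base, "<html><body><a href=\"a.html\">a</a></body></html>");
    //     // page is parsed immediately; discardContent() will never free it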
    //
    // Downloading
    //

    // This code generates SecurityExceptions in Netscape 4.0,
    // and it doesn't seem to be necessary anyway: redirects are followed
    // by Netscape and JDK by default, despite the fact that the JDK
    // docs claim that setFollowRedirects() defaults to false.
    //static {
    //    try {
    //        HttpURLConnection.setFollowRedirects (true);
    //    } catch (Throwable t) { }
    //}

    /**
     * Download the page.  The downloaded page is parsed
     * if its MIME type is HTML or unspecified.
     * @param dp download parameters to use
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public void download (DownloadParameters dp, HTMLParser parser)
            throws IOException {
        URLConnection conn = Access.getAccess ().openConnection (origin);

        // fetch and store final redirected URL and response headers
        InputStream in = conn.getInputStream ();
        base = conn.getURL ();
        lastModified = conn.getLastModified ();
        expiration = conn.getExpiration ();
        contentType = conn.getContentType ();
        contentEncoding = conn.getContentEncoding ();

//#ifdef JDK1.1
        // get HTTP response codes
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpconn = (HttpURLConnection)conn;
            responseCode = httpconn.getResponseCode ();
            responseMessage = httpconn.getResponseMessage ();
            if (responseMessage == null)
                responseMessage = "unknown error";
            if (responseCode >= 300)
                // HTTP failure
                throw new IOException (responseCode + " " + responseMessage);
        }
//#endif JDK1.1

//      System.err.println ("Original URL: " + origin.getURL());
//      System.err.println ("Final URL: " + conn.getURL());

        // download content, enforcing the maximum page size
        int maxKB = dp.getMaxPageSize ();
        int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE;
        int expectedLength = conn.getContentLength ();
        if (expectedLength > maxBytes)
            throw new IOException ("Page greater than " + maxBytes + " bytes");
        if (expectedLength == -1)
            expectedLength = TYPICAL_LENGTH;
        byte[] buf = new byte[expectedLength];
        int n;
        int total = 0;
        while ((n = in.read (buf, total, buf.length - total)) != -1) {
            total += n;
            if (total > maxBytes)
                throw new IOException ("Page greater than " + maxBytes + " bytes");
            if (total == buf.length) {
                // try to read one more character
                int c = in.read ();
                if (c == -1)
                    break;  // EOF, we're done
                else {
                    // need more space in array.  Double the array, but don't
                    // make it bigger than maxBytes.
                    byte[] newbuf = new byte[Math.min (buf.length * 2, maxBytes)];
                    System.arraycopy (buf, 0, newbuf, 0, buf.length);
                    buf = newbuf;
                    buf[total++] = (byte) c;
                }
            }
        }
        in.close ();
        if (total != buf.length) {
            // resize the array to be precisely total bytes long
            byte[] newbuf = new byte[total];
            System.arraycopy (buf, 0, newbuf, 0, total);
            buf = newbuf;
        }
        contentBytes = buf;
        content = new String (buf);
        start = 0;
        end = total;
        contentLock = 1;

        // parse the response
        if (contentType == null
                || contentType.startsWith ("text/html")
                || contentType.startsWith ("content/unknown"))
            parse (parser);
    }

    void downloadSafely () {
        try {
            download (new DownloadParameters (), new HTMLParser ());
        } catch (Throwable e) {
            // ignore errors; the page simply remains without content
        }
    }

    //
    // Parsing
    //

    /**
     * Parse the page.  If the page has no content, it is downloaded first.
     * @param parser HTML parser to use
     * @exception RuntimeException if an error occurs in downloading the page
     */
    public void parse (HTMLParser parser) {
        if (!hasContent())
            downloadSafely ();
        try {
            parser.parse (this);
        } catch (IOException e) {
            throw new RuntimeException (e.toString());
        }
    }

    /**
     * Test whether the page has been parsed.  Pages are parsed during
     * download only if their MIME type is HTML or unspecified.
     * @return true if page was parsed, false if not
     */
    public boolean isParsed () {
        return tokens != null;
    }

    /**
     * Test whether the page is HTML.
     * @return true if page is HTML.
     */
    public boolean isHTML () {
        return root != null;
    }

    /**
     * Test whether the page is a GIF or JPEG image.
     * @return true if page is a GIF or JPEG image, false if not
     */
    public boolean isImage () {
        byte[] bytes = getContentBytes ();
        return startsWith (bytes, GIF_MAGIC) || startsWith (bytes, JPG_MAGIC);
    }

    // "GIF8" -- the prefix shared by GIF87a and GIF89a headers
    private static final byte[] GIF_MAGIC = {
        (byte)'G', (byte)'I', (byte)'F', (byte)'8'
    };
    // FF D8 FF E0 00 10 "JFIF" -- a JPEG/JFIF header (octal literals)
    private static final byte[] JPG_MAGIC = {
        (byte)0377, (byte)0330, (byte)0377, (byte)0340,
        (byte)0, (byte)020, (byte)'J', (byte)'F', (byte)'I', (byte)'F'
    };

    private boolean startsWith (byte[] bytes, byte[] prefix) {
        if (prefix.length > bytes.length)
            return false;
        for (int i = 0, n = prefix.length; i < n; ++i)
            if (bytes[i] != prefix[i])
                return false;
        return true;
    }
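
    // A sketch of how the predicates above combine after a download; the
    // URL is a hypothetical example, and Link(String) is assumed from the
    // rest of the toolkit:
    //
    //     Page p = new Page (new Link ("http://www.example.com/logo.gif"));
    //     if (p.isImage ())
    //         System.out.println ("GIF/JPEG, " + p.getContentBytes ().length + " bytes");
    //     else if (p.isHTML ())
    //         System.out.println ("HTML page, parsed during download");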
    //
    // Content management
    //

    /**
     * Lock the page's content (to prevent it from being discarded).
     * This method increments a lock counter, representing all the
     * callers interested in preserving the content.  The lock
     * counter is set to 1 when the page is initially downloaded.
     */
    public void keepContent () {
        if (contentLock > 0)
            ++contentLock;
    }

    /**
     * Unlock the page's content (allowing it to be garbage-collected, to
     * save space during a Web crawl).  This method decrements a lock counter.
     * If the counter falls to 0 (meaning no callers are interested in the
     * content), the content is released.  At least the following fields are
     * discarded: content, tokens, tags, words, elements, and root.
     * After the content has been discarded, calling getContent()
     * (or getTokens(), getTags(), etc.) will force the page to be downloaded
     * again.  Hopefully the download will come from the cache, however.
     * <P> Links are not considered part of the content, and are not subject
     * to discarding by this method.  Also, if the page was created from a
     * string (rather than by downloading), its content is not subject to
     * discarding (since there would be no way to recover it).
     */
    public void discardContent () {
        if (contentLock == 0)       // already discarded
            return;
        if (--contentLock > 0)      // somebody else still has a lock on the content
            return;
        if (origin == null)
            return;  // without an origin, we'd have no way to recover this page

        //System.err.println ("discarding content of " + toDescription());
        contentBytes = null;
        content = null;
        tokens = null;
        tags = null;
        words = null;
        elements = null;
        root = null;
        canonicalTags = null;

        // keep links, but isolate them from the element tree
        if (links != null) {
            for (int i = 0; i < links.length; ++i)
                if (links[i] instanceof Link)
                    ((Link)links[i]).discardContent ();
        }

        // FIX: debugging only: disconnect this page from its parent
        //origin.page = null;
        //origin = null;

        contentLock = 0;
    }
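
    // A sketch of the reference-counting discipline above, for a
    // hypothetical caller that wants to retain a downloaded page's content
    // while the crawler releases its own lock:
    //
    //     page.keepContent ();     // lock count: 2 (the download holds 1)
    //     // ... crawler finishes with the page ...
    //     page.discardContent ();  // crawler's lock released; count: 1
    //     page.discardContent ();  // our lock released; count 0, content freed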