📄 page.java
字号:
}

    /**
     * Get the title of the page.
     * @return the page's title, or null if the page hasn't been parsed.
     */
    public String getTitle () {
        return title;
    }

    /**
     * Get the content of the page.  If the page has no content yet,
     * a download is attempted first (via downloadSafely).
     * @return the content of the page as a string, or null if the
     * content is not available.
     */
    public String getContent () {
        if (!hasContent ())
            downloadSafely ();
        return content;
    }

    /**
     * Get the token sequence of the page.  Tokens are tags and
     * whitespace-delimited text.  If the page has no content yet,
     * a download is attempted first.
     * @return token regions in the page, or null if the page hasn't been
     * downloaded or parsed.
     */
    public Region[] getTokens () {
        if (!hasContent ())
            downloadSafely ();
        return tokens;
    }

    /**
     * Get the tag sequence of the page.  If the page has no content yet,
     * a download is attempted first.
     * @return tags in the page, or null if the page hasn't been
     * downloaded or parsed.
     */
    public Tag[] getTags () {
        if (!hasContent ())
            downloadSafely ();
        return tags;
    }

    /**
     * Get the words in the page.  Words are whitespace- and tag-delimited
     * text.  If the page has no content yet, a download is attempted first.
     * @return words in the page, or null if the page hasn't been
     * downloaded or parsed.
     */
    public Text[] getWords () {
        if (!hasContent ())
            downloadSafely ();
        return words;
    }

    /**
     * Get the HTML elements in the page.  All elements in the page
     * are included in the list, in the order they would appear in
     * an inorder traversal of the HTML parse tree.  If the page has no
     * content yet, a download is attempted first.
     * @return HTML elements in the page ordered by inorder, or null if the
     * page hasn't been downloaded or parsed.
     */
    public Element[] getElements () {
        if (!hasContent ())
            downloadSafely ();
        return elements;
    }

    /**
     * Get the root HTML element of the page.  If the page has no content
     * yet, a download is attempted first.
     * @return first top-level HTML element in the page, or null
     * if the page hasn't been downloaded or parsed.
     */
    public Element getRootElement () {
        if (!hasContent ())
            downloadSafely ();
        return root;
    }

    /**
     * Get the links found in the page.  Unlike the other accessors,
     * this method never triggers a download; it only returns the
     * current value of the links field.
     * @return links in the page, or null
     * if the page hasn't been downloaded or parsed.
     */
    public Link[] getLinks () {
        return links;
    }

    /**
     * Get the URL of the page's origin as a String.
     * @return the origin's URL represented as a string, or null if the
     * page has no origin.
     */
    public String toURL () {
        return origin != null ? origin.toURL () : null;
    }

    /**
     * Generate a human-readable description of the page.
     * @return a description of the page, in the form "title [url]".
     * The title and the space after it are omitted when the title is
     * null or empty.
     */
    public String toDescription () {
        return (title != null && title.length () > 0 ? title + " " : "")
            + "[" + getURL () + "]";
    }

    /**
     * Get the string representation of the page, which is its content.
     * Because this delegates to getContent, it may trigger a download.
     * @return the content of the page (see getContent)
     */
    public String toString () {
        return getContent ();
    }

    /**
     * Get last-modified date of page.
     * @return the date when the page was last modified, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT
     */
    public long getLastModified () {
        return lastModified;
    }

    /**
     * Set last-modified date of page.
     * @param last the date when the page was last modified, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT
     */
    public void setLastModified (long last) {
        lastModified = last;
    }

    /**
     * Get expiration date of page.
     * @return the expiration date of the page, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT.
     */
    public long getExpiration () {
        return expiration;
    }

    /**
     * Set expiration date of page.
     * @param expire the expiration date of the page, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT.
     */
    public void setExpiration (long expire) {
        expiration = expire;
    }

    /**
     * Get MIME type of page.
     * @return the MIME type of page, such as "text/html", or null if not known.
     */
    public String getContentType () {
        return contentType;
    }

    /**
     * Set MIME type of page.
     * @param type the MIME type of page, such as "text/html", or null if not known.
     */
    public void setContentType (String type) {
        contentType = type;
    }

    /**
     * Get content encoding of page.
     * @return the encoding type of page, such as "base-64", or null if not known.
     */
    public String getContentEncoding () {
        return contentEncoding;
    }

    /**
     * Set content encoding of page.
     * @param encoding the encoding type of page, such as "base-64", or null if not known.
     */
    public void setContentEncoding (String encoding) {
        contentEncoding = encoding;
    }

    /**
     * Get response code returned by the Web server.  For list of
     * possible values, see java.net.HttpURLConnection.
     * @return response code, such as 200 (for OK) or 404 (not found).
     * Code is -1 if unknown.
     * @see java.net.HttpURLConnection
     */
    public int getResponseCode () {
        return responseCode;
    }

    /**
     * Get response message returned by the Web server.
     * @return response message, such as "OK" or "Not Found".  The response
     * message is null if the page failed to be fetched or not known.
     */
    public String getResponseMessage () {
        return responseMessage;
    }

    /**
     * Get raw content found in a region.  This method does not trigger
     * a download; it reads the content field directly.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return raw HTML contained in the region
     */
    public String substringContent (int start, int end) {
        return content.substring (start, end);
    }

    /**
     * Get HTML found in a region.  If the page is HTML, the raw content
     * of the region is returned unchanged.  Otherwise the ampersand,
     * less-than, and greater-than characters in the region are replaced
     * by their HTML character entities and the result is wrapped in a
     * PRE element, so that it displays literally when rendered as HTML.
     * This method does not trigger a download; it reads the content field
     * directly.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return representation of region as HTML
     */
    public String substringHTML (int start, int end) {
        String s = content.substring (start, end);
        if (!isHTML ()) {
            // Escape the ampersand first: the later replacements introduce
            // new ampersands that must not be escaped a second time.
            s = Str.replace (s, "&", "&amp;");
            s = Str.replace (s, "<", "&lt;");
            s = Str.replace (s, ">", "&gt;");
            s = "<PRE>" + s + "</PRE>";
        }
        return s;
    }

    /**
     * Get tagless text found in a region.
     * Runs of whitespace and tags are reduced to a single space character.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return tagless text contained in the region, or the empty string
     * if the page has not been parsed
     */
    public String substringText (int start, int end) {
        if (words == null)
            return ""; // page is not parsed

        // FIX: find some other mapping
        StringBuffer buf = new StringBuffer ();
        for (int j = findStart (words, start); j < words.length; ++j) {
            // stop at the first word that extends past the end of the region
            if (words[j].end > end)
                break;
            else {
                if (buf.length () > 0)
                    buf.append (' ');
                buf.append (words[j].text);
            }
        }
        return buf.toString ();
    }

    /**
     * Get HTML tags found in a region.  Whitespace and text among the
     * tags are deleted; the tags are joined by single spaces.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return tags contained in the region, or the empty string
     * if the page has not been parsed
     */
    public String substringTags (int start, int end) {
        if (tags == null)
            return ""; // page is not parsed

        // FIX: find some other mapping
        StringBuffer buf = new StringBuffer ();
        for (int j = findStart (tags, start); j < tags.length; ++j) {
            // stop at the first tag that extends past the end of the region
            if (tags[j].end > end)
                break;
            else {
                if (buf.length () > 0)
                    buf.append (' ');
                buf.append (content.substring (tags[j].start, tags[j].end));
            }
        }
        return buf.toString ();
    }

    /**
     * Get canonicalized HTML tags found in a region.
     * A canonicalized tag looks like the following:
     * <PRE>
     * &lt;tagname#index attr=value attr=value attr=value ...&gt;
     * </PRE>
     * where tagname and attr are all lowercase, index is the tag's
     * index in the page's tokens array.  Attributes are sorted in
     * increasing order by attribute name.  Attributes without values
     * omit the entire "=value" portion.  Values are delimited by a
     * space.  All occurrences of &lt;, &gt;, space, and % characters
     * in a value are URL-encoded (e.g., space is converted to %20).
     * Thus the only occurrences of these characters in the canonical
     * tag are the tag delimiters.
     *
     * <P>For example, raw HTML that looks like:
     * <PRE>
     * &lt;IMG SRC="http://foo.com/map&lt;&gt;.gif" ISMAP&gt;Image&lt;/IMG&gt;
     * </PRE>
     * would be canonicalized to:
     * <PRE>
     * &lt;img ismap src=http://foo.com/map%3C%3E.gif&gt;&lt;/img&gt;
     * </PRE>
     * <P>
     * Comment and declaration tags (whose tag name is !) are omitted
     * from the canonicalization.
     *
     * <P>When the requested region is exactly this page's own
     * start..end range, the result is cached in the canonicalTags field
     * and returned directly on later calls.
     *
     * @param start starting offset of region
     * @param end ending offset of region
     * @return canonicalized tags contained in the region, or the empty
     * string if the page has not been parsed
     */
    public String substringCanonicalTags (int start, int end) {
        if (tokens == null)
            return ""; // page is not parsed

        // the whole-page result is the only one that is cached
        boolean all = (start == this.start && end == this.end);
        if (all && canonicalTags != null)
            return canonicalTags;

        // FIX: find some other mapping
        StringBuffer buf = new StringBuffer ();
        for (int j = findStart (tokens, start); j < tokens.length; ++j) {
            if (tokens[j].end > end)
                break;
            else if (tokens[j] instanceof Tag)
                // j is passed along as the tag's index in the tokens array
                Tagexp.canonicalizeTag (buf, (Tag)tokens[j], j);
        }

        String result = buf.toString ();
        if (all)
            canonicalTags = result;
        return result;
    }

    /**
     * Command-line driver: prints the content of each URL argument to
     * standard output.  The switches "-post" and "-get" set the request
     * method for the arguments that follow them; every other argument is
     * treated as a URL.  An IOException raised while handling a URL is
     * printed to standard output and processing continues with the next
     * argument.
     * @param args switches and URLs, processed in order
     * @throws Exception if anything other than an IOException is raised
     * while processing an argument
     */
    public static void main (String[] args) throws Exception {
        int method = Link.GET;

        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals ("-post"))
                method = Link.POST;
            else if (args[i].equals ("-get"))
                method = Link.GET;
            else {
                // NOTE: both branches currently build the same Link, so
                // "-post" is parsed but has no effect yet.
                Link link = method == Link.GET
                    ? new Link (args[i])
                    : new Link (args[i]); // FIX: POST?
                try {
                    System.out.print (new Page (link).getContent ());
                } catch (IOException e) {
                    System.out.println (e);
                }
            }
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -