📄 page.java

📁 一个用java语言编写的网络爬虫程序
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
    }

    /**
     * Get the title of the page.
     * @return the stored title; null if the page hasn't been parsed.
     */
    public String getTitle () {
        return this.title;
    }

    /**
     * Get the content of the page.  When hasContent() reports false,
     * downloadSafely() is invoked first.
     * @return the page content as a string.
     * NOTE(review): presumably still null when the download did not
     * succeed -- confirm against downloadSafely().
     */
    public String getContent () {
        if (!hasContent ()) {
            downloadSafely ();
        }
        return this.content;
    }

    /**
     * Get the token sequence of the page.  Tokens are tags and
     * whitespace-delimited text.  When hasContent() reports false,
     * downloadSafely() is invoked first.
     * @return token regions in the page, or null if the page hasn't
     * been downloaded or parsed.
     */
    public Region[] getTokens() {
        if (!hasContent ()) {
            downloadSafely ();
        }
        return this.tokens;
    }

    /**
     * Get the tag sequence of the page.  When hasContent() reports
     * false, downloadSafely() is invoked first.
     * @return tags in the page, or null if the page hasn't been
     * downloaded or parsed.
     */
    public Tag[] getTags () {
        if (!hasContent ()) {
            downloadSafely ();
        }
        return this.tags;
    }

    /**
     * Get the words in the page.  Words are whitespace- and
     * tag-delimited text.  When hasContent() reports false,
     * downloadSafely() is invoked first.
     * @return words in the page, or null if the page hasn't been
     * downloaded or parsed.
     */
    public Text[] getWords () {
        if (!hasContent ()) {
            downloadSafely ();
        }
        return this.words;
    }

    /**
     * Get the HTML elements in the page.  All elements in the page
     * are included in the list, in the order they would appear in
     * an inorder traversal of the HTML parse tree.  When hasContent()
     * reports false, downloadSafely() is invoked first.
     * @return HTML elements in the page ordered by inorder, or null
     * if the page hasn't been downloaded or parsed.
     */
    public Element[] getElements () {
        if (!hasContent ()) {
            downloadSafely ();
        }
        return this.elements;
    }

    /**
     * Get the root HTML element of the page.
     * @return first top-level HTML element in the page, or null
     * if the page hasn't been downloaded or parsed.
*/
    public Element getRootElement () {
        if (!hasContent ()) {
            // Content is fetched lazily on first access.
            downloadSafely ();
        }
        return this.root;
    }

    /**
     * Get the links found in the page.  The stored array is returned
     * as-is; no download is attempted by this method.
     * @return links in the page, or null
     * if the page hasn't been downloaded or parsed.
     */
    public Link[] getLinks() {
        return this.links;
    }

    /**
     * Get the URL of the page's origin link as a string.
     * @return the result of origin.toURL(), or null when the page
     * has no origin
     */
    public String toURL () {
        if (origin == null) {
            return null;
        }
        return origin.toURL ();
    }

    /**
     * Generate a human-readable description of the page.
     * @return a description of the page, in the form "title [url]".
     * The title and the space after it are left out when the title
     * is null or empty.
     */
    public String toDescription () {
        String prefix = "";
        if (title != null && title.length() > 0) {
            prefix = title + " ";
        }
        return prefix + "[" + getURL() + "]";
    }

    /**
     * Get the page as a string.  Delegates to getContent(), so the
     * same lazy download applies.
     * @return the content of the page
     */
    public String toString () {
        return getContent ();
    }

    /**
     * Get last-modified date of page.
     * @return the date when the page was last modified, or 0 if not known.
     * The value is documented as number of seconds since January 1, 1970 GMT.
     * NOTE(review): java.net.URLConnection reports this in milliseconds --
     * confirm which unit the code that sets this field uses.
     */
    public long getLastModified () {
        return this.lastModified;
    }

    /**
     * Set last-modified date of page.
     * @param last the date when the page was last modified, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT
     */
    public void setLastModified (long last) {
        this.lastModified = last;
    }

    /**
     * Get expiration date of page.
     * @return the expiration date of the page, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT.
     */
    public long getExpiration () {
        return this.expiration;
    }

    /**
     * Set expiration date of page.
     * @param expire the expiration date of the page, or 0 if not known.
     * The value is number of seconds since January 1, 1970 GMT.
*/
    public void setExpiration (long expire) {
        this.expiration = expire;
    }

    /**
     * Get MIME type of page.
     * @return the MIME type of page, such as "text/html", or null if not known.
     */
    public String getContentType () {
        return this.contentType;
    }

    /**
     * Set MIME type of page.
     * @param type the MIME type of page, such as "text/html", or null if not known.
     */
    public void setContentType (String type) {
        this.contentType = type;
    }

    /**
     * Get content encoding of page.
     * @return the encoding type of page, such as "base-64", or null if not known.
     */
    public String getContentEncoding () {
        return this.contentEncoding;
    }

    /**
     * Set content encoding of page.
     * @param encoding the encoding type of page, such as "base-64", or null if not known.
     */
    public void setContentEncoding (String encoding) {
        this.contentEncoding = encoding;
    }

    /**
     * Get response code returned by the Web server.  For list of
     * possible values, see java.net.HttpURLConnection.
     * @return response code, such as 200 (for OK) or 404 (not found).
     * Code is -1 if unknown.
     * @see java.net.HttpURLConnection
     */
    public int getResponseCode () {
        return this.responseCode;
    }

    /**
     * Get response message returned by the Web server.
     * @return response message, such as "OK" or "Not Found".  The
     * response message is null if the page failed to be fetched or
     * not known.
     */
    public String getResponseMessage () {
        return this.responseMessage;
    }

    /**
     * Get raw content found in a region.  The content field is used
     * directly: no download is triggered here and no null check is
     * made, so the content must already be present.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return raw HTML contained in the region
     */
    public String substringContent (int start, int end) {
        String raw = this.content;
        return raw.substring (start, end);
    }

    /**
     * Get HTML found in a region.
* @param start starting offset of region     * @param end ending offset of region     * @return representation of region as HTML     */    public String substringHTML (int start, int end) {        String s = content.substring (start, end);        if (!isHTML ()) {            s = Str.replace (s, "&", "&amp;");            s = Str.replace (s, "<", "&lt;");            s = Str.replace (s, ">", "&gt;");            s = "<PRE>" + s + "</PRE>";        }        return s;    }    /**     * Get tagless text found in a region.     * Runs of whitespace and tags are reduced to a single space character.     * @param start starting offset of region     * @param end ending offset of region     * @return tagless text contained in the region     */    public String substringText (int start, int end) {        if (words == null)            return ""; // page is not parsed        // FIX: find some other mapping        StringBuffer buf = new StringBuffer();        for (int j = findStart (words, start); j<words.length; ++j) {            if (words[j].end > end)                break;            else {                if (buf.length() > 0)                    buf.append (' ');                buf.append (words[j].text);            }        }        return buf.toString();                 }    /**     * Get HTML tags found in a region.  Whitespace and text among the     * tags are deleted.     
* @param start starting offset of region     * @param end ending offset of region     * @return tags contained in the region     */    public String substringTags (int start, int end) {        if (tags == null)            return ""; // page is not parsed        // FIX: find some other mapping        StringBuffer buf = new StringBuffer();        for (int j = findStart (tags, start); j<tags.length; ++j) {            if (tags[j].end > end)                break;            else {                if (buf.length() > 0)                    buf.append (' ');                buf.append (content.substring (tags[j].start, tags[j].end));            }        }        return buf.toString();                 }    /**     * Get canonicalized HTML tags found in a region.     * A canonicalized tag looks like the following:     * <PRE>     * &lt;tagname#index attr=value attr=value attr=value ...&gt     * <PRE>     * where tagname and attr are all lowercase, index is the tag's     * index in the page's tokens array.  Attributes are sorted in     * increasing order by attribute name. Attributes without values     * omit the entire "=value" portion.  Values are delimited by a      * space.  All occurences of &lt, &gt, space, and % characters      * in a value are URL-encoded (e.g., space is converted to %20).       * Thus the only occurences of these characters in the canonical      * tag are the tag delimiters.     *     * <P>For example, raw HTML that looks like:     * <PRE>     * &lt;IMG SRC="http://foo.com/map&lt;&gt;.gif" ISMAP&gt;Image&lt;/IMG&gt;     * </PRE>     * would be canonicalized to:     * <PRE>     * &lt;img ismap src=http://foo.com/map%3C%3E.gif&gt;&lt;/img&gt;     * </PRE>     * <P>     * Comment and declaration tags (whose tag name is !) are omitted     * from the canonicalization.     
*     * @param start starting offset of region     * @param end ending offset of region     * @return canonicalized tags contained in the region     */    public String substringCanonicalTags (int start, int end) {        if (tokens == null)            return ""; // page is not parsed        boolean all = (start == this.start && end == this.end);        if (all && canonicalTags != null)            return canonicalTags;        // FIX: find some other mapping        StringBuffer buf = new StringBuffer();        for (int j = findStart (tokens, start); j<tokens.length; ++j) {            if (tokens[j].end > end)                break;            else if (tokens[j] instanceof Tag)                Tagexp.canonicalizeTag (buf, (Tag)tokens[j], j);        }        String result = buf.toString ();        if (all)            canonicalTags = result;        return result;    }    public static void main (String[] args) throws Exception {        int method = Link.GET;        for (int i=0; i<args.length; ++i) {            if (args[i].equals ("-post"))                method = Link.POST;            else if (args[i].equals ("-get"))                method = Link.GET;            else {                Link link = method == Link.GET                              ? new Link (args[i])                              : new Link (args[i]); // FIX: POST?                try {                    System.out.print (new Page (link).getContent());                } catch (IOException e) {                    System.out.println (e);                }            }        }    }}
上一页 12
💿 文件大小 602 K
👤 上传用户 jwl119
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#java #语言 #编写 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -