
📄 RobotExclusion.java

📁 A web crawler program written in Java
💻 Java
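RobotExclusion fetches each site's /robots.txt once, caches the parsed rules per "host:port", and answers whether a given URL may be crawled. For context, the files it parses follow the conventional robots.txt record format: a User-agent field naming one or more robots (or * for all), followed by Disallow fields listing path prefixes those robots must not fetch, with blank lines separating records. A hypothetical example:

User-agent: *
Disallow: /cgi-bin/
Disallow: /private/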
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.io.PushbackInputStream;
import java.io.BufferedInputStream;
import java.util.Vector;

public class RobotExclusion {

    String myUserAgent;
        // user agent (name) of this crawler, all lower-case

    Hashtable entries = new Hashtable ();
        // maps from a website ("host:port") to String[]

    /**
     * Make a RobotExclusion object.
     * @param userAgent name of the robot using this object, as shown
     *  in the User-Agent header fields of its HTTP requests.  Use
     *  null for anonymous robots.
     */
    public RobotExclusion (String userAgent) {
        // guard against null, since the javadoc allows anonymous robots
        myUserAgent = userAgent != null ? userAgent.toLowerCase () : null;
    }

    /**
     * Check whether a URL is disallowed by robots.txt.
     * @param url URL to test
     * @return true if url's Web site denies robot access to the url
     */
    public boolean disallowed (URL url) {
        if (!url.getProtocol().startsWith ("http"))
            // only HTTP URLs are protected by robots.txt
            return false;

        String website = getWebSite (url);
        String[] rules = (String[])entries.get (website);
        if (rules == null) {
            rules = getRobotsTxt (website, myUserAgent);
            entries.put (website, rules);
        }

        String path = url.getFile ();
        for (int i=0; i<rules.length; ++i) {
            if (path.startsWith (rules[i])) {
                //System.err.println ("disallowed by rule " + rules[i]);
                return true;
            }
            //System.err.println ("allowed by rule " + rules[i]);
        }
        return false;
    }

    /**
     * Clear the cache of robots.txt entries.
     */
    public void clear () {
        entries.clear ();
    }

    /*
     * Implementation
     */

    String getWebSite (URL url) {
        String hostname = url.getHost ();
        int port = url.getPort ();
        return port != -1 ? hostname + ":" + port : hostname;
    }

    Vector rulebuf = new Vector ();

    String[] getRobotsTxt (String website, String userAgent) {
        try {
            URL robotstxtURL = new URL ("http://" + website + "/robots.txt");
            URLConnection uc = SecurityPolicy.getPolicy().openConnection (robotstxtURL);
            PushbackInputStream in = new PushbackInputStream (new BufferedInputStream (uc.getInputStream ()));

            rulebuf.setSize (0);
            boolean relevant = false, specific = false;
            String lastFieldName = null;
            while (readField (in)) {
                //System.err.println (fieldName + ":" + fieldValue);

                if (fieldName == null) { // end of record
                    if (specific)
                        break; // while loop
                    relevant = false;
                }
                else if (fieldName.equals ("user-agent")) {
                    if (lastFieldName != null && lastFieldName.equals ("disallow")) {
                        // end of record
                        if (specific)
                            break; // while loop
                        relevant = false;
                    }

                    if (userAgent != null && userAgent.indexOf (fieldValue.toLowerCase()) != -1) {
                        relevant = true;
                        specific = true;
                        rulebuf.setSize (0);
                    }
                    else if (fieldValue.equals ("*")) {
                        relevant = true;
                        rulebuf.setSize (0);
                    }
                }
                else if (relevant && fieldName.equals ("disallow")) {
                    rulebuf.addElement (fieldValue);
                }
                else { // end of record
                    if (specific)
                        break; // while loop
                    relevant = false;
                }
                lastFieldName = fieldName;
            }
            in.close ();

            String[] rules = new String[rulebuf.size ()];
            rulebuf.copyInto (rules);
            return rules;
        } catch (Exception e) {
            // debugging only
            // System.err.println ("RobotExclusion: error while retrieving " + website + "/robots.txt:");
            // e.printStackTrace ();
            return new String[0];
        }
    }

    String fieldName, fieldValue;
    static final int MAX_LINE_LENGTH = 1024;
    StringBuffer linebuf = new StringBuffer ();

    // Reads one line from the input stream, parsing it into
    // fieldName and fieldValue.  Field name is lower case;
    // whitespace is stripped at both ends of name and value.
    // e.g., User-agent: Webcrawler
    // is parsed into fieldName="user-agent" and fieldValue="Webcrawler".
    // Field-less lines are parsed as fieldName=null and fieldValue=null.
    // Returns true if a line was read, false on end-of-file.
    boolean readField (PushbackInputStream in) throws Exception {
        fieldName = null;
        fieldValue = null;

        linebuf.setLength (0);
        int c;
        int n = 0;
        boolean saw_eoln = false;
        while (true) {
            c = in.read ();
            if (c == -1)
                break;
            else if (c == '\r' || c == '\n')
                saw_eoln = true;
            else if (saw_eoln) {
                in.unread (c);
                break;
            }
            else {
                linebuf.append ((char)c);
            }

            ++n;
            if (n == MAX_LINE_LENGTH)
                break;
        }
        //System.err.println (linebuf);

        if (n == 0)
            return false;

        // extract fields from line and return
        String line = linebuf.toString ();
        int colon = line.indexOf (':');
        if (colon == -1) {
            fieldName = null;
            fieldValue = null;
        }
        else {
            fieldName = line.substring (0, colon).trim ().toLowerCase ();
            fieldValue = line.substring (colon+1).trim ();
        }
        return true;
    }

    public static void main (String argv[]) throws Exception {
        RobotExclusion robot = new RobotExclusion (argv[0]);
        for (int i=1; i<argv.length; ++i) {
            System.out.println (argv[i] + ": "
                    + (!robot.disallowed (new URL (argv[i])) ? "OK" : "disallowed"));
        }
        System.in.read ();
    }
}
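The main method above doubles as a command-line checker (java websphinx.RobotExclusion <agent-name> <url>...). For use inside a crawler, here is a minimal sketch, assuming the compiled websphinx package (including its SecurityPolicy class) is on the classpath; the class name RobotExclusionDemo, the agent name, and the URL below are hypothetical:

import java.net.URL;
import websphinx.RobotExclusion;

public class RobotExclusionDemo {
    public static void main (String[] args) throws Exception {
        // Name the crawler as it will appear in its User-Agent header,
        // so agent-specific robots.txt records can match it.
        RobotExclusion robots = new RobotExclusion ("MyCrawler");

        URL url = new URL ("http://example.com/private/page.html"); // hypothetical URL
        if (robots.disallowed (url))   // fetches and caches robots.txt on the first call
            System.out.println ("robots.txt forbids fetching " + url);
        else
            System.out.println ("OK to fetch " + url);
    }
}

Because results are cached per "host:port", repeated checks against the same site reuse the first robots.txt fetch; call clear () to force a refresh.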
