⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchftp.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        }        curi.setFetchStatus(200);        if (dir) {            extract(curi, recorder);        }        addParent(curi);    }            /**     * Saves the given socket to the given recorder.     *      * @param curi      the curi that owns the recorder     * @param socket    the socket whose streams to save     * @param recorder  the recorder to save them to     * @throws IOException  if a network or file error occurs     * @throws InterruptedException  if the thread is interrupted     */    private void saveToRecorder(CrawlURI curi,            Socket socket, HttpRecorder recorder)     throws IOException, InterruptedException {        curi.setHttpRecorder(recorder);        recorder.markContentBegin();        recorder.inputWrap(socket.getInputStream());        recorder.outputWrap(socket.getOutputStream());        // Read the remote file/dir listing in its entirety.        long softMax = 0;        long hardMax = getMaxLength(curi);        long timeout = (long)getTimeout(curi) * 1000;        int maxRate = getFetchBandwidth(curi);        RecordingInputStream input = recorder.getRecordedInput();        input.setLimits(hardMax, timeout, maxRate);         input.readFullyOrUntil(softMax);    }            /**     * Extract FTP links in a directory listing.     * The listing must already be saved to the given recorder.     
*      * @param curi      The curi to save extracted links to     * @param recorder  The recorder containing the directory listing     */    private void extract(CrawlURI curi, HttpRecorder recorder) {        if (!getExtractFromDirs(curi)) {            return;        }                ReplayCharSequence seq = null;        try {            seq = recorder.getReplayCharSequence();            extract(curi, seq);        } catch (IOException e) {            logger.log(Level.SEVERE, "IO error during extraction.", e);        } catch (RuntimeException e) {            logger.log(Level.SEVERE, "IO error during extraction.", e);        } finally {            close(seq);        }    }            /**     * Extracts FTP links in a directory listing.     *      * @param curi  The curi to save extracted links to     * @param dir   The directory listing to extract links from     * @throws URIException  if an extracted link is invalid     */    private void extract(CrawlURI curi, ReplayCharSequence dir) {        logger.log(Level.FINEST, "Extracting URIs from FTP directory.");        Matcher matcher = DIR.matcher(dir);        while (matcher.find()) {            String file = matcher.group(1);            addExtracted(curi, file);        }    }    /**     * Adds an extracted filename to the curi.  A new URI will be formed     * by taking the given curi (which should represent the directory the     * file lives in) and appending the file.     
*      * @param curi  the curi to store the discovered link in     * @param file  the filename of the discovered link     */    private void addExtracted(CrawlURI curi, String file) {        try {            file = URLEncoder.encode(file, "UTF-8");        } catch (UnsupportedEncodingException e) {            throw new AssertionError(e);        }        if (logger.isLoggable(Level.FINEST)) {            logger.log(Level.FINEST, "Found " + file);        }        String base = curi.toString();        if (base.endsWith("/")) {            base = base.substring(0, base.length() - 1);        }        try {            UURI n = new UURI(base + "/" + file, true);            Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP);            curi.addOutLink(link);        } catch (URIException e) {            logger.log(Level.WARNING, "URI error during extraction.", e);                    }    }        /**     * Extracts the parent URI from the given curi, then adds that parent     * URI as a discovered link to the curi.      *      * <p>If the <code>extract-parent</code> attribute is false, then this     * method does nothing.  Also, if the path of the given curi is      * <code>/</code>, then this method does nothing.     *      * <p>Otherwise the parent is determined by eliminated the lowest part     * of the URI's path.  Eg, the parent of <code>ftp://foo.com/one/two</code>     * is <code>ftp://foo.com/one</code>.     *      * @param curi  the curi whose parent to add     */    private void addParent(CrawlURI curi) {        if (!getExtractParent(curi)) {            return;        }        UURI uuri = curi.getUURI();        try {            if (uuri.getPath().equals("/")) {                // There's no parent to add.                
return;            }            String scheme = uuri.getScheme();            String auth = uuri.getEscapedAuthority();            String path = uuri.getEscapedCurrentHierPath();            UURI parent = new UURI(scheme + "://" + auth + path, false);            Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP);            curi.addOutLink(link);        } catch (URIException e) {            logger.log(Level.WARNING, "URI error during extraction.", e);        }    }            /**     * Returns the <code>extract.from.dirs</code> attribute for this     * <code>FetchFTP</code> and the given curi.     *      * @param curi  the curi whose attribute to return     * @return  that curi's <code>extract.from.dirs</code>     */    public boolean getExtractFromDirs(CrawlURI curi) {        return (Boolean)get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT);    }            /**     * Returns the <code>extract.parent</code> attribute for this     * <code>FetchFTP</code> and the given curi.     *      * @param curi  the curi whose attribute to return     * @return  that curi's <code>extract-parent</code>     */    public boolean getExtractParent(CrawlURI curi) {        return (Boolean)get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);    }    /**     * Returns the <code>timeout-seconds</code> attribute for this     * <code>FetchFTP</code> and the given curi.     *      * @param curi   the curi whose attribute to return     * @return   that curi's <code>timeout-seconds</code>     */    public int getTimeout(CrawlURI curi) {        return (Integer)get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT);    }    /**     * Returns the <code>max-length-bytes</code> attribute for this     * <code>FetchFTP</code> and the given curi.     
*      * @param curi  the curi whose attribute to return     * @return  that curi's <code>max-length-bytes</code>     */    public long getMaxLength(CrawlURI curi) {        return (Long)get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH);    }    /**     * Returns the <code>fetch-bandwidth</code> attribute for this     * <code>FetchFTP</code> and the given curi.     *      * @param curi  the curi whose attribute to return     * @return  that curi's <code>fetch-bandwidth</code>     */    public int getFetchBandwidth(CrawlURI curi) {        return (Integer)get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH);    }    /**     * Returns the username and password for the given URI.  This method     * always returns an array of length 2.  The first element in the returned     * array is the username for the URI, and the second element is the     * password.     *      * <p>If the URI itself contains the username and password (i.e., it looks     * like <code>ftp://username:password@host/path</code>) then that username     * and password are returned.     *      * <p>Otherwise the settings system is probed for the <code>username</code>     * and <code>password</code> attributes for this <code>FTPFetch</code>     * and the given <code>curi</code> context.  The values of those      * attributes are then returned.     
*      * @param curi  the curi whose username and password to return     * @return  an array containing the username and password     */    private String[] getAuth(CrawlURI curi) {        String[] result = new String[2];        UURI uuri = curi.getUURI();        String userinfo;        try {            userinfo = uuri.getUserinfo();        } catch (URIException e) {            assert false;            logger.finest("getUserinfo raised URIException.");            userinfo = null;        }        if (userinfo != null) {            int p = userinfo.indexOf(':');            if (p > 0) {                result[0] = userinfo.substring(0,p);                result[1] = userinfo.substring(p + 1);                return result;            }        }        result[0] = (String)get(curi, ATTR_USERNAME, DEFAULT_USERNAME);        result[1] = (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);        return result;    }            /**     * Determines the password for the given URI.  If the URI itself contains     * a password, then that password is returned.  Otherwise the settings     * system is probed for the <code>password</code> attribute, and the value     * for that attribute is returned.     *      * @param curi  the curi whose password to return     * @return  that password     */    public String determinePassword(CrawlURI curi) {        return (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);    }    /**     * Quietly closes the given socket.     *      * @param socket  the socket to close     */    private static void close(Socket socket) {        try {            socket.close();        } catch (IOException e) {            logger.log(Level.WARNING, "IO error closing socket.", e);        }    }    /**     * Quietly closes the given sequence.     * If an IOException is raised, this method logs it as a warning.     
*
     * Does nothing if the sequence is <code>null</code>.
     *
     * @param seq  the sequence to close; may be <code>null</code>
     */
    private static void close(ReplayCharSequence seq) {
        if (seq == null) {
            return;
        }
        try {
            seq.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "IO error closing ReplayCharSequence.",
              e);
        }
    }


    /**
     * Quietly disconnects from the given FTP client.
     * Does nothing if the client reports that it is not connected.
     * If an IOException is raised, this method logs it as a warning.
     *
     * @param client  the client to disconnect
     */
    private static void disconnect(ClientFTP client) {
        if (!client.isConnected()) {
            return;
        }
        try {
            client.disconnect();
        } catch (IOException e) {
            // Pass the exception itself so the cause and stack trace reach
            // the log, as the other quiet-close helpers do.  The logger
            // performs its own level check.
            logger.log(Level.WARNING,
                "Could not disconnect from FTP client: " + e.getMessage(), e);
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -