⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchftp.java

📁 这是个爬虫，和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* FetchFTP.java * * $Id: FetchFTP.java 5080 2007-04-13 20:30:49Z gojomo $ * * Created on Jun 5, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.fetcher;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.net.Socket;import java.net.URLEncoder;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.management.AttributeNotFoundException;import org.apache.commons.httpclient.URIException;import org.apache.commons.net.ftp.FTPCommand;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.extractor.Link;import static org.archive.crawler.extractor.Link.NAVLINK_HOP;import static org.archive.crawler.extractor.Link.NAVLINK_MISC;import org.archive.crawler.framework.Processor;import org.archive.crawler.settings.SimpleType;import org.archive.io.RecordingInputStream;import org.archive.io.ReplayCharSequence;import org.archive.net.ClientFTP;import org.archive.net.FTPException;import org.archive.net.UURI;import org.archive.util.ArchiveUtils;import org.archive.util.HttpRecorder;/** * 
Fetches documents and directory listings using FTP.  This class will also * try to extract FTP "links" from directory listings.  For this class to * archive a directory listing, the remote FTP server must support the NLIST * command.  Most modern FTP servers should. *  * @author pjack * */public class FetchFTP extends Processor implements CoreAttributeConstants {        /** Serialization ID; robust against trivial API changes. */    private static final long serialVersionUID =     ArchiveUtils.classnameBasedUID(FetchFTP.class,1);    /** Logger for this class. */    private static Logger logger = Logger.getLogger(FetchFTP.class.getName());    /** Pattern for matching directory entries. */    private static Pattern DIR =      Pattern.compile("(.+)$", Pattern.MULTILINE);        /** The name for the <code>username</code> attribute. */    final public static String ATTR_USERNAME = "username";       /** The description for the <code>username</code> attribute. */    final private static String DESC_USERNAME = "The username to send to " +     "FTP servers.  By convention, the default value of \"anonymous\" is " +     "used for publicly available FTP sites.";        /** The default value for the <code>username</code> attribute. */    final private static String DEFAULT_USERNAME = "anonymous";    /** The name for the <code>password</code> attribute. */    final public static String ATTR_PASSWORD = "password";       /** The description for the <code>password</code> attribute. */    final private static String DESC_PASSWORD = "The password to send to " +    "FTP servers.  By convention, anonymous users send their email address " +    "in this field.";        /** The default value for the <code>password</code> attribute. */    final private static String DEFAULT_PASSWORD = "";        /** The name for the <code>extract-from-dirs</code> attribute. 
*/    final private static String ATTR_EXTRACT = "extract-from-dirs";        /** The description for the <code>extract-from-dirs</code> attribute. */    final private static String DESC_EXTRACT = "Set to true to extract "     + "further URIs from FTP directories.  Default is true.";        /** The default value for the <code>extract-from-dirs</code> attribute. */    final private static boolean DEFAULT_EXTRACT = true;        /** The name for the <code>extract-parent</code> attribute. */    final private static String ATTR_EXTRACT_PARENT = "extract_parent";        /** The description for the <code>extract-parent</code> attribute. */    final private static String DESC_EXTRACT_PARENT = "Set to true to extract "     + "the parent URI from all FTP URIs.  Default is true.";        /** The default value for the <code>extract-parent</code> attribute. */    final private static boolean DEFAULT_EXTRACT_PARENT = true;            /** The name for the <code>max-length-bytes</code> attribute. */    final public static String ATTR_MAX_LENGTH = "max-length-bytes";        /** The description for the <code>max-length-bytes</code> attribute. */    final private static String DESC_MAX_LENGTH =         "Maximum length in bytes to fetch.\n" +        "Fetch is truncated at this length. A value of 0 means no limit.";        /** The default value for the <code>max-length-bytes</code> attribute. */    final private static long DEFAULT_MAX_LENGTH = 0;        /** The name for the <code>fetch-bandwidth</code> attribute. */    final public static String ATTR_BANDWIDTH = "fetch-bandwidth";        /** The description for the <code>fetch-bandwidth</code> attribute. */    final private static String DESC_BANDWIDTH = "";        /** The default value for the <code>fetch-bandwidth</code> attribute. */    final private static int DEFAULT_BANDWIDTH = 0;            /** The name for the <code>timeout-seconds</code> attribute. 
*/    final public static String ATTR_TIMEOUT = "timeout-seconds";        /** The description for the <code>timeout-seconds</code> attribute. */    final private static String DESC_TIMEOUT = "If the fetch is not "     + "completed in this number of seconds, give up (and retry later).";        /** The default value for the <code>timeout-seconds</code> attribute. */    final private static int DEFAULT_TIMEOUT = 1200;        /**     * Constructs a new <code>FetchFTP</code>.     *      * @param name  the name of this processor     */    public FetchFTP(String name) {        super(name, "FTP Fetcher.");        add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME);        add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD);        add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT);        add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);        add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH);        add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH);        add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT);    }        /**     * Convenience method for adding an attribute.     *      * @param name   The name of the attribute     * @param desc   The description of the attribute     * @param def    The default value for the attribute     */    private void add(String name, String desc, Object def) {        SimpleType st = new SimpleType(name, desc, def);        addElementToDefinition(st);    }            /**     * Convenience method for extracting an attribute.     * If a value for the specified name cannot be found,     * a warning is written to the log and the specified     * default value is returned instead.     
*      * @param context  The context for the attribute fetch     * @param name     The name of the attribute to fetch     * @param def      The value to return if the attribute isn't found     * @return         The value of that attribute     */    private Object get(Object context, String name, Object def) {        try {            return getAttribute(context, name);        } catch (AttributeNotFoundException e) {            logger.warning("Attribute not found (using default): " + name);            return def;        }    }        /**     * Processes the given URI.  If the given URI is not an FTP URI, then     * this method does nothing.  Otherwise an attempt is made to connect     * to the FTP server.     *      * <p>If the connection is successful, an attempt will be made to CD to      * the path specified in the URI.  If the remote CD command succeeds,      * then it is assumed that the URI represents a directory.  If the     * CD command fails, then it is assumed that the URI represents     * a file.     *      * <p>For directories, the directory listing will be fetched using     * the FTP LIST command, and saved to the HttpRecorder.  If the     * <code>extract.from.dirs</code> attribute is set to true, then     * the files in the fetched list will be added to the curi as     * extracted FTP links.  (It was easier to do that here, rather     * than writing a separate FTPExtractor.)     *      * <p>For files, the file will be fetched using the FTP RETR     * command, and saved to the HttpRecorder.     *      * <p>All file transfers (including directory listings) occur using     * Binary mode transfer.  Also, the local passive transfer mode     * is always used, to play well with firewalls.     
*      * @param curi  the curi to process     * @throws InterruptedException  if the thread is interrupted during     *   processing     */    public void innerProcess(CrawlURI curi) throws InterruptedException {        if (!curi.getUURI().getScheme().equals("ftp")) {            return;        }                curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());        HttpRecorder recorder = HttpRecorder.getHttpRecorder();        ClientFTP client = new ClientFTP();                try {            fetch(curi, client, recorder);        } catch (FTPException e) {            logger.log(Level.SEVERE, "FTP server reported problem.", e);            curi.setFetchStatus(e.getReplyCode());        } catch (IOException e) {            logger.log(Level.SEVERE, "IO Error during FTP fetch.", e);            curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);        } finally {            disconnect(client);            curi.setContentSize(recorder.getRecordedInput().getSize());            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());        }    }    /**     * Fetches a document from an FTP server.     *      * @param curi      the URI of the document to fetch     * @param client    the FTPClient to use for the fetch     * @param recorder  the recorder to preserve the document in     * @throws IOException  if a network or protocol error occurs     * @throws InterruptedException  if the thread is interrupted     */    private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder)     throws IOException, InterruptedException {        // Connect to the FTP server.        UURI uuri = curi.getUURI();        int port = uuri.getPort();        if (port == -1) {            port = 21;        }        client.connectStrict(uuri.getHost(), port);                // Authenticate.        String[] auth = getAuth(curi);        client.loginStrict(auth[0], auth[1]);                // The given resource may or may not be a directory.        
// To figure out which is which, execute a CD command to        // the UURI's path.  If CD works, it's a directory.        boolean dir = client.changeWorkingDirectory(uuri.getPath());        if (dir) {            curi.setContentType("text/plain");        }                // TODO: A future version of this class could use the system string to        // set up custom directory parsing if the FTP server doesn't support         // the nlist command.        if (logger.isLoggable(Level.FINE)) {            String system = client.getSystemName();            logger.fine(system);        }                // Get a data socket.  This will either be the result of a NLIST        // command for a directory, or a RETR command for a file.        int command = dir ? FTPCommand.NLST : FTPCommand.RETR;        String path = dir ? "." : uuri.getPath();        client.enterLocalPassiveMode();        client.setBinary();        Socket socket = client.openDataConnection(command, path);        curi.setFetchStatus(client.getReplyCode());        // Save the streams in the CURI, where downstream processors        // expect to find them.        try {            saveToRecorder(curi, socket, recorder);        } finally {            recorder.close();            close(socket);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -