📄 fetchftp.java
字号:
/* FetchFTP.java * * $Id: FetchFTP.java 5080 2007-04-13 20:30:49Z gojomo $ * * Created on Jun 5, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler.fetcher;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.net.Socket;import java.net.URLEncoder;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.management.AttributeNotFoundException;import org.apache.commons.httpclient.URIException;import org.apache.commons.net.ftp.FTPCommand;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.extractor.Link;import static org.archive.crawler.extractor.Link.NAVLINK_HOP;import static org.archive.crawler.extractor.Link.NAVLINK_MISC;import org.archive.crawler.framework.Processor;import org.archive.crawler.settings.SimpleType;import org.archive.io.RecordingInputStream;import org.archive.io.ReplayCharSequence;import org.archive.net.ClientFTP;import org.archive.net.FTPException;import org.archive.net.UURI;import org.archive.util.ArchiveUtils;import org.archive.util.HttpRecorder;/** * 
Fetches documents and directory listings using FTP. This class will also * try to extract FTP "links" from directory listings. For this class to * archive a directory listing, the remote FTP server must support the NLIST * command. Most modern FTP servers should. * * @author pjack * */public class FetchFTP extends Processor implements CoreAttributeConstants { /** Serialization ID; robust against trivial API changes. */ private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(FetchFTP.class,1); /** Logger for this class, named after the fully qualified class name. */ private static Logger logger = Logger.getLogger(FetchFTP.class.getName()); /** Pattern for matching directory entries: with MULTILINE enabled it matches one non-empty line at a time and captures that line's text as group 1. */ private static Pattern DIR = Pattern.compile("(.+)$", Pattern.MULTILINE); /** The name for the <code>username</code> attribute. */ final public static String ATTR_USERNAME = "username"; /** The description for the <code>username</code> attribute. */ final private static String DESC_USERNAME = "The username to send to " + "FTP servers. By convention, the default value of \"anonymous\" is " + "used for publicly available FTP sites."; /** The default value for the <code>username</code> attribute. */ final private static String DEFAULT_USERNAME = "anonymous"; /** The name for the <code>password</code> attribute. */ final public static String ATTR_PASSWORD = "password"; /** The description for the <code>password</code> attribute. */ final private static String DESC_PASSWORD = "The password to send to " + "FTP servers. By convention, anonymous users send their email address " + "in this field."; /** The default value for the <code>password</code> attribute (an empty string). */ final private static String DEFAULT_PASSWORD = ""; /** The name for the <code>extract-from-dirs</code> attribute. */ final private static String ATTR_EXTRACT = "extract-from-dirs"; /** The description for the <code>extract-from-dirs</code> attribute. NOTE(review): in this copy the string literal below is broken across a physical line break; a Java string literal cannot span lines, so rejoin it before compiling. */ final private static String DESC_EXTRACT = "Set to true to extract " + "further URIs from FTP directories. 
Default is true."; /** The default value for the <code>extract-from-dirs</code> attribute. */ final private static boolean DEFAULT_EXTRACT = true; /** The name for the <code>extract-parent</code> attribute. Note that the actual value is spelled with an underscore (<code>extract_parent</code>), unlike the other, hyphenated attribute names. */ final private static String ATTR_EXTRACT_PARENT = "extract_parent"; /** The description for the <code>extract-parent</code> attribute. */ final private static String DESC_EXTRACT_PARENT = "Set to true to extract " + "the parent URI from all FTP URIs. Default is true."; /** The default value for the <code>extract-parent</code> attribute. */ final private static boolean DEFAULT_EXTRACT_PARENT = true; /** The name for the <code>max-length-bytes</code> attribute. */ final public static String ATTR_MAX_LENGTH = "max-length-bytes"; /** The description for the <code>max-length-bytes</code> attribute. */ final private static String DESC_MAX_LENGTH = "Maximum length in bytes to fetch.\n" + "Fetch is truncated at this length. A value of 0 means no limit."; /** The default value for the <code>max-length-bytes</code> attribute; per the description text, 0 means no limit. */ final private static long DEFAULT_MAX_LENGTH = 0; /** The name for the <code>fetch-bandwidth</code> attribute. */ final public static String ATTR_BANDWIDTH = "fetch-bandwidth"; /** The description for the <code>fetch-bandwidth</code> attribute. Currently an empty string, so the attribute is registered without any explanatory text. */ final private static String DESC_BANDWIDTH = ""; /** The default value for the <code>fetch-bandwidth</code> attribute. */ final private static int DEFAULT_BANDWIDTH = 0; /** The name for the <code>timeout-seconds</code> attribute. */ final public static String ATTR_TIMEOUT = "timeout-seconds"; /** The description for the <code>timeout-seconds</code> attribute. */ final private static String DESC_TIMEOUT = "If the fetch is not " + "completed in this number of seconds, give up (and retry later)."; /** The default value for the <code>timeout-seconds</code> attribute: 1200 seconds (20 minutes). */ final private static int DEFAULT_TIMEOUT = 1200; /** * Constructs a new <code>FetchFTP</code>. 
* * @param name the name of this processor */ public FetchFTP(String name) { super(name, "FTP Fetcher."); add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME); add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD); add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT); add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT); add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH); add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH); add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT); } /** * Convenience method for adding an attribute. * * @param name The name of the attribute * @param desc The description of the attribute * @param def The default value for the attribute */ private void add(String name, String desc, Object def) { SimpleType st = new SimpleType(name, desc, def); addElementToDefinition(st); } /** * Convenience method for extracting an attribute. * If a value for the specified name cannot be found, * a warning is written to the log and the specified * default value is returned instead. * * @param context The context for the attribute fetch * @param name The name of the attribute to fetch * @param def The value to return if the attribute isn't found * @return The value of that attribute */ private Object get(Object context, String name, Object def) { try { return getAttribute(context, name); } catch (AttributeNotFoundException e) { logger.warning("Attribute not found (using default): " + name); return def; } } /** * Processes the given URI. If the given URI is not an FTP URI, then * this method does nothing. Otherwise an attempt is made to connect * to the FTP server. * * <p>If the connection is successful, an attempt will be made to CD to * the path specified in the URI. If the remote CD command succeeds, * then it is assumed that the URI represents a directory. If the * CD command fails, then it is assumed that the URI represents * a file. 
* * <p>For directories, the directory listing will be fetched using * the FTP LIST command, and saved to the HttpRecorder. If the * <code>extract.from.dirs</code> attribute is set to true, then * the files in the fetched list will be added to the curi as * extracted FTP links. (It was easier to do that here, rather * than writing a separate FTPExtractor.) * * <p>For files, the file will be fetched using the FTP RETR * command, and saved to the HttpRecorder. * * <p>All file transfers (including directory listings) occur using * Binary mode transfer. Also, the local passive transfer mode * is always used, to play well with firewalls. * * @param curi the curi to process * @throws InterruptedException if the thread is interrupted during * processing */ public void innerProcess(CrawlURI curi) throws InterruptedException { if (!curi.getUURI().getScheme().equals("ftp")) { return; } curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); HttpRecorder recorder = HttpRecorder.getHttpRecorder(); ClientFTP client = new ClientFTP(); try { fetch(curi, client, recorder); } catch (FTPException e) { logger.log(Level.SEVERE, "FTP server reported problem.", e); curi.setFetchStatus(e.getReplyCode()); } catch (IOException e) { logger.log(Level.SEVERE, "IO Error during FTP fetch.", e); curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST); } finally { disconnect(client); curi.setContentSize(recorder.getRecordedInput().getSize()); curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); } } /** * Fetches a document from an FTP server. * * @param curi the URI of the document to fetch * @param client the FTPClient to use for the fetch * @param recorder the recorder to preserve the document in * @throws IOException if a network or protocol error occurs * @throws InterruptedException if the thread is interrupted */ private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder) throws IOException, InterruptedException { // Connect to the FTP server. 
UURI uuri = curi.getUURI(); int port = uuri.getPort(); if (port == -1) { port = 21; } client.connectStrict(uuri.getHost(), port); // Authenticate. String[] auth = getAuth(curi); client.loginStrict(auth[0], auth[1]); // The given resource may or may not be a directory. // To figure out which is which, execute a CD command to // the UURI's path. If CD works, it's a directory. boolean dir = client.changeWorkingDirectory(uuri.getPath()); if (dir) { curi.setContentType("text/plain"); } // TODO: A future version of this class could use the system string to // set up custom directory parsing if the FTP server doesn't support // the nlist command. if (logger.isLoggable(Level.FINE)) { String system = client.getSystemName(); logger.fine(system); } // Get a data socket. This will either be the result of a NLIST // command for a directory, or a RETR command for a file. int command = dir ? FTPCommand.NLST : FTPCommand.RETR; String path = dir ? "." : uuri.getPath(); client.enterLocalPassiveMode(); client.setBinary(); Socket socket = client.openDataConnection(command, path); curi.setFetchStatus(client.getReplyCode()); // Save the streams in the CURI, where downstream processors // expect to find them. try { saveToRecorder(curi, socket, recorder); } finally { recorder.close(); close(socket);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -