📄 fetchftp.java
字号:
} curi.setFetchStatus(200); if (dir) { extract(curi, recorder); } addParent(curi); } /** * Saves the given socket to the given recorder. * * @param curi the curi that owns the recorder * @param socket the socket whose streams to save * @param recorder the recorder to save them to * @throws IOException if a network or file error occurs * @throws InterruptedException if the thread is interrupted */ private void saveToRecorder(CrawlURI curi, Socket socket, HttpRecorder recorder) throws IOException, InterruptedException { curi.setHttpRecorder(recorder); recorder.markContentBegin(); recorder.inputWrap(socket.getInputStream()); recorder.outputWrap(socket.getOutputStream()); // Read the remote file/dir listing in its entirety. long softMax = 0; long hardMax = getMaxLength(curi); long timeout = (long)getTimeout(curi) * 1000; int maxRate = getFetchBandwidth(curi); RecordingInputStream input = recorder.getRecordedInput(); input.setLimits(hardMax, timeout, maxRate); input.readFullyOrUntil(softMax); } /** * Extract FTP links in a directory listing. * The listing must already be saved to the given recorder. * * @param curi The curi to save extracted links to * @param recorder The recorder containing the directory listing */ private void extract(CrawlURI curi, HttpRecorder recorder) { if (!getExtractFromDirs(curi)) { return; } ReplayCharSequence seq = null; try { seq = recorder.getReplayCharSequence(); extract(curi, seq); } catch (IOException e) { logger.log(Level.SEVERE, "IO error during extraction.", e); } catch (RuntimeException e) { logger.log(Level.SEVERE, "IO error during extraction.", e); } finally { close(seq); } } /** * Extracts FTP links in a directory listing. * * @param curi The curi to save extracted links to * @param dir The directory listing to extract links from * @throws URIException if an extracted link is invalid */ private void extract(CrawlURI curi, ReplayCharSequence dir) { logger.log(Level.FINEST, "Extracting URIs from FTP directory."); Matcher matcher = DIR.matcher(dir); while (matcher.find()) { String file = matcher.group(1); addExtracted(curi, file); } } /** * Adds an extracted filename to the curi. A new URI will be formed * by taking the given curi (which should represent the directory the * file lives in) and appending the file. * * @param curi the curi to store the discovered link in * @param file the filename of the discovered link */ private void addExtracted(CrawlURI curi, String file) { try { file = URLEncoder.encode(file, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new AssertionError(e); } if (logger.isLoggable(Level.FINEST)) { logger.log(Level.FINEST, "Found " + file); } String base = curi.toString(); if (base.endsWith("/")) { base = base.substring(0, base.length() - 1); } try { UURI n = new UURI(base + "/" + file, true); Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP); curi.addOutLink(link); } catch (URIException e) { logger.log(Level.WARNING, "URI error during extraction.", e); } } /** * Extracts the parent URI from the given curi, then adds that parent * URI as a discovered link to the curi. * * <p>If the <code>extract-parent</code> attribute is false, then this * method does nothing. Also, if the path of the given curi is * <code>/</code>, then this method does nothing. * * <p>Otherwise the parent is determined by eliminated the lowest part * of the URI's path. Eg, the parent of <code>ftp://foo.com/one/two</code> * is <code>ftp://foo.com/one</code>. * * @param curi the curi whose parent to add */ private void addParent(CrawlURI curi) { if (!getExtractParent(curi)) { return; } UURI uuri = curi.getUURI(); try { if (uuri.getPath().equals("/")) { // There's no parent to add. return; } String scheme = uuri.getScheme(); String auth = uuri.getEscapedAuthority(); String path = uuri.getEscapedCurrentHierPath(); UURI parent = new UURI(scheme + "://" + auth + path, false); Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP); curi.addOutLink(link); } catch (URIException e) { logger.log(Level.WARNING, "URI error during extraction.", e); } } /** * Returns the <code>extract.from.dirs</code> attribute for this * <code>FetchFTP</code> and the given curi. * * @param curi the curi whose attribute to return * @return that curi's <code>extract.from.dirs</code> */ public boolean getExtractFromDirs(CrawlURI curi) { return (Boolean)get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT); } /** * Returns the <code>extract.parent</code> attribute for this * <code>FetchFTP</code> and the given curi. * * @param curi the curi whose attribute to return * @return that curi's <code>extract-parent</code> */ public boolean getExtractParent(CrawlURI curi) { return (Boolean)get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT); } /** * Returns the <code>timeout-seconds</code> attribute for this * <code>FetchFTP</code> and the given curi. * * @param curi the curi whose attribute to return * @return that curi's <code>timeout-seconds</code> */ public int getTimeout(CrawlURI curi) { return (Integer)get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT); } /** * Returns the <code>max-length-bytes</code> attribute for this * <code>FetchFTP</code> and the given curi. * * @param curi the curi whose attribute to return * @return that curi's <code>max-length-bytes</code> */ public long getMaxLength(CrawlURI curi) { return (Long)get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH); } /** * Returns the <code>fetch-bandwidth</code> attribute for this * <code>FetchFTP</code> and the given curi. * * @param curi the curi whose attribute to return * @return that curi's <code>fetch-bandwidth</code> */ public int getFetchBandwidth(CrawlURI curi) { return (Integer)get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH); } /** * Returns the username and password for the given URI. This method * always returns an array of length 2. The first element in the returned * array is the username for the URI, and the second element is the * password. * * <p>If the URI itself contains the username and password (i.e., it looks * like <code>ftp://username:password@host/path</code>) then that username * and password are returned. * * <p>Otherwise the settings system is probed for the <code>username</code> * and <code>password</code> attributes for this <code>FTPFetch</code> * and the given <code>curi</code> context. The values of those * attributes are then returned. * * @param curi the curi whose username and password to return * @return an array containing the username and password */ private String[] getAuth(CrawlURI curi) { String[] result = new String[2]; UURI uuri = curi.getUURI(); String userinfo; try { userinfo = uuri.getUserinfo(); } catch (URIException e) { assert false; logger.finest("getUserinfo raised URIException."); userinfo = null; } if (userinfo != null) { int p = userinfo.indexOf(':'); if (p > 0) { result[0] = userinfo.substring(0,p); result[1] = userinfo.substring(p + 1); return result; } } result[0] = (String)get(curi, ATTR_USERNAME, DEFAULT_USERNAME); result[1] = (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD); return result; } /** * Determines the password for the given URI. If the URI itself contains * a password, then that password is returned. Otherwise the settings * system is probed for the <code>password</code> attribute, and the value * for that attribute is returned. * * @param curi the curi whose password to return * @return that password */ public String determinePassword(CrawlURI curi) { return (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD); } /** * Quietly closes the given socket. * * @param socket the socket to close */ private static void close(Socket socket) { try { socket.close(); } catch (IOException e) { logger.log(Level.WARNING, "IO error closing socket.", e); } } /** * Quietly closes the given sequence. * If an IOException is raised, this method logs it as a warning. * * @param seq the sequence to close */ private static void close(ReplayCharSequence seq) { if (seq == null) { return; } try { seq.close(); } catch (IOException e) { logger.log(Level.WARNING, "IO error closing ReplayCharSequence.", e); } } /** * Quietly disconnects from the given FTP client. * If an IOException is raised, this method logs it as a warning. * * @param client the client to disconnect */ private static void disconnect(ClientFTP client) { if (client.isConnected()) try { client.disconnect(); } catch (IOException e) { if (logger.isLoggable(Level.WARNING)) { logger.warning("Could not disconnect from FTP client: " + e.getMessage()); } } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -