📄 mirrorwriterprocessor.java
字号:
// Trim any trailing File.separatorChar characters from baseSeg. while ((baseSeg.length() > 1) && baseSeg.endsWith(File.separator)) { baseSeg = baseSeg.substring(0, baseSeg.length() - 1); } if (0 == baseSeg.length()) { baseDir = getController().getDisk().getPath(); } else if ((new File(baseSeg)).isAbsolute()) { baseDir = baseSeg; } else { baseDir = getController().getDisk().getPath() + File.separator + baseSeg; } // Already have a path for this URI. boolean reCrawl = curi.containsKey(A_MIRROR_PATH); /* The file system path, relative to the value of ATTR_PATH, where this resource should be written. The intent is to add later a persistent mapping from URI to path. This will allow a URI to be re-crawled and updated if it has changed. If the resource has already been fetched and written to a file before, the path to that file has already been obtained from the persistent mapping and placed on the AList by some other module, such as the frontier. */ String mps = null; File destFile = null; // Write resource contents to this file. try { if (reCrawl) { mps = curi.getString(A_MIRROR_PATH); destFile = new File(baseDir + File.separator + mps); File parent = destFile.getParentFile(); if (null != parent) { IoUtils.ensureWriteableDirectory(parent); } } else { URIToFileReturn r = null; // Return from uriToFile(). try { r = uriToFile(baseDir, curi); } catch (AttributeNotFoundException e) { logger.warning(e.getLocalizedMessage()); return; } destFile = r.getFile(); mps = r.getRelativePath(); } logger.info(uuri.toString() + " -> " + destFile.getPath()); writeToPath(recis, destFile); if (!reCrawl) { curi.putString(A_MIRROR_PATH, mps); } } catch (IOException e) { curi.addLocalizedError(this.getName(), e, "Mirror"); } } /** Gets the directory in which the file will reside. Any directories needed are created. @param baseDir the path to the starting directory @param host the host part of the URI, or null if the host name should not be part of the returned path @param port the port part of the URI, or -1 if the port should not be part of the returned path @param segs all the segments in the URI @param maxLen the maximum path length allowed to the directory; this must leave some room for the file itself @return the directory, or null if maxLen would be exceeded @throws IOException if a needed directory could not be created @throws IOException if a needed directory is not writeable @throws IOException if a non-directory file exists with the same path as a needed directory */ private URIToFileReturn dirPath(String baseDir, String host, int port, PathSegment[] segs, int maxLen) throws IOException { // Return value. URIToFileReturn r = new URIToFileReturn(baseDir, host, port); r.mkdirs(); for (int i = 0; (segs.length - 1) != i; ++i) { segs[i].addToPath(r); if (r.longerThan(maxLen)) { return null; } } return r; } /** Ensures that a list contains an even number of elements. If not, the last element is removed. @param list the list */ private void ensurePairs(ListType list) { if (1 == (list.size() % 2)) { list.remove(list.size() - 1); } } /** Makes a path in which a resource can be stored. @param baseDir the path to the starting directory @param curi the URI @return a path to the file in which to store the resource @throws AttributeNotFoundException if a needed setting is missing @throws IOException if a needed directory could not be created @throws IOException if a needed directory is not writeable @throws IOException if a non-directory file exists with the same path as a needed directory */ private URIToFileReturn uriToFile(String baseDir, CrawlURI curi) throws AttributeNotFoundException, IOException { UURI uuri = curi.getUURI(); // Current URI. String host = null; Boolean hd = (Boolean) getAttribute(ATTR_HOST_DIRECTORY, curi); if (hd.booleanValue()) { host = uuri.getHost(); StringList hostMap = (StringList) getAttribute(ATTR_HOST_MAP, curi); if ((null != hostMap) && (hostMap.size() > 1)) { ensurePairs(hostMap); Iterator<String> i = hostMap.typesafe().iterator(); for (boolean more = true; more && i.hasNext();) { String h1 = i.next(); String h2 = i.next(); if (host.equalsIgnoreCase(h1)) { more = false; if ((null != h2) && (0 != h2.length())) { host = h2; } } } } } int port = ((Boolean) getAttribute(ATTR_PORT_DIRECTORY, curi)).booleanValue() ? uuri.getPort() : -1; String suffix = null; // Replacement suffix. StringList ctm = (StringList) getAttribute(ATTR_CONTENT_TYPE_MAP, curi); if ((null != ctm) && (ctm.size() > 1)) { ensurePairs(ctm); String contentType = curi.getContentType().toLowerCase(); Iterator i = ctm.iterator(); for (boolean more = true; more && i.hasNext();) { String ct = (String) i.next(); String suf = (String) i.next(); if ((null != ct) && contentType.startsWith(ct.toLowerCase())) { more = false; if ((null != suf) && (0 != suf.length())) { suffix = suf; } } } } int maxSegLen = ((Integer) getAttribute(ATTR_MAX_SEG_LEN, curi)).intValue(); if (maxSegLen < 2) { maxSegLen = DEFAULT_MAX_SEG_LEN; } int maxPathLen = ((Integer) getAttribute(ATTR_MAX_PATH_LEN, curi)).intValue(); if (maxPathLen < 2) { maxPathLen = DEFAULT_MAX_PATH_LEN; } Map<String,String> characterMap = EMPTY_MAP; StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi); if ((null != cm) && (cm.size() > 1)) { ensurePairs(cm); characterMap = new HashMap<String,String>(cm.size()); // Above will be half full. for (Iterator i = cm.iterator(); i.hasNext();) { String s1 = (String) i.next(); String s2 = (String) i.next(); if ((null != s1) && (1 == s1.length()) && (null != s2) && (0 != s2.length())) { characterMap.put(s1, s2); } } } String dotBegin = (String) getAttribute(ATTR_DOT_BEGIN, curi); if (".".equals(dotBegin)) { dotBegin = null; } String dotEnd = (String) getAttribute(ATTR_DOT_END, curi); if (".".equals(dotEnd)) { dotEnd = null; } String tld = (String) getAttribute(ATTR_TOO_LONG_DIRECTORY, curi); if ((null == tld) || (0 == tld.length()) || (-1 != tld.indexOf(File.separatorChar))) { tld = DEFAULT_TOO_LONG_DIRECTORY; } Set<String> underscoreSet = null; StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET, curi); if ((null != us) && (0 != us.size())) { underscoreSet = new HashSet<String>(us.size(), 0.5F); for (String s: us.typesafe()) { if ((null != s) && (0 != s.length())) { underscoreSet.add(s.toLowerCase()); } } } return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(), suffix, baseDir, maxSegLen, maxPathLen, ((Boolean) getAttribute(ATTR_CASE_SENSITIVE, curi)).booleanValue(), (String) getAttribute(ATTR_DIRECTORY_FILE, curi), characterMap, dotBegin, dotEnd, tld, ((Boolean) getAttribute(ATTR_SUFFIX_AT_END, curi)).booleanValue(), underscoreSet); } /** Makes a path in which a resource can be stored. @param curi the URI @param host the host part of the URI, or null if the host name should not be part of the returned path @param port the port part of the URI, or -1 if the port should not be part of the returned path @param uriPath the path part of the URI (must be absolute) @param query the query part of the URI, or null if none @param suffix if non-null, use this as the suffix in preference to any suffix that uriPath might have @param baseDir the path to the starting directory @param maxSegLen the maximum number of characters allowed in one file system path segment (component) @param maxPathLen the maximum number of characters allowed in a file system path @param caseSensitive if true, the file system is assumed to be case-sensitive; otherwise the file system is assumed to be case-insensitive but case-preserving @param dirFile the simple file name to append to a URI path ending in '/' @param characterMap a map from characters (as length-1 String values) in the URI path and query to replacement String values @param dotBegin if non-null, this replaces a '.' at the beginning of a segment @param dotEnd if non-null, this replaces a '.' that appears at the end of a directory name @param tooLongDir if the path length would exceed or be close to exceeding maxPathLen then this simple name is used as a directory under baseDir instead @param suffixAtEnd if true, the suffix is placed at the end of the path, after the query (if any); otherwise, the suffix is placed before the query @param underscoreSet if non-null and a segment, after conversion to lower case, is in this set, then prepend an underscore to the segment @return a path to the file in which to store the resource @throws IOException if a needed directory could not be created @throws IOException if a needed directory is not writeable @throws IOException if a non-directory file exists with the same path as a needed directory */ private URIToFileReturn uriToFile(CrawlURI curi, String host, int port, String uriPath, String query, String suffix, String baseDir, int maxSegLen, int maxPathLen, boolean caseSensitive, String dirFile, Map characterMap, String dotBegin, String dotEnd, String tooLongDir, boolean suffixAtEnd, Set underscoreSet) throws IOException { assert (null == host) || (0 != host.length()); assert 0 != uriPath.length(); assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath; assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath; assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath; assert !uriPath.endsWith("/.") : "uriPath: " + uriPath; assert (null == query) || (-1 == query.indexOf('/')) : "query: " + query; assert (null == suffix) || ((0 != suffix.length()) && (-1 == suffix.indexOf('/'))) : "suffix: " + suffix; assert 0 != baseDir.length(); assert maxSegLen > 2 : "maxSegLen: " + maxSegLen; assert maxPathLen > 1; assert maxPathLen >= maxSegLen : "maxSegLen: " + maxSegLen + " maxPathLen: " + maxPathLen; assert 0 != dirFile.length(); assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile; assert null != characterMap; assert (null == dotBegin) || (0 != dotBegin.length()); assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: " + dotEnd; assert 0 != tooLongDir.length(); assert '/' != tooLongDir.charAt(0) : "tooLongDir: " + tooLongDir; int nSegs = 0; // Number of segments in the URI path. for (int i = 0; uriPath.length() != i; ++i) { if ('/' == uriPath.charAt(i)) { ++nSegs; // Just count slashes. } } assert nSegs > 0 : "uriPath: " + uriPath; PathSegment[] segs = new PathSegment[nSegs]; // The segments. int slashIndex = 0; // Index in uriPath of current /. for (int i = 0; (segs.length - 1) != i; ++i) { int nsi = uriPath.indexOf('/', slashIndex + 1); // Next index. assert nsi > slashIndex : "uriPath: " + uriPath; segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi, maxSegLen, caseSensitive, curi, characterMap, dotBegin, dotEnd, underscoreSet); slashIndex = nsi; } if (slashIndex < (uriPath.length() - 1)) { // There's something after the last /.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -