⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mirrorwriterprocessor.java

📁 Heritrix是一个开源、可扩展的web爬虫项目。Heritrix被设计为严格遵守robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
            segs[segs.length - 1] = new EndSegment(uriPath, slashIndex + 1,                    uriPath.length(), maxSegLen, caseSensitive, curi,                    characterMap, dotBegin, query, suffix, maxPathLen,                    suffixAtEnd);        } else {            // The URI ends with a /.            segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile.length(),                    maxSegLen, caseSensitive, curi, characterMap, null,                    query, suffix, maxPathLen, suffixAtEnd);        }        URIToFileReturn r = dirPath(baseDir, host, port, segs,                                    maxPathLen - maxSegLen);        if (null == r) {            // The path is too long.            // Replace all the segment directories by tooLongDir.            PathSegment endSegment = segs[segs.length - 1];            segs = new PathSegment[2];            segs[0] = new DirSegment(tooLongDir, 0, tooLongDir.length(),                                     maxSegLen, caseSensitive, curi, EMPTY_MAP,                                     null, null, null);            segs[1] = endSegment;            r = dirPath(baseDir, host, port, segs, maxPathLen - maxSegLen);        }        segs[segs.length - 1].addToPath(r);        return r;    }    /**       Copies a resource into a file.       A temporary file is created and then atomically renamed to       the destination file.       This prevents leaving a partial file in case of a crash.       
@param recis the RecordingInputStream that recorded the contents       of the resource       @param dest the destination file       @throws IOException on I/O error       @throws IOException if       the file rename fails    */    private void writeToPath(RecordingInputStream recis, File dest)        throws IOException {        ReplayInputStream replayis = recis.getContentReplayInputStream();        File tf = new File (dest.getPath() + "N");        FileOutputStream fos = new FileOutputStream(tf);        try {            replayis.readFullyTo(fos);        } finally {            fos.close();            replayis.close();        }        if (!tf.renameTo(dest)) {            throw new IOException("Can not rename " + tf.getAbsolutePath()                                  + " to " + dest.getAbsolutePath());        }    }    /**       This class represents one segment (component) of a URI path.       A segment between '/' characters is a directory segment.       The segment after the last '/' is the end segment.    */    abstract class PathSegment {        /**           existsMaybeCaseSensitive return code           for a file that does not exist.        */        protected static final int EXISTS_NOT = 1;        /**           existsMaybeCaseSensitive return code           for a file that exists.           Furthermore, the comparison is case-sensitive.        */        protected static final int EXISTS_EXACT_MATCH = 2;        /**           existsMaybeCaseSensitive return code           for a file that exists, using a case-insensitive comparison.           Furthermore, the file would not exist if the comparison           were case-sensitive.        */        protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;        /** The URI, for logging and error reporting.*/        protected CrawlURI curi;        /**           The main part of this segment.           For a directory segment, that's all there is.           
For an end segment, it's the part of the URI after the last '/'           up to but not including the '.' before the suffix (if any).        */        protected LumpyString mainPart = null;        /**           The maximum number of characters allowed           in one file system path segment.           A URI segment can potentially be much longer,           but we'll trim it to this.        */        protected int maxSegLen;        /** If true, the file system is assumed to be            case-sensitive; otherwise the file system is assumed to be            case-insensitive.        */        private boolean caseSensitive;        /**           Creates a new PathSegment.           @param maxSegLen the maximum number of characters           allowed in one path segment           @param caseSensitive if true, the file system is assumed to be           case-sensitive; otherwise the file system is assumed to be           case-insensitive           @param curi the URI           @throws IllegalArgumentException if           maxSegLen is too small        */        PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {            if (maxSegLen < 2) {                throw new IllegalArgumentException("maxSegLen: " + maxSegLen);            }            this.maxSegLen = maxSegLen;            this.caseSensitive = caseSensitive;            this.curi = curi;        }        /**           Adds this segment to a file path.           This is the key method of this class.           It extends the given path by one segment,           named to obey all constraints.           A new directory is created if necessary.           
@param currentPath the current path, to which this segment is added           @throws IOException           if a needed directory could not be created           @throws IOException           if a needed directory is not writeable        */        abstract void addToPath(URIToFileReturn currentPath) throws IOException;        /**           Checks if a file (including directories) exists.           @param fsf the directory containing the file to be checked           @param segStr the simple file or directory name           @param check the file or directory for which to check           @return EXISTS_NOT if check does not exist,           EXISTS_EXACT_MATCH if check exists with a name that matches           (case-sensitive) segStr, and           EXISTS_CASE_INSENSITIVE_MATCH if check exists           with a name that matches           segStr using a case-insensitive match but not using a           case-sensitive match        */        protected int existsMaybeCaseSensitive(File fsf, String segStr,                                               File check) {            if (caseSensitive) {                return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;            }            if (!check.exists()) {                return EXISTS_NOT;            }            /*              The JVM says the file exists, but the file system is assumed to be              case-insensitive, so do we have an exact match or just a              case-insensitive match?  We get an array of all the              file names that match (case-insensitive) the one we're              checking, then we can look for a case-sensitive match.            
*/            String[] fna = fsf.list(new CaseInsensitiveFilenameFilter(segStr));            for (int i = 0; fna.length != i; ++i) {                if (segStr.equals(fna[i])) {                  return EXISTS_EXACT_MATCH;                }            }            return EXISTS_CASE_INSENSITIVE_MATCH;        }        /**           This class implements a FilenameFilter that matches           by name, ignoring case.        */        class CaseInsensitiveFilenameFilter implements FilenameFilter {            /** The file name we're looking for. */            private String target;            /**               Creates a CaseInsensitiveFilenameFilter.               @param target the target file name               @throws IllegalArgumentException if               target is null or empty.            */            CaseInsensitiveFilenameFilter(String target) {                if (null == target) {                    throw new IllegalArgumentException("target null");                }                if (0 == target.length()) {                    throw new IllegalArgumentException("target empty");                }                this.target = target;            }            public boolean accept(File dir, String name) {                return target.equalsIgnoreCase(name);            }        }    }    /**       This class represents one directory segment (component) of a URI path.    */    class DirSegment extends PathSegment {        /** If a segment name is in this set, prepend an underscore.*/        private Set underscoreSet;        /**           Creates a DirSegment.           
@param uriPath the path part of the URI           @param beginIndex the beginning index, inclusive, of the substring           of uriPath to be used           @param endIndex the ending index, exclusive, of the substring           of uriPath to be used           @param maxSegLen the maximum number of characters allowed in one           file system path segment (component)           @param caseSensitive if true, the file system is assumed to be           case-sensitive; otherwise the file system is assumed to be           case-insensitive but case-preserving           @param curi the URI           @param characterMap a map from characters           (as length-1 String values) in           the URI path and query to replacement String values           @param dotBegin if non-null, this replaces a '.' at           the beginning of the directory name           @param dotEnd if non-null, this replaces a '.'           that appears at the end of a directory name           @param underscoreSet if non-null and a segment, after conversion           to lower case, is in this set, then prepend an underscore           to the segment           @throws IllegalArgumentException if           beginIndex is negative.           @throws IllegalArgumentException if           endIndex is less than beginIndex.           @throws IllegalArgumentException if           maxSegLen is too small.        */        DirSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,                   boolean caseSensitive, CrawlURI curi, Map characterMap,                   String dotBegin, String dotEnd, Set underscoreSet) {            super(maxSegLen, caseSensitive, curi);            mainPart = new LumpyString(uriPath, beginIndex, endIndex,                                       (null == dotEnd) ? 
0 : dotEnd.length(),                                       this.maxSegLen, characterMap, dotBegin);            if (null != dotEnd) {                // We might get a segment like /VeryLong............../                // so we have to loop to guarantee the segment doesn't                // end with a dot.                int dl = dotEnd.length();                while (mainPart.endsWith('.')) {                    // Chop off the dot at the end.                    mainPart.trimToMax(mainPart.length() - 1);                    if ((mainPart.length() + dl) <= this.maxSegLen) {                        mainPart.append(dotEnd);                    }                }            }            this.underscoreSet = underscoreSet;        }        void addToPath(URIToFileReturn currentPath) throws IOException {            NumberFormat nf = null;            int startLen = mainPart.length(); // Starting length.            for (int i = 0; ; ++i) {                if (0 != i) {                    // Try to create a unique file name by appending a                    // number.                    
if (null == nf) {                        nf = NumberFormat.getIntegerInstance();                    }                    String ending = nf.format(i);                    mainPart.trimToMax(Math.min(startLen,                                                maxSegLen - ending.length()));                    mainPart.append(ending);                }                String segStr = mainPart.toString();                if ((null != underscoreSet)                        && underscoreSet.contains(segStr.toLowerCase())) {                    mainPart.prepend('_');                    ++startLen;                    mainPart.trimToMax(maxSegLen);                    segStr = mainPart.toString();                }                File fsf = currentPath.getFile();                File f = new File(fsf, segStr);                int er = existsMaybeCaseSensitive(fsf, segStr, f);                switch (er) {                case EXISTS_NOT:                    if (!f.mkdir()) {                        throw new IOException("Can not mkdir "                                              + f.getAbsolutePath());                    }                    currentPath.append(f, segStr);                    return; // Created new directory.                case EXISTS_EXACT_MATCH:                    if (f.isDirectory()) {                        if (!f.canWrite()) {                            throw new IOException("Directory "                                                  + f.getAbsolutePath()                                                  + " not writeable.");                        }                        /*                          A writeable directory already exists.                          Assume it's the one we want.                          This assumption fails for cases like                          http://foo.com/a*256/b.html                          followed by

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -