📄 mirrorwriterprocessor.java

📁 Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
                          http://foo.com/a*256z/b.html                          where a*256 means a sequence of the maximum allowed                          number of "a"s.                        */                        currentPath.append(f, segStr);                        return;                    }                    /*                      A segment already exists but isn't a directory.                      This could arise from, for example,                      http://foo.com/a*256                      followed by                      http://foo.com/a*256b/b.html                      We need to find a directory we created before in this                      situation, or make a new directory with a unique name.                      Going around the loop should eventually do that.                    */                    break;                case EXISTS_CASE_INSENSITIVE_MATCH:                    /*                      A segment already exists that's a case-insensitive match                      but not an exact match.  It may or may not be a directory.                      This could arise, on a case-insensitive, case-preserving                      file system (such as Macintosh HFS+).  For example,                      http://foo.com/bar/z.html                      followed by                      http://foo.com/BAR/z.html                      would do it.  We want bar and BAR to turn into different                      directories.                      Going around the loop should eventually do that.                    */                    break;                default:                    throw new IllegalStateException("Code: " + er);                }            }        }    }    /**       This class represents the last segment (component) of a URI path.    */    class EndSegment extends PathSegment {        /**           The number of characters in the path up to this EndSegment,           including the final File.separatorChar.        */        private int dirPathLen;        /**           The maximum number of characters allowed in a file path, minus 1.           The extra 1 is reserved for temporarily appending           a character so an existing file can be replaced atomically,           for example, by writing           <code>foo.htmlN</code>           and then renaming it to           <code>foo.html</code>.        */        private int maxPathLen;        /** The query part of the URI, or null if none.*/        private LumpyString query = null;        /**           The suffix, or null if none.           This isn't a LumpyString because we'd only trim a suffix           if space were very, very tight.        */        private String suffix = null;        /**           True if the suffix goes at the end, after the query.           False if the suffix goes before the query.        */        private boolean suffixAtEnd;        /** Appended to mainPart if necessary to create a unique file name.*/        private String uniquePart = null;        /**           Creates an EndSegment.           @param uriPath the path part of the URI           @param beginIndex the beginning index, inclusive, of the substring           of uriPath to be used           @param endIndex the ending index, exclusive, of the substring           of uriPath to be used           @param maxSegLen the maximum number of characters allowed in one           file system path segment (component)           @param caseSensitive if true, the file system is assumed to be           case-sensitive; otherwise the file system is assumed to be           case-insensitive but case-preserving           @param curi the URI           @param characterMap maps characters (as length-1 String values) in           the URI path and query to replacement String values           @param dotBegin if non-null, this replaces a '.' at           the beginning of the segment           @param query the query part of the URI, or null if none           @param suffix if non-null, use this as the suffix in preference to           any suffix that uriPath might have           @param maxPathLen the maximum number of characters allowed in a           file system path           @param suffixAtEnd if true, the suffix is placed at the end of the           path, after the query (if any); otherwise, the suffix is placed           before the query           @throws IllegalArgumentException if           beginIndex is negative.           @throws IllegalArgumentException if           endIndex is less than beginIndex.           @throws IllegalArgumentException if           maxSegLen is too small.        */        EndSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,                   boolean caseSensitive, CrawlURI curi, Map characterMap,                   String dotBegin, String query, String suffix,                   int maxPathLen, boolean suffixAtEnd) {            super(maxSegLen - 1, caseSensitive, curi);            int mpe = endIndex; // endIndex for the main part (no suffix).            int ldi = uriPath.lastIndexOf('.'); // Index of last dot.            if ((ldi > 0) && (ldi < (endIndex - 1)) && (ldi > beginIndex)) {                mpe = ldi; // uriPath has a suffix.            }            this.suffix = suffix;            if ((null == this.suffix) && (mpe < (endIndex - 1))) {                // There's no replacement suffix and uriPath has a suffix.                // Run it through a LumpyString to do the character mapping.                LumpyString ls = new LumpyString(uriPath, mpe + 1, endIndex, 0,                                                 this.maxSegLen, characterMap,                                                 null);                this.suffix = ls.toString();            }            int pad = ((null == this.suffix) ? 0 : (1 + this.suffix.length()))                + ((null == query) ? 0 : query.length());            mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,                                       this.maxSegLen, characterMap, dotBegin);            this.maxPathLen = maxPathLen - 1;            if (null != query) {                this.query = new LumpyString(query, 0, query.length(), 0,                                             this.maxSegLen, characterMap,                                             null);            }            this.suffixAtEnd = suffixAtEnd;        }        void addToPath(URIToFileReturn currentPath) {            File fsf = currentPath.getFile();            NumberFormat nf = null;            dirPathLen = 1 + fsf.getPath().length();            for (int i = 0; ; ++i) {                if (0 != i) {                    if (null == nf) {                        nf = NumberFormat.getIntegerInstance();                    }                    uniquePart = nf.format(i);                }                trimWithPadding((null == uniquePart) ? 0 : uniquePart.length());                String segStr = joinParts(); // This EndSegment as a String.                File f = new File(fsf, segStr);                // Code for whether file exists.                int er = existsMaybeCaseSensitive(fsf, segStr, f);                switch (er) {                case EXISTS_NOT:                    currentPath.append(f, segStr);                    return;                case EXISTS_EXACT_MATCH:                    if (f.isFile()) {                        currentPath.append(f, segStr);                        return;                    }                    /*                      A file already exists but isn't an ordinary file.                      It might be a directory, special file, named pipe,                      whatever.                      We need to find an unused file name,                      or an ordinary file.                      Going around the loop should eventually do that.                    */                    break;                case EXISTS_CASE_INSENSITIVE_MATCH:                    /*                      A file already exists that's a case-insensitive match                      but not an exact match.                      This could arise, on a case-insensitive, case-preserving                      file system (such as Macintosh HFS+).  For example,                      http://foo.com/files.zip                      followed by                      http://foo.com/FILES.ZIP                      would do it.  We want files.zip and FILES.ZIP to turn into                      different files. Going around the loop should eventually                      do that.                    */                    break;                default:                    throw new IllegalStateException("Code: " + er);                }            }        }        /**           Creates a simple file name from the parts of this EndSegment.           @return a simple file name constructed from the main part,           unique part, query, and suffix        */        private String joinParts() {            StringBuffer sb = new StringBuffer(length());            sb.append(mainPart.asStringBuffer());            if (null != uniquePart) {                sb.append(uniquePart);            }            if (suffixAtEnd) {                if (null != query) {                    sb.append(query);                }                if (null != suffix) {                    sb.append('.');                    sb.append(suffix);                }            } else {                if (null != suffix) {                    sb.append('.');                    sb.append(suffix);                }                if (null != query) {                    sb.append(query);                }            }            return sb.toString();        }        /**           Gets the number of available character positions.           If this EndSegment were converted to a path,           it would have a path length and a segment length.           There are two constraints: maxSegLen and maxPathLen.           The number of character positions available before bumping           into the lower constraint is computed.           @return the number of available positions, which may be negative        */        private int lenAvail() {            int len = length();            return Math.min(maxSegLen - len, maxPathLen - dirPathLen - len);        }        /**           Gets the length of the simple file name that would be           created for this EndSegment.           @return the length        */        private int length() {            int r = mainPart.length(); // Return value.            if (null != uniquePart) {                r += uniquePart.length();            }            if (null != query) {                r += query.length();            }            if (null != suffix) {                r += 1 + suffix.length(); // 1 for the '.'            }            return r;        }        /**           Trims this EndSegment so a given number of characters are available.           After trimming, there will be room for at least           padding more characters before one of the constraints is           encountered.           The choices for trimming, in priority order, are:           <ol>           <li>Shorten the query.</li>           <li>Remove the query.</li>           <li>Shorten the main part.</li>           <li>Shorten the suffix.</li>           </ol>           @param padding the number of character positions that need to be           available           @throws IllegalStateException           if it's impossible to trim enough        */        private void trimWithPadding(int padding) {            assert padding >= 0 : "padding: " + padding;            int la = lenAvail();            if (la >= padding) {                return;            }            // We need space for (padding - la) characters.            // la might be negative.            if (null != query) {                query.trimToMax(Math.max(0, query.length() - (padding - la)));                if (0 == query.length()) {                    query = null;                }                la = lenAvail();                if (la >= padding) {                    return;                }            }            mainPart.trimToMax(Math.max(1, mainPart.length() - (padding - la)));            la = lenAvail();            if (la >= padding) {                return;            }            if (null != suffix) {                suffix = suffix.substring(0, Math.max(1, suffix.length()                                                      - (padding - la)));                la = lenAvail();                if (la >= padding) {                    return;                }            }            throw new IllegalStateException("Can not trim " + curi.toString());        }    }    /**       This class represents a dynamically growable string
💿 文件大小 10016 K
👤 上传用户 qqpp2q
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#Heritrix #robots #META #web
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -