⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 arcreader.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
            }                        writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),                (int)meta.getLength(), r);        }        // System.out.println(System.currentTimeMillis() - start);    }        /**     * @return an ArchiveReader that will delete a local file on close.  Used     * when we bring Archive files local and need to clean up afterward.     */    public ARCReader getDeleteFileOnCloseReader(final File f) {        final ARCReader d = this;        return new ARCReader() {            private final ARCReader delegate = d;            private File archiveFile = f;                        public void close() throws IOException {                this.delegate.close();                if (this.archiveFile != null) {                    if (archiveFile.exists()) {                        archiveFile.delete();                    }                    this.archiveFile = null;                }            }                        public ArchiveRecord get(long o) throws IOException {                return this.delegate.get(o);            }                        public boolean isDigest() {                return this.delegate.isDigest();            }                        public boolean isStrict() {                return this.delegate.isStrict();            }                        public Iterator<ArchiveRecord> iterator() {                return this.delegate.iterator();            }                        public void setDigest(boolean d) {                this.delegate.setDigest(d);            }                        public void setStrict(boolean s) {                this.delegate.setStrict(s);            }                        public List validate() throws IOException {                return this.delegate.validate();            }            @Override            public ArchiveRecord get() throws IOException {                return this.delegate.get();            }         
   @Override            public String getVersion() {                return this.delegate.getVersion();            }            @Override            public List validate(int noRecords) throws IOException {                return this.delegate.validate(noRecords);            }            @Override            protected ARCRecord createArchiveRecord(InputStream is,                    long offset)            throws IOException {                return this.delegate.createArchiveRecord(is, offset);            }            @Override            protected void gotoEOR(ArchiveRecord record) throws IOException {                this.delegate.gotoEOR(record);            }            @Override            public void dump(boolean compress)            throws IOException, java.text.ParseException {                this.delegate.dump(compress);            }            @Override            public String getDotFileExtension() {                return this.delegate.getDotFileExtension();            }            @Override            public String getFileExtension() {                return this.delegate.getFileExtension();            }        };    }        // Static methods follow.    /**     *     * @param formatter Help formatter instance.     * @param options Usage options.     * @param exitCode Exit code.     */    private static void usage(HelpFormatter formatter, Options options,            int exitCode) {        formatter.printHelp("java org.archive.io.arc.ARCReader" +            " [--digest=true|false] \\\n" +            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",                options);        System.exit(exitCode);    }    /**     * Write out the arcfile.     *      * @param reader     * @param format Format to use outputting.     
* @throws IOException     * @throws java.text.ParseException     */    protected static void output(ARCReader reader, String format)    throws IOException, java.text.ParseException {    	if (!reader.output(format)) {            throw new IOException("Unsupported format: " + format);    	}    }    /**     * Generate a CDX index file for an ARC file.     *     * @param urlOrPath The ARC file to generate a CDX index for     * @throws IOException     * @throws java.text.ParseException     */    public static void createCDXIndexFile(String urlOrPath)    throws IOException, java.text.ParseException {    	ARCReader r = ARCReaderFactory.get(urlOrPath);    	r.setStrict(false);    	r.setParseHttpHeaders(true);    	r.setDigest(true);    	output(r, CDX_FILE);    }    /**     * Command-line interface to ARCReader.     *     * Here is the command-line interface:     * <pre>     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE     *  -h,--help      Prints this message and exits.     *  -o,--offset    Outputs record at this offset into arc file.</pre>     *     * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll     * take care of classpaths and the calling of ARCReader.     *     * <p>Outputs using a pseudo-CDX format as described here:     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX     * Legent</a> and here     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.     * Hash is hard-coded straight SHA-1 hash of content.     *     * @param args Command-line arguments.     * @throws ParseException Failed parse of the command line.     
* @throws IOException If an ARC cannot be opened, read or written out.
     * @throws java.text.ParseException
     */
    public static void main(String [] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p","parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<?> cmdlineArgs = cmdline.getArgList();
        Option [] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help (usage() exits the JVM).
        if (cmdlineArgs.isEmpty()) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch(cmdlineOptions[i].getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset =
                        Long.parseLong(cmdlineOptions[i].getValue());
                    break;

                case 's':
                    strict = true;
                    break;

                case 'p':
                    parse = true;
                    break;

                case 'd':
                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;

                case 'f':
                    // Lower-case with a fixed locale: the default-locale
                    // variant maps 'I' to a dotless i under e.g. Turkish,
                    // so "GZIPDUMP" would fail to match below.
                    format = cmdlineOptions[i].getValue()
                        .toLowerCase(java.util.Locale.ROOT);
                    boolean match = false;
                    // List of supported formats.
                    final String [] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                    for (String supported : supportedFormats) {
                        if (supported.equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    // Cast so the message shows the option letter rather
                    // than its numeric character code.
                    throw new RuntimeException("Unexpected option: " +
                        (char)cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            // Single-record mode: exactly one ARC, read at the offset.
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0),
                offset);
            try {
                arc.setStrict(strict);
                // We must parse headers if we need to skip them.
                if (format.equals(NOHEAD) || format.equals(HEADER)) {
                    parse = true;
                }
                arc.setParseHttpHeaders(parse);
                outputRecord(arc, format);
            } finally {
                // Release the reader whether or not output succeeded.
                arc.close();
            }
        } else {
            // Whole-file mode: write out each passed ARC in turn.
            for (Object arg : cmdlineArgs) {
                String urlOrPath = (String)arg;
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    try {
                        r.setStrict(strict);
                        r.setParseHttpHeaders(parse);
                        r.setDigest(digest);
                        output(r, format);
                    } finally {
                        // Release the reader whether or not output
                        // succeeded.
                        r.close();
                    }
                } catch (RuntimeException e) {
                    // Write out name of file we failed on to help with
                    // debugging, print the stack trace, then exit with a
                    // failure status.
                    // NOTE(review): this comment used to say we keep
                    // going and move on to the next ARC, but the code
                    // exits on the first failure.  Behavior is left
                    // unchanged; confirm which one is intended.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -