📄 arcreader.java

📁 一个基于lucene&heritrix的搜索引擎
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                TextUtils.replaceAll("\t", url, "%09"));        }        headerFields.put(VERSION_FIELD_KEY, v);        headerFields.put(ABSOLUTE_OFFSET_KEY, new  Long(offset));        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);    }        /**     * Fix space in URLs.     * The ARCWriter used to write into the ARC URLs with spaces in them.     * See <a     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]     * crawl.log has URIs with spaces in them</a>.     * This method does fix up on such headers converting all spaces found     * to '%20'.     * @param values List of metadata values.     * @param requiredSize Expected size of resultant values list.     * @return New list if we successfully fixed up values or original if     * fixup failed.     */    protected List<String> fixSpaceInMetadataLine(List<String> values,    		int requiredSize) {        // Do validity check. 3rd from last is a date of 14 numeric        // characters.  The 4th from last is IP, all before the IP        // should be concatenated together with a '%20' joiner.        // In the below, '4' is 4th field from end which has the IP.        if (!(values.size() > requiredSize) || values.size() < 4) {            return values;        }        // Test 3rd field is valid date.        String date = (String)values.get(values.size() - 3);        if (date.length() != 14) {            return values;        }        for (int i = 0; i < date.length(); i++) {            if (!Character.isDigit(date.charAt(i))) {                return values;            }        }        // Test 4th field is valid IP.        String ip = (String)values.get(values.size() - 4);        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);        if (ip == "-" || m.matches()) {            List<String> newValues = new ArrayList<String>(requiredSize);            StringBuffer url = new StringBuffer();            for (int i = 0; i < (values.size() - 4); i++) {                if (i > 0) {                    url.append("%20");                }                url.append(values.get(i));            }             newValues.add(url.toString());            for (int i = values.size() - 4; i < values.size(); i++) {                newValues.add(values.get(i));            }            values =  newValues;        }        return values;    }    	protected boolean isAlignedOnFirstRecord() {		return alignedOnFirstRecord;	}	protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {		this.alignedOnFirstRecord = alignedOnFirstRecord;	}	    /**     * @return Returns the parseHttpHeaders.     */    public boolean isParseHttpHeaders() {        return this.parseHttpHeaders;    }        /**     * @param parse The parseHttpHeaders to set.     */    public void setParseHttpHeaders(boolean parse) {        this.parseHttpHeaders = parse;    }    	public String getFileExtension() {		return ARC_FILE_EXTENSION;	}		public String getDotFileExtension() {		return DOT_ARC_FILE_EXTENSION;	}		protected boolean output(final String format) 	throws IOException, java.text.ParseException {		boolean result = super.output(format);		if(!result && format.equals(NOHEAD)) {			throw new IOException(format +				" only supported for single Records");		}		return result;	}        protected boolean outputRecord(final String format)    throws IOException {    	boolean result = super.outputRecord(format);    	if(!result && format.equals(NOHEAD)) {            // No point digesting if dumping content.            setDigest(false);            ARCRecord r = (ARCRecord)get();            r.skipHttpHeader();            r.dump();            result = true;        }        return result;    }    public void dump(final boolean compress)    throws IOException, java.text.ParseException {        // No point digesting if we're doing a dump.        setDigest(false);        boolean firstRecord = true;        ARCWriter writer = null;        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {            ARCRecord r = (ARCRecord)ii.next();            // We're to dump the arc on stdout.            // Get the first record's data if any.            ARCRecordMetaData meta = r.getMetaData();            if (firstRecord) {                firstRecord = false;                // Get an ARCWriter.                ByteArrayOutputStream baos =                    new ByteArrayOutputStream(r.available());                // This is slow but done only once at top of ARC.                while (r.available() > 0) {                    baos.write(r.read());                }                List<String> listOfMetadata = new ArrayList<String>();                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));                // Assume getArc returns full path to file.  ARCWriter                // or new File will complain if it is otherwise.                writer = new ARCWriter(new AtomicInteger(), System.out,                    new File(meta.getArc()),                    compress, meta.getDate(), listOfMetadata);                continue;            }                        writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),                (int)meta.getLength(), r);        }        // System.out.println(System.currentTimeMillis() - start);    }        // Static methods follow.    /**     *     * @param formatter Help formatter instance.     * @param options Usage options.     * @param exitCode Exit code.     */    private static void usage(HelpFormatter formatter, Options options,            int exitCode) {        formatter.printHelp("java org.archive.io.arc.ARCReader" +            " [--digest=true|false] \\\n" +            " [--format=cdx|cdxfile|dump|gzipdump|nohead]" +            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",                options);        System.exit(exitCode);    }    /**     * Write out the arcfile.     *      * @param reader     * @param format Format to use outputting.     * @throws IOException     * @throws java.text.ParseException     */    protected static void output(ARCReader reader, String format)    throws IOException, java.text.ParseException {    	if (!reader.output(format)) {            throw new IOException("Unsupported format: " + format);    	}    }            /**     * Output passed record using passed format specifier.     * @param r ARCReader instance to output.     * @param format What format to use outputting.     * @throws IOException     */    protected static void outputRecord(final ARCReader r, final String format)    throws IOException {    	if (!r.outputRecord(format)) {            throw new IOException("Unsupported format" +                " (or unsupported on a single record): " + format);    	}    }    /**     * Generate a CDX index file for an ARC file.     *     * @param urlOrPath The ARC file to generate a CDX index for     * @throws IOException     * @throws java.text.ParseException     */    public static void createCDXIndexFile(String urlOrPath)    throws IOException, java.text.ParseException {    	ARCReader r = ARCReaderFactory.get(urlOrPath);    	r.setStrict(false);    	r.setParseHttpHeaders(true);    	r.setDigest(true);    	output(r, CDX_FILE);    }    /**     * Command-line interface to ARCReader.     *     * Here is the command-line interface:     * <pre>     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE     *  -h,--help      Prints this message and exits.     *  -o,--offset    Outputs record at this offset into arc file.</pre>     *     * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll     * take care of classpaths and the calling of ARCReader.     *     * <p>Outputs using a pseudo-CDX format as described here:     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX     * Legent</a> and here     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.     * Hash is hard-coded straight SHA-1 hash of content.     *     * @param args Command-line arguments.     * @throws ParseException Failed parse of the command line.     * @throws IOException     * @throws java.text.ParseException     */    public static void main(String [] args)    throws ParseException, IOException, java.text.ParseException {        Options options = new Options();        options.addOption(new Option("h","help", false,            "Prints this message and exits."));        options.addOption(new Option("o","offset", true,            "Outputs record at this offset into arc file."));        options.addOption(new Option("d","digest", true,            "Pass true|false. Expensive. Default: true (SHA-1)."));        options.addOption(new Option("s","strict", false,            "Strict mode. Fails parse if incorrectly formatted ARC."));        options.addOption(new Option("p","parse", true,        	"Pass true|false to parse HTTP Headers. Default: false."));        options.addOption(new Option("f","format", true,            "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +            "'or 'nohead'. Default: 'cdx'."));        PosixParser parser = new PosixParser();        CommandLine cmdline = parser.parse(options, args, false);        List cmdlineArgs = cmdline.getArgList();        Option [] cmdlineOptions = cmdline.getOptions();        HelpFormatter formatter = new HelpFormatter();        // If no args, print help.        if (cmdlineArgs.size() <= 0) {            usage(formatter, options, 0);        }        // Now look at options passed.        long offset = -1;        boolean digest = false;        boolean strict = false;        boolean parse = false;        String format = CDX;        for (int i = 0; i < cmdlineOptions.length; i++) {            switch(cmdlineOptions[i].getId()) {                case 'h':                    usage(formatter, options, 0);                    break;                case 'o':                    offset =                        Long.parseLong(cmdlineOptions[i].getValue());                    break;                                    case 's':                    strict = true;                    break;                                    case 'p':                	parse = getTrueOrFalse(cmdlineOptions[i].getValue());                    break;                                    case 'd':                	digest = getTrueOrFalse(cmdlineOptions[i].getValue());                    break;                                    case 'f':                    format = cmdlineOptions[i].getValue().toLowerCase();                    boolean match = false;                    // List of supported formats.                    final String [] supportedFormats =                		{CDX, DUMP, GZIP_DUMP, NOHEAD, CDX_FILE};                    for (int ii = 0; ii < supportedFormats.length; ii++) {                        if (supportedFormats[ii].equals(format)) {                            match = true;                            break;                        }                    }                    if (!match) {                        usage(formatter, options, 1);                    }                    break;                default:                    throw new RuntimeException("Unexpected option: " +                        + cmdlineOptions[i].getId());            }        }                if (offset >= 0) {            if (cmdlineArgs.size() != 1) {                System.out.println("Error: Pass one arcfile only.");                usage(formatter, options, 1);            }            ARCReader arc = ARCReaderFactory.get(            	new File((String)cmdlineArgs.get(0)), offset);            arc.setStrict(strict);            arc.setParseHttpHeaders(parse);            outputRecord(arc, format);        } else {            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {                String urlOrPath = (String)i.next();                try {                	ARCReader r = ARCReaderFactory.get(urlOrPath);                	r.setStrict(strict);                	r.setParseHttpHeaders(parse);                	r.setDigest(digest);                    output(r, format);                } catch (RuntimeException e) {                    // Write out name of file we failed on to help with                    // debugging.  Then print stack trace and try to keep                    // going.  We do this for case where we're being fed                    // a bunch of ARCs; just note the bad one and move                    // on to the next.                    System.err.println("Exception processing " + urlOrPath +                        ": " + e.getMessage());                    e.printStackTrace(System.err);                    System.exit(1);                }            }        }    }}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -