📄 arcreader.java
字号:
TextUtils.replaceAll("\t", url, "%09"));
        }
        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }

    /**
     * Fix space in URLs.
     * The ARCWriter used to write into the ARC URLs with spaces in them.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method does fix up on such headers converting all spaces found
     * to '%20'.
     *
     * <p>The fix-up is only attempted when the line has more fields than
     * expected. It is applied when the 3rd field from the end is a
     * 14-digit date and the 4th field from the end is either "-" or an
     * IPv4 dotted quad; every field before the IP is then joined back
     * into one URL using '%20'.
     *
     * @param values List of metadata values.
     * @param requiredSize Expected size of resultant values list.
     * @return New list if we successfully fixed up values or original if
     * fixup failed.
     */
    protected List<String> fixSpaceInMetadataLine(List<String> values,
            int requiredSize) {
        final int size = values.size();
        // Nothing to repair unless there are surplus fields, and we need at
        // least four fields to be able to locate the IP (4th from the end).
        if (size <= requiredSize || size < 4) {
            return values;
        }

        // The 3rd field from the end must be a date of 14 numeric characters.
        String date = values.get(size - 3);
        if (date.length() != 14) {
            return values;
        }
        for (int i = 0; i < date.length(); i++) {
            if (!Character.isDigit(date.charAt(i))) {
                return values;
            }
        }

        // The 4th field from the end must be the IP: "-" or an IPv4 quad.
        // Compare with equals(): a field parsed out of a header line is never
        // the same object as the "-" literal, so '==' could not match it.
        final int ipIndex = size - 4;
        String ip = values.get(ipIndex);
        if (!"-".equals(ip)
                && !InetAddressUtil.IPV4_QUADS.matcher(ip).matches()) {
            return values;
        }

        // Everything before the IP is the URL that got split on its spaces;
        // glue the pieces back together with '%20'.
        StringBuilder url = new StringBuilder();
        for (int i = 0; i < ipIndex; i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }

        List<String> newValues = new ArrayList<String>(requiredSize);
        newValues.add(url.toString());
        // Carry over the trailing four fields (IP onwards) unchanged.
        for (int i = ipIndex; i < size; i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }

    /**
     * @return Current value of the alignedOnFirstRecord flag.
     */
    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    /**
     * @param alignedOnFirstRecord New value for the alignedOnFirstRecord
     * flag.
     */
    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    /**
     * @return The ARC file extension constant (ARC_FILE_EXTENSION).
     */
    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    /**
     * @return The ARC file extension constant that includes the dot
     * (DOT_ARC_FILE_EXTENSION).
     */
    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    /**
     * Output this reader's content using the passed format.
     * Delegates to the superclass; the NOHEAD format is rejected here
     * because it is only supported when outputting a single record.
     *
     * @param format Format to use outputting.
     * @return True if the superclass handled the format.
     * @throws IOException If the format is NOHEAD and the superclass did
     * not handle it, or on failure in the superclass.
     * @throws java.text.ParseException
     */
    protected boolean output(final String format)
            throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && format.equals(NOHEAD)) {
            throw new IOException(format
                + " only supported for single Records");
        }
        return result;
    }

    /**
     * Output the current record using the passed format.
     * Delegates to the superclass first; if it did not handle the format
     * and the format is NOHEAD, the record is dumped with its HTTP header
     * skipped.
     *
     * @param format Format to use outputting.
     * @return True if the format was handled.
     * @throws IOException
     */
    protected boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (!result && format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        }
        return result;
    }

    /**
     * Dump this ARC to stdout by rewriting every record through an
     * ARCWriter.
     * The body of the first record is read into memory and handed to the
     * writer as its metadata; every subsequent record is written through
     * that writer.
     *
     * @param compress Passed through to the ARCWriter constructor.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public void dump(final boolean compress)
            throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord) ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()), compress, meta.getDate(),
                    listOfMetadata);
                continue;
            }

            // NOTE(review): the record length is narrowed to int here;
            // confirm records over Integer.MAX_VALUE bytes cannot occur.
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int) meta.getLength(), r);
        }
        // System.out.println(System.currentTimeMillis() - start);
    }

    // Static methods follow.

    /**
     * Print command-line usage and exit the JVM.
     *
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader"
            + " [--digest=true|false] \\\n"
            + " [--format=cdx|cdxfile|dump|gzipdump|nohead]"
            + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
            options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     *
     * @param reader Reader to output.
     * @param format Format to use outputting.
     * @throws IOException If the reader does not support the format.
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format)
            throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Output passed record using passed format specifier.
     *
     * @param r ARCReader instance to output.
     * @param format What format to use outputting.
     * @throws IOException If the format is unsupported (or unsupported on
     * a single record).
     */
    protected static void outputRecord(final ARCReader r, final String format)
            throws IOException {
        if (!r.outputRecord(format)) {
            throw new IOException("Unsupported format"
                + " (or unsupported on a single record): " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     * The reader is run non-strict, with HTTP header parsing and digesting
     * both switched on.
     *
     * @param urlOrPath The ARC file to generate a CDX index for
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath)
            throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--digest=true|false]
     *            [--format=cdx|cdxfile|dump|gzipdump|nohead] [--offset=#]
     *            [--strict] [--parse] ARC_FILE|ARC_URL
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.
     *  -d,--digest    Pass true|false.
     *  -s,--strict    Strict mode.
     *  -p,--parse     Pass true|false to parse HTTP Headers.
     *  -f,--format    Output format.</pre>
     *
     * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legend</a> and here
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * Hash is hard-coded straight SHA-1 hash of content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String [] args)
            throws ParseException, IOException, java.text.ParseException {
        Options options = new Options();
        options.addOption(new Option("h","help", false,
            "Prints this message and exits."));
        options.addOption(new Option("o","offset", true,
            "Outputs record at this offset into arc file."));
        // NOTE(review): this help text advertises a default of true but the
        // local 'digest' flag below starts out false -- confirm which one is
        // intended.
        options.addOption(new Option("d","digest", true,
            "Pass true|false. Expensive. Default: true (SHA-1)."));
        options.addOption(new Option("s","strict", false,
            "Strict mode. Fails parse if incorrectly formatted ARC."));
        options.addOption(new Option("p","parse", true,
            "Pass true|false to parse HTTP Headers. Default: false."));
        options.addOption(new Option("f","format", true,
            "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +
            "'or 'nohead'. Default: 'cdx'."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option [] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            final Option option = cmdlineOptions[i];
            switch (option.getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset = Long.parseLong(option.getValue());
                    break;

                case 's':
                    strict = true;
                    break;

                case 'p':
                    parse = getTrueOrFalse(option.getValue());
                    break;

                case 'd':
                    digest = getTrueOrFalse(option.getValue());
                    break;

                case 'f':
                    // Lower-case with a fixed locale so the comparison
                    // against the format constants does not depend on the
                    // platform default locale (e.g. Turkish dotless i).
                    format = option.getValue()
                        .toLowerCase(java.util.Locale.ENGLISH);
                    boolean match = false;
                    // List of supported formats.
                    final String [] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, NOHEAD, CDX_FILE};
                    for (int ii = 0; ii < supportedFormats.length; ii++) {
                        if (supportedFormats[ii].equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    throw new RuntimeException("Unexpected option: "
                        + option.getId());
            }
        }

        if (offset >= 0) {
            // Single-record mode: exactly one ARC file must be named.
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get(
                new File((String) cmdlineArgs.get(0)), offset);
            arc.setStrict(strict);
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String) i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out name of file we failed on to help with
                    // debugging, print the stack trace, then exit with a
                    // non-zero status. Remaining ARCs are NOT processed.
                    System.err.println("Exception processing " + urlOrPath
                        + ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -