📄 arcreader.java
" some of what was read: " + buffer.substring(0, Math.min(buffer.length(), 256))); } if (c == LINE_SEPARATOR) { if (buffer.length() == 0) { // Empty line at start of buffer. Skip it and try again. continue; } if (list != null) { list.add(buffer.toString()); } // LOOP TERMINATION. break; } else if (c == HEADER_FIELD_SEPARATOR) { if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) { // Early ARCs sometimes had multiple spaces between fields. continue; } if (list != null) { list.add(buffer.toString()); } // reset to empty buffer.setLength(0); } else { buffer.append((char)c); } } // List must have at least 3 elements in it and no more than 10. If // it has other than this, then bogus parse. if (list != null && (list.size() < 3 || list.size() > 100)) { throw new IOException("Unparseable header line: " + list); } return read; } /** * Compute metadata fields. * * Here we check the meta field has right number of items in it. * * @param keys Keys to use composing headerFields map. * @param values Values to set into the headerFields map. * @param v The version of this ARC file. * @param offset Offset into arc file. * * @return Metadata structure for this record. * * @exception IOException If no. of keys doesn't match no. of values. */ private ARCRecordMetaData computeMetaData(List<String> keys, List<String> values, String v, long offset) throws IOException { if (keys.size() != values.size()) { List<String> originalValues = values; if (!isStrict()) { values = fixSpaceInURL(values, keys.size()); // If values still doesn't match key size, try and do // further repair. if (keys.size() != values.size()) { // Early ARCs had a space in mimetype. if (values.size() == (keys.size() + 1) && values.get(4).toLowerCase().startsWith("charset=")) { List<String> nuvalues = new ArrayList<String>(keys.size()); nuvalues.add(0, values.get(0)); nuvalues.add(1, values.get(1)); nuvalues.add(2, values.get(2)); nuvalues.add(3, values.get(3) + values.get(4)); nuvalues.add(4, values.get(5)); values = nuvalues; } else if((values.size() + 1) == keys.size() && isLegitimateIPValue(values.get(1)) && isDate(values.get(2)) && isNumber(values.get(3))) { // Mimetype is empty. List<String> nuvalues = new ArrayList<String>(keys.size()); nuvalues.add(0, values.get(0)); nuvalues.add(1, values.get(1)); nuvalues.add(2, values.get(2)); nuvalues.add(3, "-"); nuvalues.add(4, values.get(3)); values = nuvalues; } } } if (keys.size() != values.size()) { throw new IOException("Size of field name keys does" + " not match count of field values: " + values); } // Note that field was fixed on stderr. logStdErr(Level.WARNING, "Fixed spaces in metadata line at " + "offset " + offset + " Original: " + originalValues + ", New: " + values); } Map<Object, Object> headerFields = new HashMap<Object, Object>(keys.size() + 2); for (int i = 0; i < keys.size(); i++) { headerFields.put(keys.get(i), values.get(i)); } // Add a check for tabs in URLs. If any, replace with '%09'. // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966, // [ 1010966 ] crawl.log has URIs with spaces in them. 
    String url = (String)headerFields.get(URL_FIELD_KEY);
    if (url != null && url.indexOf('\t') >= 0) {
        headerFields.put(URL_FIELD_KEY,
            TextUtils.replaceAll("\t", url, "%09"));
    }
    headerFields.put(VERSION_FIELD_KEY, v);
    headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
    return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
}

protected boolean isDate(final String date) {
    if (date.length() != 14) {
        return false;
    }
    return isNumber(date);
}

protected boolean isNumber(final String n) {
    for (int i = 0; i < n.length(); i++) {
        if (!Character.isDigit(n.charAt(i))) {
            return false;
        }
    }
    return true;
}

protected boolean isLegitimateIPValue(final String ip) {
    if ("-".equals(ip)) {
        return true;
    }
    Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
    return m != null && m.matches();
}

/**
 * Fix spaces in URLs.
 * The ARCWriter used to write URLs with spaces in them into the ARC.
 * See <a
 * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
 * crawl.log has URIs with spaces in them</a>.
 * This method fixes up such headers, converting all spaces found
 * to '%20'.
 * @param values List of metadata values.
 * @param requiredSize Expected size of the resultant values list.
 * @return New list if we successfully fixed up values, or the original
 * if the fixup failed.
 */
protected List<String> fixSpaceInURL(List<String> values, int requiredSize) {
    // Do a validity check. The 3rd field from the end is a date of 14
    // numeric characters. The 4th field from the end is the IP;
    // everything before the IP should be concatenated together with a
    // '%20' joiner.
    if (!(values.size() > requiredSize) || values.size() < 4) {
        return values;
    }
    // Test the 3rd field from the end is a valid date.
    if (!isDate(values.get(values.size() - 3))) {
        return values;
    }
    // Test the 4th field from the end is a valid IP.
    if (!isLegitimateIPValue(values.get(values.size() - 4))) {
        return values;
    }
    List<String> newValues = new ArrayList<String>(requiredSize);
    StringBuffer url = new StringBuffer();
    for (int i = 0; i < (values.size() - 4); i++) {
        if (i > 0) {
            url.append("%20");
        }
        url.append(values.get(i));
    }
    newValues.add(url.toString());
    for (int i = values.size() - 4; i < values.size(); i++) {
        newValues.add(values.get(i));
    }
    return newValues;
}
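// Worked example (hypothetical values) of the fixup above: a header
// line for a URL containing one space splits into six values,
//
//   [http://example.com/a, b.html, 192.0.2.1, 20040101000000,
//    text/html, 1234]
//
// fixSpaceInURL(values, 5) rejoins the leading fields with '%20',
// yielding the expected five values:
//
//   [http://example.com/a%20b.html, 192.0.2.1, 20040101000000,
//    text/html, 1234]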
protected boolean isAlignedOnFirstRecord() {
    return alignedOnFirstRecord;
}

protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
    this.alignedOnFirstRecord = alignedOnFirstRecord;
}

/**
 * @return Returns the parseHttpHeaders.
 */
public boolean isParseHttpHeaders() {
    return this.parseHttpHeaders;
}

/**
 * @param parse The parseHttpHeaders to set.
 */
public void setParseHttpHeaders(boolean parse) {
    this.parseHttpHeaders = parse;
}

public String getFileExtension() {
    return ARC_FILE_EXTENSION;
}

public String getDotFileExtension() {
    return DOT_ARC_FILE_EXTENSION;
}

protected boolean output(final String format)
throws IOException, java.text.ParseException {
    boolean result = super.output(format);
    if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
        throw new IOException(format +
            " format only supported for single Records");
    }
    return result;
}

public boolean outputRecord(final String format) throws IOException {
    boolean result = super.outputRecord(format);
    if (result) {
        return result;
    }
    if (format.equals(NOHEAD)) {
        // No point digesting if dumping content.
        setDigest(false);
        ARCRecord r = (ARCRecord)get();
        r.skipHttpHeader();
        r.dump();
        result = true;
    } else if (format.equals(HEADER)) {
        // No point digesting if dumping content.
        setDigest(false);
        ARCRecord r = (ARCRecord)get();
        r.dumpHttpHeader();
        result = true;
    }
    return result;
}

public void dump(final boolean compress)
throws IOException, java.text.ParseException {
    // No point digesting if we're doing a dump.
    setDigest(false);
    boolean firstRecord = true;
    ARCWriter writer = null;
    for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
        ARCRecord r = (ARCRecord)ii.next();
        // We're to dump the ARC on stdout.
        // Get the first record's data if any.
        ARCRecordMetaData meta = r.getMetaData();
        if (firstRecord) {
            firstRecord = false;
            // Get an ARCWriter.
            ByteArrayOutputStream baos =
                new ByteArrayOutputStream(r.available());
            // This is slow but done only once at the top of the ARC.
            while (r.available() > 0) {
                baos.write(r.read());
            }
            List<String> listOfMetadata = new ArrayList<String>();
            listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
            // Assume getArc returns the full path to the file. ARCWriter
            // or new File will complain if it is otherwise.
            writer = new ARCWriter(new AtomicInteger(), System.out,
                new File(meta.getArc()), compress, meta.getDate(),
                listOfMetadata);
            continue;
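The listing breaks off above, inside dump(). For orientation, here is a minimal sketch of how this reader is typically driven. It assumes the companion ARCReaderFactory class from the same org.archive.io.arc package (not part of this listing) and the usual Heritrix accessors on ARCRecordMetaData; treat the names as illustrative rather than authoritative.

import java.io.File;
import java.util.Iterator;

import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;

public class ArcReaderDemo {
    public static void main(String[] args) throws Exception {
        // Open a (possibly gzipped) ARC file; the factory picks the
        // appropriate concrete reader.
        ARCReader reader = ARCReaderFactory.get(new File(args[0]));
        try {
            // Parse each record's HTTP headers (setter shown in the
            // listing above).
            reader.setParseHttpHeaders(true);
            for (Iterator<ArchiveRecord> i = reader.iterator();
                    i.hasNext();) {
                ARCRecord record = (ARCRecord)i.next();
                // The metadata here was assembled by computeMetaData().
                System.out.println(record.getMetaData().getUrl() + " " +
                    record.getMetaData().getMimetype());
            }
        } finally {
            reader.close();
        }
    }
}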