📄 arc2wcdx.java
字号:
/* * ARC2WCDX.java * * $Id: ARC2WCDX.java 4903 2007-02-16 01:45:10Z gojomo $ * * Created on Nov 13, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.io.arc;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.util.Date;import java.util.Iterator;import java.util.zip.GZIPOutputStream;import org.apache.commons.httpclient.Header;import org.apache.commons.httpclient.HeaderGroup;import org.apache.commons.httpclient.util.DateParseException;import org.apache.commons.httpclient.util.DateUtil;import org.archive.util.ArchiveUtils;import org.archive.util.SURT;/** * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC. * Writes .wcdx.gz in same directory. * * @author gojomo */public class ARC2WCDX { final public static String WCDX_VERSION="0.1"; public static void main(String[] args) throws IOException { String arcFilename = args[0]; createWcdx(arcFilename); } public static Object[] createWcdx(String arcFilename) throws IOException { ARCReader reader = ARCReaderFactory.get(arcFilename); Object[] retVal = createWcdx(reader); reader.close(); return retVal; } public static Object[] createWcdx(ARCReader reader) { reader.setDigest(true); String wcdxPath = reader.getReaderIdentifier().replaceAll("\\.arc(\\.gz)?$",".wcdx.gz"); File wcdxFile = new File(wcdxPath+".open"); PrintStream writer = null; long count = 0; try { writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile))); // write header: legend + timestamp StringBuilder legend = new StringBuilder(); appendField(legend,"CDX"); appendField(legend,"surt-uri"); appendField(legend,"b"); // ARC timestamp appendField(legend,"http-date"); appendField(legend,"s"); // status code appendField(legend,"m"); // media type appendField(legend,"sha1"); // content sha1 appendField(legend,"g"); // ARC name appendField(legend,"V"); // start offset appendField(legend,"end-offset"); // TODO: implement appendField(legend,"n"); // ARC record length TODO: verify appendField(legend,"http-content-length"); appendField(legend,"http-last-modified"); appendField(legend,"http-expires"); appendField(legend,"http-etag"); appendField(legend,"http-location"); appendField(legend,"e"); // IP appendField(legend,"a"); // original URL // WCDX version+creation time: crude version control appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate()); writer.println(legend.toString()); Iterator iter = reader.iterator(); count = 0; while(iter.hasNext()) { ARCRecord record = (ARCRecord) iter.next(); record.close(); ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader(); Header[] httpHeaders = record.getHttpHeaders(); if(httpHeaders==null) { httpHeaders = new Header[0]; } HeaderGroup hg = new HeaderGroup(); hg.setHeaders(httpHeaders); StringBuilder builder = new StringBuilder(); // SURT-form URI appendField(builder,SURT.fromURI(h.getUrl())); // record timestamp ('b') appendField(builder,h.getDate()); // http header date appendTimeField(builder,hg.getFirstHeader("Date")); // response code ('s') appendField(builder,h.getStatusCode()); // media type ('m') appendField(builder,h.getMimetype()); // content checksum (like 'c', but here Base32 SHA1) appendField(builder,record.getDigestStr()); // arc name ('g') appendField(builder,reader.getFileName()); // compressed start offset ('V') appendField(builder,h.getOffset()); // compressed end offset (?)// appendField(builder,// reader.getInputStream() instanceof RepositionableStream// ? ((GzippedInputStream)reader.getInputStream()).vPosition()// : "-"); // TODO; leave unavail for now appendField(builder, "-"); // uncompressed (declared in ARC headerline) record length appendField(builder,h.getLength()); // http header content-length appendField(builder,hg.getFirstHeader("Content-Length")); // http header mod-date appendTimeField(builder,hg.getFirstHeader("Last-Modified")); // http header expires appendTimeField(builder,hg.getFirstHeader("Expires")); // http header etag appendField(builder,hg.getFirstHeader("ETag")); // http header redirect ('Location' header?) appendField(builder,hg.getFirstHeader("Location")); // ip ('e') appendField(builder,h.getIp()); // original URI appendField(builder,h.getUrl()); // TODO MAYBE - a title from inside content? writer.println(builder.toString()); count++; } wcdxFile.renameTo(new File(wcdxPath)); } catch (IOException e) { // soldier on: but leave '.open' wcdx file as indicator of error if(!wcdxFile.exists()) { try { wcdxFile.createNewFile(); } catch (IOException e1) { // TODO Auto-generated catch block throw new RuntimeException(e1); } } } catch (RuntimeException e) { // soldier on: but leave '.open' wcdx file as indicator of error if(!wcdxFile.exists()) { try { wcdxFile.createNewFile(); } catch (IOException e1) { // TODO Auto-generated catch block throw new RuntimeException(e1); } } } finally { if(writer!=null) { writer.close(); } } return new Object[] {wcdxPath, count}; } protected static void appendField(StringBuilder builder, Object obj) { if(builder.length()>0) { // prepend with delimiter builder.append(' '); } if(obj instanceof Header) { obj = ((Header)obj).getValue().trim(); } builder.append((obj==null||obj.toString().length()==0)?"-":obj); } protected static void appendTimeField(StringBuilder builder, Object obj) { if(builder.length()>0) { // prepend with delimiter builder.append(' '); } if(obj==null) { builder.append("-"); return; } if(obj instanceof Header) { String s = ((Header)obj).getValue().trim(); try { Date date = DateUtil.parseDate(s); String d = ArchiveUtils.get14DigitDate(date); if(d.startsWith("209")) { d = "199"+d.substring(3); } obj = d; } catch (DateParseException e) { builder.append('e'); return; } } builder.append(obj); }}//'wide' CDX//a original url//b timestamp//s resp code//m type//? content md5 (full 'k'? 'c'?//g arc name//V compressed start offset//? compressed length//n? uncompressed length//? mod date//? expires//? server 'date' hdr//? etag//r redirect ('Location'?)//e ip//MAYBE: //? TITLE from HTML or other format?
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -