⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 warcwriterprocessor.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        } finally {            if (writer != null) {            	setTotalBytesWritten(getTotalBytesWritten() +            	     (writer.getPosition() - position));                getPool().returnFile(writer);            }        }        checkBytesWritten();    }        protected URI writeRequest(final WARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();        try {            w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,                namedFields, ris,                curi.getHttpRecorder().getRecordedOutput().getSize());        } finally {            if (ris != null) {                ris.close();            }        }        return uid;    }        protected URI writeResponse(final WARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,                namedFields, ris,                curi.getHttpRecorder().getRecordedInput().getSize());        } finally {            if (ris != null) {                ris.close();            }        }        return baseid;    }        protected URI writeResource(final WARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,                namedFields, ris,                curi.getHttpRecorder().getRecordedInput().getSize());        } finally {            if (ris != null) {                ris.close();            }        }        return baseid;    }        protected URI writeRevisitDigest(final WARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();        revisedLength = revisedLength > 0             ? revisedLength             : curi.getHttpRecorder().getRecordedInput().getSize();        namedFields.addLabelValue(        		HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);        namedFields.addLabelValue(        		HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,                namedFields, ris, revisedLength);        } finally {            if (ris != null) {                ris.close();            }        }        curi.addAnnotation("warcRevisit:digest");         return baseid;    }        protected URI writeRevisitNotModified(final WARCWriter w,            final String timestamp,             final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        namedFields.addLabelValue(        		HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);        // save just enough context to understand basis of not-modified        if(curi.containsKey(A_HTTP_TRANSACTION)) {            HttpMethodBase method =                 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);            saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);            saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,            		HEADER_KEY_LAST_MODIFIED);        }        // truncate to zero-length (all necessary info is above)        namedFields.addLabelValue(HEADER_KEY_TRUNCATED,            NAMED_FIELD_TRUNCATED_VALUE_LENGTH);        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,                namedFields, ris, 0);        } finally {            if (ris !=  null) {                ris.close();            }        }        curi.addAnnotation("warcRevisit:notModified");         return baseid;    }        /**     * Save a header from the given HTTP operation into the      * provider headers under a new name     *      * @param origName header name to get if present     * @param method http operation containing headers     */    protected void saveHeader(String origName, HttpMethodBase method,     		ANVLRecord headers, String newName) {        Header header = method.getResponseHeader(origName);        if(header!=null) {            headers.addLabelValue(newName, header.getValue());        }    }	protected URI writeMetadata(final WARCWriter w,            final String timestamp,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        final URI uid = qualifyRecordID(baseid, TYPE, METADATA);        // Get some metadata from the curi.        // TODO: Get all curi metadata.        // TODO: Use other than ANVL (or rename ANVL as NameValue or use        // RFC822 (commons-httpclient?).        ANVLRecord r = new ANVLRecord();        if (curi.isSeed()) {            r.addLabel("seed");        } else {        	if (curi.forceFetch()) {        		r.addLabel("force-fetch");        	}            r.addLabelValue("via", curi.flattenVia());            r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());            if (curi.containsKey(A_SOURCE_TAG)) {                r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));            }        }        long duration = curi.getFetchDuration();        if(duration>-1) {            r.addLabelValue("fetchTimeMs", Long.toString(duration));        }                // Add outlinks though they are effectively useless without anchor text.        Collection<Link> links = curi.getOutLinks();        if (links != null && links.size() > 0) {            for (Link link: links) {                r.addLabelValue("outlink", link.toString());            }        }                // TODO: Other curi fields to write to metadata.        //         // Credentials        //         // fetch-began-time: 1154569278774        // fetch-completed-time: 1154569281816        //        // Annotations.                byte [] b = r.getUTF8Bytes();        w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,            uid, namedFields, new ByteArrayInputStream(b), b.length);        return uid;    }        protected URI getRecordID() throws IOException {        URI result;        try {            result = GeneratorFactory.getFactory().getRecordID();        } catch (URISyntaxException e) {            throw new IOException(e.toString());        }        return result;    }        protected URI qualifyRecordID(final URI base, final String key,            final String value)    throws IOException {        URI result;        Map<String, String> qualifiers = new HashMap<String, String>(1);        qualifiers.put(key, value);        try {            result = GeneratorFactory.getFactory().                qualifyRecordID(base, qualifiers);        } catch (URISyntaxException e) {            throw new IOException(e.toString());        }        return result;    }          @Override    protected String getFirstrecordStylesheet() {        return "/warcinfobody.xsl";    }    /**     * Return relevant values as header-like fields (here ANVLRecord, but      * spec-defined "application/warc-fields" type when written). Field     * names from from DCMI Terms and the WARC/0.17 specification.     *      * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)     */    @Override    protected String getFirstrecordBody(File orderFile) {        ANVLRecord record = new ANVLRecord(7);        record.addLabelValue("software", "Heritrix/" +                Heritrix.getVersion() + " http://crawler.archive.org");        try {            InetAddress host = InetAddress.getLocalHost();            record.addLabelValue("ip", host.getHostAddress());            record.addLabelValue("hostname", host.getHostName());        } catch (UnknownHostException e) {            logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);        }        record.addLabelValue("format","WARC File Format 0.17");        record.addLabelValue("conformsTo","http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc");        // Get other values from order.xml         try {            Document doc = XmlUtils.getDocument(orderFile);            addIfNotBlank(record,"operator",                    XmlUtils.xpathOrNull(doc,"//meta/operator"));            addIfNotBlank(record,"publisher",                    XmlUtils.xpathOrNull(doc,"//meta/organization"));            addIfNotBlank(record,"audience",                    XmlUtils.xpathOrNull(doc,"//meta/audience"));            addIfNotBlank(record,"isPartOf",                    XmlUtils.xpathOrNull(doc,"//meta/name"));            String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");            if(StringUtils.isNotBlank(rawDate)) {                Date date;                try {                    date = ArchiveUtils.parse14DigitDate(rawDate);                    addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));                } catch (ParseException e) {                    logger.log(Level.WARNING,"obtaining warc created date",e);                }            }            addIfNotBlank(record,"description",                    XmlUtils.xpathOrNull(doc,"//meta/description"));            addIfNotBlank(record,"robots",                    XmlUtils.xpathOrNull(doc,                             "//newObject[@name='robots-honoring-policy']/string[@name='type']"));            addIfNotBlank(record,"http-header-user-agent",                    XmlUtils.xpathOrNull(doc,                             "//map[@name='http-headers']/string[@name='user-agent']"));            addIfNotBlank(record,"http-header-from",                    XmlUtils.xpathOrNull(doc,                             "//map[@name='http-headers']/string[@name='from']"));        } catch (IOException e) {            logger.log(Level.WARNING,"obtaining warcinfo",e);        }         // really ugly to return as string, when it may just be merged with         // a couple other fields at write time, but changing would require         // larger refactoring        return record.toString();    }    protected void addIfNotBlank(ANVLRecord record, String label, String value) {        if(StringUtils.isNotBlank(value)) {            record.addLabelValue(label, value);        }    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -