⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 experimentalwarcwriter.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
    protected void writeRecord(final String type, final String url,    		final String create14DigitDate, final String mimetype,    		final URI recordId, ANVLRecord xtraHeaders,            final InputStream contentStream, final long contentLength)    throws IOException {    	if (!TYPES_LIST.contains(type)) {    		throw new IllegalArgumentException("Unknown record type: " + type);    	}    	if (contentLength == 0 &&                (xtraHeaders == null || xtraHeaders.size() <= 0)) {    		throw new IllegalArgumentException("Cannot write record " +    		    "of content-length zero and base headers only.");    	}    	        preWriteRecordTasks();        try {            final String header = createRecordHeader(type, url,            	create14DigitDate, mimetype, recordId, xtraHeaders,            	contentLength);            // TODO: Revisit endcoding of header.            write(header.getBytes(WARC_HEADER_ENCODING));                        if (contentStream != null && contentLength > 0) {                // Write out the header/body separator.                write(CRLF_BYTES); // TODO: should this be written even for zero-length?            	readToLimitFrom(contentStream, contentLength, this.readbuffer);            }                        // Write out the two blank lines at end of all records.            // TODO: Why? Messes up skipping through file. Also not in grammar.            write(CRLF_BYTES);            write(CRLF_BYTES);        } finally {            postWriteRecordTasks();        }    }        protected URI generateRecordId(final Map<String, String> qualifiers)    throws IOException {    	URI rid = null;    	try {    		rid = GeneratorFactory.getFactory().    			getQualifiedRecordID(qualifiers);    	} catch (URISyntaxException e) {    		// Convert to IOE so can let it out.    		
throw new IOException(e.getMessage());    	}    	return rid;    }        protected URI generateRecordId(final String key, final String value)    throws IOException {    	URI rid = null;    	try {    		rid = GeneratorFactory.getFactory().    			getQualifiedRecordID(key, value);    	} catch (URISyntaxException e) {    		// Convert to IOE so can let it out.    		throw new IOException(e.getMessage());    	}    	return rid;    }        public URI writeWarcinfoRecord(String filename)	throws IOException {    	return writeWarcinfoRecord(filename, null);    }        public URI writeWarcinfoRecord(String filename, final String description)        	throws IOException {        // Strip .open suffix if present.        if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {        	filename = filename.substring(0,        		filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());        }        ANVLRecord record = new ANVLRecord(2);        record.addLabelValue(HEADER_KEY_FILENAME, filename);        if (description != null && description.length() > 0) {        	record.addLabelValue(CONTENT_DESCRIPTION, description);        }        // Add warcinfo body.        byte [] warcinfoBody = null;        if (this.fileMetadata == null) {        	// TODO: What to write into a warcinfo?  What to associate?        	warcinfoBody = "TODO: Unimplemented".getBytes();        } else {        	ByteArrayOutputStream baos = new ByteArrayOutputStream();        	for (final Iterator i = this.fileMetadata.iterator();        			i.hasNext();) {        		baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));        	}        	warcinfoBody = baos.toByteArray();        }        URI uri = writeWarcinfoRecord("text/xml", record,            new ByteArrayInputStream(warcinfoBody), warcinfoBody.length);        // TODO: If at start of file, and we're writing compressed,        // write out our distinctive GZIP extensions.        return uri;    }        /**     * Write a warcinfo to current file.     
* TODO: Write crawl metadata or pointers to crawl description.     * @param mimetype Mimetype of the <code>fileMetadata</code> block.     * @param namedFields Named fields. Pass <code>null</code> if none.     * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.     * @param fileMetadataLength Length of <code>fileMetadata</code>.     * @throws IOException     * @return Generated record-id made with     * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and     * the current filename.     */    public URI writeWarcinfoRecord(final String mimetype,    	final ANVLRecord namedFields, final InputStream fileMetadata,    	final long fileMetadataLength)    throws IOException {    	final URI recordid = generateRecordId(TYPE, WARCINFO);    	writeWarcinfoRecord(ArchiveUtils.getLog14Date(), mimetype, recordid,            namedFields, fileMetadata, fileMetadataLength);    	return recordid;    }        /**     * Write a <code>warcinfo</code> to current file.     * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.     * @param recordId URI to use for this warcinfo.     * @param create14DigitDate Record creation date as 14 digit date.     * @param mimetype Mimetype of the <code>fileMetadata</code>.     * @param namedFields Named fields.     * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.     * @param fileMetadataLength Length of <code>fileMetadata</code>.     
* @throws IOException     */    public void writeWarcinfoRecord(final String create14DigitDate,        final String mimetype, final URI recordId, final ANVLRecord namedFields,    	final InputStream fileMetadata, final long fileMetadataLength)    throws IOException {    	writeRecord(WARCINFO, null, create14DigitDate, mimetype,        	recordId, namedFields, fileMetadata, fileMetadataLength);    }        public void writeRequestRecord(final String url,        final String create14DigitDate, final String mimetype,        final URI recordId,        final ANVLRecord namedFields, final InputStream request,        final long requestLength)    throws IOException {        writeRecord(REQUEST, url, create14DigitDate,            mimetype, recordId, namedFields, request,            requestLength);    }        public void writeResourceRecord(final String url,            final String create14DigitDate, final String mimetype,            final ANVLRecord namedFields, final InputStream response,            final long responseLength)    throws IOException {    	writeResourceRecord(url, create14DigitDate, mimetype, getRecordID(),    			namedFields, response, responseLength);    }        public void writeResourceRecord(final String url,            final String create14DigitDate, final String mimetype,            final URI recordId,            final ANVLRecord namedFields, final InputStream response,            final long responseLength)    throws IOException {        writeRecord(RESOURCE, url, create14DigitDate,            mimetype, recordId, namedFields, response,            responseLength);    }    public void writeResponseRecord(final String url,            final String create14DigitDate, final String mimetype,            final URI recordId,            final ANVLRecord namedFields, final InputStream response,            final long responseLength)    throws IOException {        writeRecord(RESPONSE, url, create14DigitDate,            mimetype, recordId, namedFields, response,        
    responseLength);    }        public void writeRevisitRecord(final String url,            final String create14DigitDate, final String mimetype,            final URI recordId,            final ANVLRecord namedFields, final InputStream response,            final long responseLength)    throws IOException {        writeRecord(REVISIT, url, create14DigitDate,            mimetype, recordId, namedFields, response,            responseLength);    }        public void writeMetadataRecord(final String url,            final String create14DigitDate, final String mimetype,            final URI recordId,            final ANVLRecord namedFields, final InputStream metadata,            final long metadataLength)    throws IOException {        writeRecord(METADATA, url, create14DigitDate,            mimetype, recordId, namedFields, metadata,            metadataLength);    }        /**     * Convenience method for getting Record-Ids.     * @return A record ID.     * @throws IOException     */    public static URI getRecordID() throws IOException {        URI result;        try {            result = GeneratorFactory.getFactory().getRecordID();        } catch (URISyntaxException e) {            throw new IOException(e.toString());        }        return result;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -