⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 arcreader.java

📁 这是个爬虫和 Lucene 相结合的最好示例
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
                    " some of what was read: " +                    buffer.substring(0, Math.min(buffer.length(), 256)));            }            if (c == LINE_SEPARATOR) {                if (buffer.length() == 0) {                    // Empty line at start of buffer.  Skip it and try again.                    continue;                }                if (list != null) {                    list.add(buffer.toString());                }                // LOOP TERMINATION.                break;            } else if (c == HEADER_FIELD_SEPARATOR) {            	if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {            		// Early ARCs sometimes had multiple spaces between fields.            		continue;            	}                if (list != null) {                    list.add(buffer.toString());                }                // reset to empty                buffer.setLength(0);            } else {                buffer.append((char)c);            }        }        // List must have at least 3 elements in it and no more than 10.  If        // it has other than this, then bogus parse.        if (list != null && (list.size() < 3 || list.size() > 100)) {            throw new IOException("Unparseable header line: " + list);        }        return read;    }    /**     * Compute metadata fields.     *     * Here we check the meta field has right number of items in it.     *     * @param keys Keys to use composing headerFields map.     * @param values Values to set into the headerFields map.     * @param v The version of this ARC file.     * @param offset Offset into arc file.     *     * @return Metadata structure for this record.     *     * @exception IOException  If no. of keys doesn't match no. of values.     
*/    private ARCRecordMetaData computeMetaData(List<String> keys,    		List<String> values, String v, long offset)    throws IOException {        if (keys.size() != values.size()) {            List<String> originalValues = values;            if (!isStrict()) {                values = fixSpaceInURL(values, keys.size());                // If values still doesn't match key size, try and do                // further repair.	            if (keys.size() != values.size()) {	            	// Early ARCs had a space in mimetype.	            	if (values.size() == (keys.size() + 1) &&	            			values.get(4).toLowerCase().startsWith("charset=")) {	            		List<String> nuvalues =	            			new ArrayList<String>(keys.size());	            		nuvalues.add(0, values.get(0));	            		nuvalues.add(1, values.get(1));	            		nuvalues.add(2, values.get(2));	            		nuvalues.add(3, values.get(3) + values.get(4));	            		nuvalues.add(4, values.get(5));	            		values = nuvalues;	            	} else if((values.size() + 1) == keys.size() &&                            isLegitimateIPValue(values.get(1)) &&                            isDate(values.get(2)) && isNumber(values.get(3))) {                        // Mimetype is empty.                        List<String> nuvalues =                            new ArrayList<String>(keys.size());                        nuvalues.add(0, values.get(0));                        nuvalues.add(1, values.get(1));                        nuvalues.add(2, values.get(2));                        nuvalues.add(3, "-");                        nuvalues.add(4, values.get(3));                        values = nuvalues;                    }	            }        	}            if (keys.size() != values.size()) {                throw new IOException("Size of field name keys does" +                    " not match count of field values: " + values);            }            // Note that field was fixed on stderr.            
logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +            	"offset " + offset +                " Original: " + originalValues + ", New: " + values);        }                Map<Object, Object> headerFields =        	new HashMap<Object, Object>(keys.size() + 2);        for (int i = 0; i < keys.size(); i++) {            headerFields.put(keys.get(i), values.get(i));        }                // Add a check for tabs in URLs.  If any, replace with '%09'.        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,        // [ 1010966 ] crawl.log has URIs with spaces in them.        String url = (String)headerFields.get(URL_FIELD_KEY);        if (url != null && url.indexOf('\t') >= 0) {            headerFields.put(URL_FIELD_KEY,                TextUtils.replaceAll("\t", url, "%09"));        }        headerFields.put(VERSION_FIELD_KEY, v);        headerFields.put(ABSOLUTE_OFFSET_KEY, new  Long(offset));        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);    }        protected boolean isDate(final String date) {        if (date.length() != 14) {            return false;        }        return isNumber(date);    }        protected boolean isNumber(final String n) {        for (int i = 0; i < n.length(); i++) {            if (!Character.isDigit(n.charAt(i))) {                return false;            }        }        return true;    }        protected boolean isLegitimateIPValue(final String ip) {        if ("-".equals(ip)) {            return true;        }        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);        return m != null && m.matches();    }        /**     * Fix space in URLs.     * The ARCWriter used to write into the ARC URLs with spaces in them.     * See <a     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]     * crawl.log has URIs with spaces in them</a>.     
* This method does fix up on such headers converting all spaces found     * to '%20'.     * @param values List of metadata values.     * @param requiredSize Expected size of resultant values list.     * @return New list if we successfully fixed up values or original if     * fixup failed.     */    protected List<String> fixSpaceInURL(List<String> values, int requiredSize) {        // Do validity check. 3rd from last is a date of 14 numeric        // characters. The 4th from last is IP, all before the IP        // should be concatenated together with a '%20' joiner.        // In the below, '4' is 4th field from end which has the IP.        if (!(values.size() > requiredSize) || values.size() < 4) {            return values;        }        // Test 3rd field is valid date.        if (!isDate((String) values.get(values.size() - 3))) {            return values;        }        // Test 4th field is valid IP.        if (!isLegitimateIPValue((String) values.get(values.size() - 4))) {            return values;        }        List<String> newValues = new ArrayList<String>(requiredSize);        StringBuffer url = new StringBuffer();        for (int i = 0; i < (values.size() - 4); i++) {            if (i > 0) {                url.append("%20");            }            url.append(values.get(i));        }        newValues.add(url.toString());        for (int i = values.size() - 4; i < values.size(); i++) {            newValues.add(values.get(i));        }        return newValues;    }    	protected boolean isAlignedOnFirstRecord() {		return alignedOnFirstRecord;	}	protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {		this.alignedOnFirstRecord = alignedOnFirstRecord;	}	    /**     * @return Returns the parseHttpHeaders.     */    public boolean isParseHttpHeaders() {        return this.parseHttpHeaders;    }        /**     * @param parse The parseHttpHeaders to set.     
*/
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    // Plain (uncompressed) ARC file suffix.
    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    // ARC file suffix including the leading dot.
    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    /**
     * Output records in the given format.
     * Delegates to the superclass first; NOHEAD and HEADER only make sense
     * for a single record, so if the superclass did not handle the request,
     * reject those formats here.
     * @param format Name of output format.
     * @return True if the format was handled.
     * @throws IOException If format is NOHEAD or HEADER and was not already
     * handled as a single-record request.
     */
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

    /**
     * Output the current record in the given format.
     * Tries the superclass first; if it did not handle the format, this
     * method handles NOHEAD (record body with the HTTP header skipped) and
     * HEADER (the HTTP header only).
     * @param format Name of output format.
     * @return True if the format was handled.
     */
    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }
        return result;
    }

    /**
     * Dump this ARC to stdout, rewriting its records through an ARCWriter.
     * The writer is created from the first record's data.
     * @param compress True to compress the rewritten output.
     */
    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
while (r.available() > 0) {                    baos.write(r.read());                }                List<String> listOfMetadata = new ArrayList<String>();                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));                // Assume getArc returns full path to file.  ARCWriter                // or new File will complain if it is otherwise.                writer = new ARCWriter(new AtomicInteger(), System.out,                    new File(meta.getArc()),                    compress, meta.getDate(), listOfMetadata);                continue;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -