⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 archiveutils.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 * ArchiveUtils
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/util/ArchiveUtils.java,v 1.38 2007/01/23 00:29:48 gojomo Exp $
 *
 * Created on Jul 7, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
package org.archive.util;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.text.NumberFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.TimeZone;

/**
 * Miscellaneous useful methods.
 *
 * @author gojomo & others
 */
public class ArchiveUtils {

    /**
     * Arc-style date stamp in the format yyyyMMddHHmm and UTC time zone.
     * Held in a ThreadLocal so each thread works with its own
     * SimpleDateFormat instance.
     */
    private static final ThreadLocal<SimpleDateFormat>
        TIMESTAMP12 = threadLocalDateFormat("yyyyMMddHHmm");

    /**
     * Arc-style date stamp in the format yyyyMMddHHmmss and UTC time zone.
     */
    private static final ThreadLocal<SimpleDateFormat>
        TIMESTAMP14 = threadLocalDateFormat("yyyyMMddHHmmss");

    /**
     * Arc-style date stamp in the format yyyyMMddHHmmssSSS and UTC time zone.
     
*/
    private static final ThreadLocal<SimpleDateFormat>
        TIMESTAMP17 = threadLocalDateFormat("yyyyMMddHHmmssSSS");

    /**
     * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss.SSS'Z'.
     * UTC time zone is assumed.
     */
    private static final ThreadLocal<SimpleDateFormat>
        TIMESTAMP17ISO8601Z =
            threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    /**
     * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss'Z'.
     * UTC time zone is assumed.
     */
    private static final ThreadLocal<SimpleDateFormat>
        TIMESTAMP14ISO8601Z =
            threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");

    /**
     * Default character to use when padding strings.
     */
    private static final char DEFAULT_PAD_CHAR = ' ';

    /** Number of milliseconds in an hour. */
    private static final int HOUR_IN_MS = 60 * 60 * 1000;

    /** Number of milliseconds in a day. */
    private static final int DAY_IN_MS = 24 * HOUR_IN_MS;

    /**
     * Creates a ThreadLocal whose per-thread initial value is a fresh
     * SimpleDateFormat for the given pattern, with its time zone set
     * to GMT.  SimpleDateFormat is documented as unsynchronized, so each
     * thread gets its own instance instead of sharing one.
     *
     * @param pattern SimpleDateFormat pattern to use for every instance
     * @return holder that lazily supplies one formatter per thread
     */
    private static ThreadLocal<SimpleDateFormat> threadLocalDateFormat(final String pattern) {
        return new ThreadLocal<SimpleDateFormat>() {
            @Override
            protected SimpleDateFormat initialValue() {
                final SimpleDateFormat format = new SimpleDateFormat(pattern);
                format.setTimeZone(TimeZone.getTimeZone("GMT"));
                return format;
            }
        };
    }

    /** Length of the decimal string form of Integer.MAX_VALUE. */
    public static int MAX_INT_CHAR_WIDTH =
        Integer.toString(Integer.MAX_VALUE).length();

    /**
     * Creates an arc-style date stamp for the current time
     * in the format yyyyMMddHHmmssSSS.
     * Date stamps are in the UTC time zone.
     *
     * @return the date stamp
     */
    public static String get17DigitDate() {
        final SimpleDateFormat format = TIMESTAMP17.get();
        return format.format(new Date());
    }

    /**
     * Utility function for creating arc-style date stamps
     * in the format yyyyMMddHHmmss.
     
* Date stamps are in the UTC time zone     * @return the date stamp     */    public static String get14DigitDate(){        return TIMESTAMP14.get().format(new Date());    }    /**     * Utility function for creating arc-style date stamps     * in the format yyyMMddHHmm.     * Date stamps are in the UTC time zone     * @return the date stamp     */    public static String get12DigitDate(){        return TIMESTAMP12.get().format(new Date());    }    /**     * Utility function for creating log timestamps, in     * W3C/ISO8601 format, assuming UTC. Use current time.      *      * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'     *      * @return the date stamp     */    public static String getLog17Date(){        return TIMESTAMP17ISO8601Z.get().format(new Date());    }        /**     * Utility function for creating log timestamps, in     * W3C/ISO8601 format, assuming UTC.      *      * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'     * @param date Date to format.     *      * @return the date stamp     */    public static String getLog17Date(long date){        return TIMESTAMP17ISO8601Z.get().format(new Date(date));    }        /**     * Utility function for creating log timestamps, in     * W3C/ISO8601 format, assuming UTC. Use current time.      *      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'     *      * @return the date stamp     */    public static String getLog14Date(){        return TIMESTAMP14ISO8601Z.get().format(new Date());    }        /**     * Utility function for creating log timestamps, in     * W3C/ISO8601 format, assuming UTC.      *      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'     * @param date long timestamp to format.     *      * @return the date stamp     */    public static String getLog14Date(long date){        return TIMESTAMP14ISO8601Z.get().format(new Date(date));    }        /**     * Utility function for creating log timestamps, in     * W3C/ISO8601 format, assuming UTC.      *      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'     * @param date Date to format.     
*      * @return the date stamp     */    public static String getLog14Date(Date date){        return TIMESTAMP14ISO8601Z.get().format(date);    }        /**     * Utility function for creating arc-style date stamps     * in the format yyyyMMddHHmmssSSS.     * Date stamps are in the UTC time zone     *     * @param date milliseconds since epoc     * @return the date stamp     */    public static String get17DigitDate(long date){        return TIMESTAMP17.get().format(new Date(date));    }        public static String get17DigitDate(Date date){        return TIMESTAMP17.get().format(date);    }    /**     * Utility function for creating arc-style date stamps     * in the format yyyyMMddHHmmss.     * Date stamps are in the UTC time zone     *     * @param date milliseconds since epoc     * @return the date stamp     */    public static String get14DigitDate(long date){        return TIMESTAMP14.get().format(new Date(date));    }    public static String get14DigitDate(Date d) {        return TIMESTAMP14.get().format(d);    }    /**     * Utility function for creating arc-style date stamps     * in the format yyyyMMddHHmm.     * Date stamps are in the UTC time zone     *     * @param date milliseconds since epoc     * @return the date stamp     */    public static String get12DigitDate(long date){        return TIMESTAMP12.get().format(new Date(date));    }        public static String get12DigitDate(Date d) {        return TIMESTAMP12.get().format(d);    }        /**     * Parses an ARC-style date.  If passed String is < 12 characters in length,     * we pad.  At a minimum, String should contain a year (>=4 characters).     * Parse will also fail if day or month are incompletely specified.  Depends     * on the above getXXDigitDate methods.     * @param A 4-17 digit date in ARC style (<code>yyyy</code> to     * <code>yyyyMMddHHmmssSSS</code>) formatting.       * @return A Date object representing the passed String.      
* @throws ParseException     */    public static Date getDate(String d) throws ParseException {        Date date = null;        if (d == null) {            throw new IllegalArgumentException("Passed date is null");        }        switch (d.length()) {        case 14:            date = ArchiveUtils.parse14DigitDate(d);            break;        case 17:            date = ArchiveUtils.parse17DigitDate(d);            break;        case 12:            date = ArchiveUtils.parse12DigitDate(d);            break;                   case 0:        case 1:        case 2:        case 3:            throw new ParseException("Date string must at least contain a" +                "year: " + d, d.length());                    default:            if (!(d.startsWith("19") || d.startsWith("20"))) {                throw new ParseException("Unrecognized century: " + d, 0);            }            if (d.length() < 8 && (d.length() % 2) != 0) {                throw new ParseException("Incomplete month/date: " + d,                    d.length());            }            StringBuilder sb = new StringBuilder(d);            if (sb.length() < 8) {                for (int i = sb.length(); sb.length() < 8; i += 2) {                    sb.append("01");                }            }            if (sb.length() < 12) {                for (int i = sb.length(); sb.length() < 12; i++) {                    sb.append("0");                }            }            date = ArchiveUtils.parse12DigitDate(sb.toString());        }        return date;    }    /**     * Utility function for parsing arc-style date stamps     * in the format yyyMMddHHmmssSSS.     * Date stamps are in the UTC time zone.  The whole string will not be     * parsed, only the first 17 digits.     
*     * @param date an arc-style formatted date stamp     * @return the Date corresponding to the date stamp string     * @throws ParseException if the inputstring was malformed     */    public static Date parse17DigitDate(String date) throws ParseException {        return TIMESTAMP17.get().parse(date);    }    /**     * Utility function for parsing arc-style date stamps     * in the format yyyMMddHHmmss.     * Date stamps are in the UTC time zone.  The whole string will not be     * parsed, only the first 14 digits.     *     * @param date an arc-style formatted date stamp     * @return the Date corresponding to the date stamp string     * @throws ParseException if the inputstring was malformed     */    public static Date parse14DigitDate(String date) throws ParseException{        return TIMESTAMP14.get().parse(date);    }    /**     * Utility function for parsing arc-style date stamps     * in the format yyyMMddHHmm.     * Date stamps are in the UTC time zone.  The whole string will not be     * parsed, only the first 12 digits.     *     * @param date an arc-style formatted date stamp     * @return the Date corresponding to the date stamp string     * @throws ParseException if the inputstring was malformed     */    public static Date parse12DigitDate(String date) throws ParseException{        return TIMESTAMP12.get().parse(date);    }        /**     * Convert 17-digit date format timestamps (as found in crawl.log, for     * example) into a GregorianCalendar object. + * Useful so you can convert     * into milliseconds-since-epoch. Note: it is possible to compute     * milliseconds-since-epoch + * using {@link #parse17DigitDate}.UTC(), but     * that method is deprecated in favor of using Calendar.getTimeInMillis(). + *     * <p/>I probably should have dug into all the utility methods in     * DateFormat.java to parse the timestamp, but this was + * easier. If     * someone wants to fix this to use those methods, please have at it! 
<p/>     * Mike Schwartz, schwartz at CodeOnTheRoad dot com.     *      * @param timestamp17String     * @return Calendar set to <code>timestamp17String</code>.     */    public static Calendar timestamp17ToCalendar(String timestamp17String) {        GregorianCalendar calendar = new GregorianCalendar();        int year = Integer.parseInt(timestamp17String.substring(0, 4));        int dayOfMonth = Integer.parseInt(timestamp17String.substring(6, 8));        // Month is 0-based        int month = Integer.parseInt(timestamp17String.substring(4, 6)) - 1;        int hourOfDay = Integer.parseInt(timestamp17String.substring(8, 10));        int minute = Integer.parseInt(timestamp17String.substring(10, 12));        int second = Integer.parseInt(timestamp17String.substring(12, 14));        int milliseconds = Integer                .parseInt(timestamp17String.substring(14, 17));        calendar.set(Calendar.YEAR, year);        calendar.set(Calendar.MONTH, month);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -