⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 writerpoolprocessor.java

📁 Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
    /**     * Version of getAttributes that catches and logs exceptions     * and returns null if failure to fetch the attribute.     * @param name Attribute name.     * @return Attribute or null.     */    public Object getAttributeUnchecked(String name) {        Object result = null;        try {            result = super.getAttribute(name);        } catch (AttributeNotFoundException e) {            logger.warning(e.getLocalizedMessage());        } catch (MBeanException e) {            logger.warning(e.getLocalizedMessage());        } catch (ReflectionException e) {            logger.warning(e.getLocalizedMessage());        }        return result;    }   /**    * Max size we want files to be (bytes).    *    * Default is ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE.  Note that ARC    * files will usually be bigger than maxSize; they'll be maxSize + length    * to next boundary.    * @return ARC maximum size.    */    public long getMaxSize() {        Object obj = getAttributeUnchecked(ATTR_MAX_SIZE_BYTES);        return (obj == null)? DEFAULT_MAX_FILE_SIZE: ((Long)obj).longValue();    }    public String getPrefix() {        Object obj = getAttributeUnchecked(ATTR_PREFIX);        return (obj == null)? WriterPoolMember.DEFAULT_PREFIX: (String)obj;    }    public List<File> getOutputDirs() {        Object obj = getAttributeUnchecked(ATTR_PATH);        List list = (obj == null)? Arrays.asList(DEFAULT_PATH): (StringList)obj;        ArrayList<File> results = new ArrayList<File>();        for (Iterator i = list.iterator(); i.hasNext();) {            String path = (String)i.next();            File f = new File(path);            if (!f.isAbsolute()) {                f = new File(getController().getDisk(), path);            }            if (!f.exists()) {                try {                    f.mkdirs();                } catch (Exception e) {                    e.printStackTrace();                    continue;                }            }            results.add(f);        }        return results;    }        public boolean isCompressed() {        Object obj = getAttributeUnchecked(ATTR_COMPRESS);        return (obj == null)? DEFAULT_COMPRESS:            ((Boolean)obj).booleanValue();    }    /**     * @return Returns the poolMaximumActive.     */    public int getPoolMaximumActive() {        Object obj = getAttributeUnchecked(ATTR_POOL_MAX_ACTIVE);        return (obj == null)? WriterPool.DEFAULT_MAX_ACTIVE:            ((Integer)obj).intValue();    }    /**     * @return Returns the poolMaximumWait.     */    public int getPoolMaximumWait() {        Object obj = getAttributeUnchecked(ATTR_POOL_MAX_WAIT);        return (obj == null)? WriterPool.DEFAULT_MAXIMUM_WAIT:            ((Integer)obj).intValue();    }    public String getSuffix() {        Object obj = getAttributeUnchecked(ATTR_SUFFIX);        String sfx = (obj == null)?            WriterPoolMember.DEFAULT_SUFFIX: (String)obj;        if (sfx != null && sfx.trim().                equals(WriterPoolMember.HOSTNAME_VARIABLE)) {            String str = "localhost.localdomain";            try {                str = InetAddress.getLocalHost().getHostName();            } catch (UnknownHostException ue) {                logger.severe("Failed getHostAddress for this host: " + ue);            }            sfx = str;        }        return sfx;    }        public long getMaxToWrite() {        Object obj = getAttributeUnchecked(ATTR_MAX_BYTES_WRITTEN);        return (obj == null)? 0: ((Long)obj).longValue();    }	public void crawlEnding(String sExitMessage) {		this.pool.close();	}	public void crawlEnded(String sExitMessage) {        // sExitMessage is unused.	}    /* (non-Javadoc)     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)     */    public void crawlStarted(String message) {        // TODO Auto-generated method stub    }        protected String getCheckpointStateFile() {    	return this.getClass().getName() + ".state";    }        public void crawlCheckpoint(File checkpointDir) throws IOException {        int serial = getSerialNo().get();        if (this.pool.getNumActive() > 0) {            // If we have open active Archive files, up the serial number            // so after checkpoint, we start at one past current number and            // so the number we serialize, is one past current serialNo.            // All this serial number manipulation should be fine in here since            // we're paused checkpointing (Revisit if this assumption changes).            serial = getSerialNo().incrementAndGet();        }        saveCheckpointSerialNumber(checkpointDir, serial);        // Close all ARCs on checkpoint.        try {            this.pool.close();        } finally {            // Reopen on checkpoint.            setupPool(new AtomicInteger(serial));        }    }    	public void crawlPausing(String statusMessage) {        // sExitMessage is unused.	}	public void crawlPaused(String statusMessage) {        // sExitMessage is unused.	}	public void crawlResuming(String statusMessage) {        // sExitMessage is unused.	}	    private void readObject(ObjectInputStream stream)    throws IOException, ClassNotFoundException {        stream.defaultReadObject();        ObjectPlusFilesInputStream coistream =            (ObjectPlusFilesInputStream)stream;        coistream.registerFinishTask( new Runnable() {            public void run() {            	setupPool(new AtomicInteger());            }        });    }	protected WriterPool getPool() {		return pool;	}	protected void setPool(WriterPool pool) {		this.pool = pool;	}	protected long getTotalBytesWritten() {		return totalBytesWritten;	}	protected void setTotalBytesWritten(long totalBytesWritten) {        this.totalBytesWritten = totalBytesWritten;    }	    /**     * Called out of {@link #initialTasks()} when recovering a checkpoint.     * Restore state.     */    protected void checkpointRecover() {        int serialNo = loadCheckpointSerialNumber();        if (serialNo != -1) {            getSerialNo().set(serialNo);        }    }    /**     * @return Serial number from checkpoint state file or if unreadable, -1     * (Client should check for -1).     */    protected int loadCheckpointSerialNumber() {        int result = -1;                // If in recover mode, read in the Writer serial number saved        // off when we checkpointed.        File stateFile = new File(getSettingsHandler().getOrder()                .getController().getCheckpointRecover().getDirectory(),                getCheckpointStateFile());        if (!stateFile.exists()) {            logger.info(stateFile.getAbsolutePath()                    + " doesn't exist so cannot restore Writer serial number.");        } else {            DataInputStream dis = null;            try {                dis = new DataInputStream(new FileInputStream(stateFile));                result = dis.readShort();            } catch (FileNotFoundException e) {                e.printStackTrace();            } catch (IOException e) {                e.printStackTrace();            } finally {                try {                    if (dis != null) {                        dis.close();                    }                } catch (IOException e) {                    e.printStackTrace();                }            }        }        return result;    }        protected void saveCheckpointSerialNumber(final File checkpointDir,            final int serialNo)    throws IOException {        // Write out the current state of the ARCWriter serial number.        File f = new File(checkpointDir, getCheckpointStateFile());        DataOutputStream dos = new DataOutputStream(new FileOutputStream(f));        try {            dos.writeShort(serialNo);        } finally {            dos.close();        }    }        /**     * Return list of metadatas to add to first arc file metadata record.     *      * Default is to stylesheet the order file.  To specify stylesheet,     * override {@link #getFirstrecordStylesheet()}.     *     * Get xml files from settingshandler.  Currently order file is the     * only xml file.  We're NOT adding seeds to meta data.     *     * @return List of strings and/or files to add to arc file as metadata or     * null.     */    public synchronized List<String> getMetadata() {        if (this.cachedMetadata != null) {            return this.cachedMetadata;        }        return cacheMetadata();    }        protected synchronized List<String> cacheMetadata() {        if (this.cachedMetadata != null) {            return this.cachedMetadata;        }                // If no stylesheet, return empty metadata.        if (getFirstrecordStylesheet() == null ||                getFirstrecordStylesheet().length() == 0) {            this.cachedMetadata = new ArrayList<String>(1);            this.cachedMetadata.add("");            return this.cachedMetadata;        }                List<String> result = null;        if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {            logger.warning("Expected xml settings handler (No warcinfo).");            // Early return            return result;        }                XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler();        File orderFile = xsh.getOrderFile();        if (!orderFile.exists() || !orderFile.canRead()) {                logger.severe("File " + orderFile.getAbsolutePath() +                    " is does not exist or is not readable.");        } else {            result = new ArrayList<String>(1);            result.add(getFirstrecordBody(orderFile));        }        this.cachedMetadata = result;        return this.cachedMetadata;    }        /**     * @preturn Full path to stylesheet (Its read off the CLASSPATH     * as resource).     */    protected String getFirstrecordStylesheet() {        return null;    }    /**     * Write the arc metadata body content.     *     * Its based on the order xml file but into this base we'll add other info     * such as machine ip.     *     * @param orderFile Order file.     *     * @return String that holds the arc metaheader body.     */    protected String getFirstrecordBody(File orderFile) {        String result = null;        TransformerFactory factory = TransformerFactory.newInstance();        Templates templates = null;        Transformer xformer = null;        try {            templates = factory.newTemplates(new StreamSource(                this.getClass().getResourceAsStream(getFirstrecordStylesheet())));            xformer = templates.newTransformer();            // Below parameter names must match what is in the stylesheet.            xformer.setParameter("software", "Heritrix " +                Heritrix.getVersion() + " http://crawler.archive.org");            xformer.setParameter("ip",                InetAddress.getLocalHost().getHostAddress());            xformer.setParameter("hostname",                InetAddress.getLocalHost().getHostName());            StreamSource source = new StreamSource(                new FileInputStream(orderFile));            StringWriter writer = new StringWriter();            StreamResult target = new StreamResult(writer);            xformer.transform(source, target);            result= writer.toString();        } catch (TransformerConfigurationException e) {            logger.severe("Failed transform " + e);        } catch (FileNotFoundException e) {            logger.severe("Failed transform, file not found " + e);        } catch (UnknownHostException e) {            logger.severe("Failed transform, unknown host " + e);        } catch(TransformerException e) {            SourceLocator locator = e.getLocator();            int col = locator.getColumnNumber();            int line = locator.getLineNumber();            String publicId = locator.getPublicId();            String systemId = locator.getSystemId();            logger.severe("Transform error " + e + ", col " + col + ", line " +                line + ", publicId " + publicId + ", systemId " + systemId);        }        return result;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -