📄 ExtractAction.java
/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package websphinx.workbench;

import websphinx.*;

import java.io.File;
import java.io.IOException;

public class ExtractAction implements Action, CrawlListener {
    Pattern pattern;
    String filename;
    boolean useBrowser;
    boolean textOnly;

    transient File file;
    transient RecordTransformer records;
    transient boolean noFields;

    public ExtractAction (Pattern pattern, boolean useBrowser, String filename, boolean textOnly) {
        this.pattern = pattern;
        this.filename = filename;
        this.useBrowser = useBrowser;
        this.textOnly = textOnly;
    }

    public boolean equals (Object object) {
        if (! (object instanceof ExtractAction))
            return false;
        ExtractAction a = (ExtractAction)object;
        return same (a.filename, filename)
            && a.useBrowser == useBrowser
            && a.pattern.equals (pattern)
            && a.textOnly == textOnly;
    }

    private boolean same (String s1, String s2) {
        if (s1 == null || s2 == null)
            return s1 == s2;
        else
            return s1.equals (s2);
    }

    public Pattern getPattern () {
        return pattern;
    }

    public boolean getUseBrowser () {
        return useBrowser;
    }

    public String getFilename () {
        return filename;
    }

    public boolean getTextOnly () {
        return textOnly;
    }

    public void connected (Crawler crawler) {
        crawler.addCrawlListener (this);
    }

    public void disconnected (Crawler crawler) {
        crawler.removeCrawlListener (this);
    }

    private void showit () {
        Browser browser = Context.getBrowser();
        if (browser != null)
            browser.show (file);
    }

    public synchronized void visit (Page page) {
        try {
            int n = 0;
            PatternMatcher m = pattern.match (page);
            for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
                Object[] fields;
                if (noFields) {
                    fields = new Object[1];
                    fields[0] = r;
                }
                else
                    fields = (Object[])r.getFields (Pattern.groups);
                records.writeRecord (fields, textOnly);
                ++n;
            }
            if (n > 0)
                records.flush ();
        } catch (IOException e) {
            throw new RuntimeException (e.toString());
        }
    }

    /**
     * Notify that the crawler started.
     */
    public synchronized void started (CrawlEvent event) {
        if (records == null) {
            try {
                file = (filename != null)
                    ? new File (filename)
                    : Access.getAccess ().makeTemporaryFile ("extract", ".html");
                records = new RecordTransformer (file.toString());
                String[] fieldNames = pattern.getFieldNames ();
                noFields = (fieldNames.length == 0);
                records.setProlog (records.getProlog () + makeTableHeader (fieldNames));
            } catch (IOException e) {
                System.err.println (e); // FIX: use GUI when available
            }
        }
    }

    private String makeTableHeader (String[] fieldNames) {
        String result = "<TR>\n<TH>\n";
        if (fieldNames.length == 0)
            result += "<TH>\n";
        else
            for (int i = 0; i < fieldNames.length; ++i)
                result += "<TH>" + fieldNames[i] + "\n";
        return result;
    }

    /**
     * Notify that the crawler ran out of links to crawl.
     */
    public synchronized void stopped (CrawlEvent event) {
        try {
            if (records != null) {
                records.close ();
                records = null;
                if (useBrowser)
                    showit ();
            }
        } catch (IOException e) {
            System.err.println (e); // FIX: use GUI when available
        }
    }

    /**
     * Notify that the crawler's state was cleared.
     */
    public synchronized void cleared (CrawlEvent event) {
        try {
            if (records != null) {
                records.close ();
                records = null;
                if (useBrowser)
                    showit ();
            }
        } catch (IOException e) {
            System.err.println (e); // FIX: use GUI when available
        }
    }

    /**
     * Notify that the crawler timed out.
     */
    public synchronized void timedOut (CrawlEvent event) {
        try {
            records.close ();
            records = null;
            if (useBrowser)
                showit ();
        } catch (IOException e) {
            System.err.println (e); // FIX: use GUI when available
        }
    }

    /**
     * Notify that the crawler is paused.
     */
    public synchronized void paused (CrawlEvent event) {
        try {
            records.flush ();
            if (useBrowser)
                showit ();
        } catch (IOException e) {
            System.err.println (e); // FIX: use GUI when available
        }
    }
}
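
For context, here is a minimal, hypothetical sketch of how this action might be wired into a crawl. It is not part of the original file: only the ExtractAction constructor signature comes from the code above, while the Regexp pattern class and the Crawler setRoot/setAction/run calls are assumptions about the WebSphinx API and should be checked against the toolkit's javadoc.

// ExtractDemo.java -- hypothetical usage sketch, not part of WebSphinx.
// ASSUMPTIONS: Regexp as a Pattern implementation, Link(String) constructor,
// and Crawler.setRoot()/setAction()/run() are recalled from the WebSphinx
// toolkit and may need adjusting; the ExtractAction constructor arguments
// (pattern, useBrowser, filename, textOnly) are taken from the class above.
import websphinx.*;
import websphinx.workbench.ExtractAction;

public class ExtractDemo {
    public static void main (String[] args) throws Exception {
        // Pattern with no named fields, so each whole match becomes one table cell
        // (the noFields path in ExtractAction.visit()).
        Pattern titles = new Regexp ("<title>.*?</title>");   // assumed Pattern implementation

        // Write matched regions as text-only rows to extract.html; don't open a browser.
        ExtractAction extract = new ExtractAction (titles, false, "extract.html", true);

        Crawler crawler = new Crawler ();                        // assumed default constructor
        crawler.setRoot (new Link ("http://www.example.com/"));  // assumed starting-point API
        crawler.setAction (extract);                             // assumed; ExtractAction.connected() registers it as a CrawlListener
        crawler.run ();                                          // stopped() closes extract.html when the crawl runs out of links
    }
}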