⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recordtransformer.java

📁 一个用java语言编写的网络爬虫程序
💻 JAVA
字号:
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx;

import java.io.*;
import java.net.URL;
import websphinx.util.Str;

/**
 * Writes a sequence of records to a single output page, wrapping the
 * records and their fields in configurable text templates.
 *
 * <p>With the default templates the output is an HTML table: the prolog
 * opens the document and table, each record becomes a row whose first
 * cell is a numbered link, each field becomes a cell, and the epilog
 * closes the table and document.
 *
 * <p>The record-start and record-end templates may contain two
 * placeholders: <code>%n</code> is replaced by the 1-based record number
 * and <code>%u</code> by the URL of the first Region field in the record
 * (or by the empty string when the record has no Region field).  The
 * other templates are emitted verbatim.
 *
 * <p>All public methods are synchronized on this object.
 */
public class RecordTransformer extends RewritableLinkTransformer {

    /** Text emitted once, before the first record. */
    String prolog = "<HTML><HEAD><TITLE>Extracted Records</TITLE></HEAD><BODY><TABLE>\n";
    /** Text emitted by flush() after the records written so far. */
    String epilog = "</TABLE></BODY></HTML>\n";

    /** Template emitted before the fields of each record (%n and %u expanded). */
    String recordStart = "<TR>\n<TD><A HREF=\"%u\">%n.</A>\n";
    /** Template emitted after the fields of each record (%n and %u expanded). */
    String recordEnd = "\n";
    /** Text emitted between consecutive records. */
    String recordDivider = "";

    /** Text emitted before each field. */
    String fieldStart = "  <TD>";
    /** Text emitted after each field. */
    String fieldEnd = "\n";
    /** Text emitted between consecutive fields of one record. */
    String fieldDivider = "";

    /** Number of records written so far. */
    int nRecords = 0;

    /**
     * Creates a record transformer for the given file name.
     *
     * @param filename passed unchanged to the superclass constructor
     * @throws IOException if the superclass constructor fails
     */
    public RecordTransformer (String filename) throws IOException {
        super (filename);
    }

    /**
     * Sets the text emitted before the first record.
     * @param prolog new prolog text
     */
    public synchronized void setProlog (String prolog) {
        this.prolog = prolog;
    }

    /**
     * Returns the text emitted before the first record.
     * @return current prolog text
     */
    public synchronized String getProlog () {
        return prolog;
    }

    /**
     * Sets the text emitted by {@link #flush} after the records.
     * @param epilog new epilog text
     */
    public synchronized void setEpilog (String epilog) {
        this.epilog = epilog;
    }

    /**
     * Returns the text emitted by {@link #flush} after the records.
     * @return current epilog text
     */
    public synchronized String getEpilog () {
        return epilog;
    }

    /**
     * Sets the template emitted at the start of each record.
     * <code>%n</code> and <code>%u</code> are expanded when it is emitted.
     * @param recordStart new record-start template
     */
    public synchronized void setRecordStart (String recordStart) {
        this.recordStart = recordStart;
    }

    /**
     * Returns the template emitted at the start of each record.
     * @return current record-start template
     */
    public synchronized String getRecordStart () {
        return recordStart;
    }

    /**
     * Sets the template emitted at the end of each record.
     * <code>%n</code> and <code>%u</code> are expanded when it is emitted.
     * @param recordEnd new record-end template
     */
    public synchronized void setRecordEnd (String recordEnd) {
        this.recordEnd = recordEnd;
    }

    /**
     * Returns the template emitted at the end of each record.
     * @return current record-end template
     */
    public synchronized String getRecordEnd () {
        return recordEnd;
    }

    /**
     * Sets the text emitted between consecutive records.
     * @param recordDivider new record divider
     */
    public synchronized void setRecordDivider (String recordDivider) {
        this.recordDivider = recordDivider;
    }

    /**
     * Returns the text emitted between consecutive records.
     * @return current record divider
     */
    public synchronized String getRecordDivider () {
        return recordDivider;
    }

    /**
     * Sets the text emitted before each field.
     * @param fieldStart new field-start text
     */
    public synchronized void setFieldStart (String fieldStart) {
        this.fieldStart = fieldStart;
    }

    /**
     * Returns the text emitted before each field.
     * @return current field-start text
     */
    public synchronized String getFieldStart () {
        return fieldStart;
    }

    /**
     * Sets the text emitted after each field.
     * @param fieldEnd new field-end text
     */
    public synchronized void setFieldEnd (String fieldEnd) {
        this.fieldEnd = fieldEnd;
    }

    /**
     * Returns the text emitted after each field.
     * @return current field-end text
     */
    public synchronized String getFieldEnd () {
        return fieldEnd;
    }

    /**
     * Sets the text emitted between consecutive fields of a record.
     * @param fieldDivider new field divider
     */
    public synchronized void setFieldDivider (String fieldDivider) {
        this.fieldDivider = fieldDivider;
    }

    /**
     * Returns the text emitted between consecutive fields of a record.
     * @return current field divider
     */
    public synchronized String getFieldDivider () {
        return fieldDivider;
    }

    /**
     * Flush the record page to disk.  Temporarily writes the epilog:
     * the current file position is remembered, the epilog is emitted
     * (preceded by the prolog if no record has been written yet), and
     * the position is then restored before delegating to the
     * superclass flush.
     *
     * <p>NOTE(review): restoring the position presumably lets later
     * records overwrite the temporary epilog; that depends on the
     * inherited seek/emit behavior, which is not visible in this file.
     *
     * @throws IOException if any of the inherited I/O operations fails
     */
    public synchronized void flush () throws IOException {
        long p = getFilePointer ();
        if (nRecords == 0) {
            emit (prolog);
        }
        emit (epilog);
        seek (p);
        super.flush ();
    }

    /**
     * Returns the number of records written so far.
     * @return count of writeRecord calls that got past argument checking
     */
    public synchronized int getRecordCount () {
        return nRecords;
    }

    /**
     * Writes one record.  Emits the prolog (first record) or the record
     * divider (later records), then the record-start template, then each
     * field wrapped in field-start/field-end and separated by the field
     * divider, and finally the record-end template.
     *
     * <p>A field that is a Region is written either as its text
     * (<code>asText</code> true) or as the Region itself; any other
     * non-null field is written as its <code>toString()</code> value.
     * A <code>null</code> field is written as an empty field.
     *
     * @param fields the field values of the record; must not be null,
     *               but individual elements may be null
     * @param asText if true, Region fields are written via toText()
     * @throws IOException if writing to the output fails
     * @throws NullPointerException if <code>fields</code> is null
     */
    public synchronized void writeRecord (Object[] fields, boolean asText) throws IOException {
        // Reject a null array before touching any state, so that a bad
        // call neither bumps the record count nor emits a partial record.
        if (fields == null) {
            throw new NullPointerException ("fields");
        }

        ++nRecords;
        emit ((nRecords == 1) ? prolog : recordDivider);

        URL url = urlOfFirstRegion (fields);

        emitTemplate (recordStart, url, nRecords);
        for (int i = 0; i < fields.length; ++i) {
            if (i > 0) {
                emit (fieldDivider);
            }
            emit (fieldStart);
            writeField (fields[i], asText);
            emit (fieldEnd);
        }
        emitTemplate (recordEnd, url, nRecords);
    }

    /**
     * Writes the content of a single field (without its start/end text).
     * A null field produces no output, so the record keeps its shape
     * instead of being cut short by a NullPointerException.
     */
    private void writeField (Object field, boolean asText) throws IOException {
        if (field == null) {
            return;
        }

        if (field instanceof Region) {
            Region r = (Region) field;
            if (asText) {
                write (r.toText ());
            }
            else {
                write (r);
            }
        }
        else {
            write (field.toString ());
        }
    }

    /**
     * Returns the URL of the source of the first Region found in
     * <code>fields</code>, or null if no element is a Region.
     */
    private URL urlOfFirstRegion (Object[] fields) {
        for (int i = 0; i < fields.length; ++i) {
            if (fields[i] instanceof Region) {
                Region r = (Region) fields[i];
                return r.getSource ().getURL ();
            }
        }
        return null;
    }

    /**
     * Expands the placeholders in a template and emits the result.
     * A null or empty template emits nothing.  <code>%n</code> is
     * replaced first, by the record number; <code>%u</code> is replaced
     * second, by the URL (or by the empty string when url is null), so
     * text inside the URL is never itself treated as a placeholder.
     */
    private void emitTemplate (String template, URL url, int record) throws IOException {
        if (template == null || template.length () == 0) {
            return;
        }

        template = Str.replace (template, "%n", String.valueOf (record));
        template = Str.replace (template, "%u", url != null ? url.toString () : "");
        emit (template);
    }

    /*
     * Testing
     *
    public static void main (String[] args) throws Exception {
        Pattern p = new Tagexp (args[0].replace ('_', ' ') );
        RecordTransformer records = new RecordTransformer (args[1]);
        for (int i=2; i<args.length; ++i) {
            Page page = new Page (new Link (args[i]));
            PatternMatcher m = p.match (page);
            for (Region r = m.nextMatch(); r != null; r = m.nextMatch())
                records.writeRecord (r.getFields (Pattern.groups), false);
        }
        records.close ();
    }
     */
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -