⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmltransformer.java

📁 一个用java语言编写的网络爬虫程序
💻 JAVA
字号:
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx;

import java.io.File;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Writer;
import java.io.OutputStreamWriter;
//#endif JDK1.1
/*#ifdef JDK1.0
import java.io.PrintStream;
#endif JDK1.0*/

/**
 * Writes HTML regions to an output stream or file, optionally passing
 * each element through a chain of HTMLTransformers first.
 * <p>
 * Transformers are linked through <code>next</code>; every member of a
 * chain knows the chain's <code>head</code> and <code>tail</code>.  Only
 * the tail (the transformer whose <code>next</code> is null) holds the
 * output destination and the pending-output state.  Subclasses customize
 * the output by overriding {@link #handleElement}.
 */
public class HTMLTransformer {

//#ifdef JDK1.1
    private Writer stream;          // output stream
//#endif JDK1.1
/*#ifdef JDK1.0
    private PrintStream stream;    // output stream
#endif JDK1.0*/

    private boolean openedStream = false;
              // we opened the stream, so we'd better close it

    private RandomAccessFile readwrite; // output file

    private HTMLTransformer next;      // next HTMLTransformer in the filter chain
    private HTMLTransformer head;      // head of filter chain
    private HTMLTransformer tail;      // tail of filter chain

    // these fields are only valid on the tail element in the filter
    // chain
    private String content;         // content of page being printed
    private int emitStart, emitEnd; // start and end of pending region
                                    //   (the last region in the page which
                                    //    has been emit()ed but not actually
                                    //    written)
    private int transformEnd;       // end of region being transformed

    /**
     * Make an HTMLTransformer that writes pages to a
     * stream.
     * @param out Stream to receive HTML output
     */
    public HTMLTransformer (OutputStream out) {
        head = tail = this;
        next = null;
        setOutput (out);
    }

    /**
     * Make an HTMLTransformer that writes pages to a
     * file.
     * @param filename Name of file to receive HTML output
     * @exception IOException if file cannot be opened
     */
    public HTMLTransformer (String filename) throws IOException {
        head = tail = this;
        next = null;

        openFile (filename, false);
    }

    /**
     * Make an HTMLTransformer that writes pages to a
     * file.
     * @param filename Name of file to receive HTML output
     * @param seekable True if file should be opened for random access
     * @exception IOException if file cannot be opened
     */
    public HTMLTransformer (String filename, boolean seekable) throws IOException {
        head = tail = this;
        next = null;

        openFile (filename, seekable);
    }

    /**
     * Make an HTMLTransformer that writes pages to a
     * stream.
     * @param stream Stream to receive HTML output
     */
//#ifdef JDK1.1
    public HTMLTransformer (Writer stream) {
        head = tail = this;
        next = null;
        setOutput (stream);
    }
//#endif JDK1.1

    /**
     * Make an HTMLTransformer that writes pages to a
     * downstream HTMLTransformer.  Use this constructor
     * to chain together several HTMLTransformers.
     * @param next HTMLTransformer to receive HTML output
     */
    public HTMLTransformer (HTMLTransformer next) {
        this.next = next;
        tail = next != null ? next.tail : this;
        // this transformer becomes the head of every transformer downstream
        for (HTMLTransformer u = this; u != null; u = u.next)
            u.head = this;
    }

    /**
     * Opens the named file as this transformer's output, truncating it
     * first, and records that the transformer owns (and must close) it.
     * @param filename Name of file to receive HTML output
     * @param seekable True to write through a RandomAccessFile, false to
     * write through a stream
     * @exception IOException if file cannot be opened
     */
    private void openFile (String filename, boolean seekable) throws IOException {
        File file = new File (filename);

        // open a stream first, to truncate the file to 0
        OutputStream out = SecurityPolicy.getPolicy ().writeFile (file, false);

        if (!seekable)
            setOutput (out);
        else {
            out.close ();
            RandomAccessFile raf = SecurityPolicy.getPolicy ().readWriteFile (file);
            setRandomAccessFile (raf);
        }

        openedStream = true;
    }

//#ifdef JDK1.1
    /**
     * Sets the output destination of the chain's tail to a byte stream.
     * The stream is wrapped in an OutputStreamWriter created without an
     * explicit encoding.
     * @param out Stream to receive HTML output
     */
    public void setOutput (OutputStream out) {
        if (next == null)
            stream = new OutputStreamWriter (out);
        else
            next.setOutput (out);
    }

    /**
     * Sets the output destination of the chain's tail to a Writer.
     * @param out Writer to receive HTML output
     */
    public void setOutput (Writer out) {
        if (next == null)
            stream = out;
        else
            next.setOutput (out);
    }

    /**
     * Gets the Writer used by the chain's tail.
     * @return the tail's Writer (null if none has been set)
     */
    public Writer getOutput () {
        return tail.stream;
    }
//#endif JDK1.1
/*#ifdef JDK1.0
    public void setOutput (OutputStream out) {
        if (next == null)
            stream = new PrintStream (out);
        else
            next.setOutput (out);
    }

    public OutputStream getOutput () {
        return tail.stream;
    }
#endif JDK1.0*/

    /**
     * Sets the random-access file used by the chain's tail.
     * @param raf File to receive HTML output
     */
    public void setRandomAccessFile (RandomAccessFile raf) {
        if (next == null)
            readwrite = raf;
        else
            next.setRandomAccessFile (raf);
    }

    /**
     * Gets the random-access file used by the chain's tail.
     * @return the tail's random-access file (null if none has been set)
     */
    public RandomAccessFile getRandomAccessFile () {
        return tail.readwrite;
    }

    /**
     * Writes a literal string through the HTML transformer
     * (without parsing it or transforming it).
     * @param string String to write
     * @exception IOException if an I/O error occurs
     */
    public synchronized void write (String string) throws IOException {
        if (next == null)
            emit (string);
        else
            next.write (string);
    }

    /**
     * Writes a chunk of HTML through the HTML transformer.
     * The tail's per-page state is saved before the region is processed
     * and restored afterwards, so this method may be called while another
     * region is being written.
     * @param region Region to write
     * @exception IOException if an I/O error occurs
     */
    public synchronized void write (Region region) throws IOException {
        if (next == null) {
            emitPendingRegion ();

            String oldContent = content;
            int oldEmitStart = emitStart;
            int oldEmitEnd = emitEnd;
            int oldTransformEnd = transformEnd;

            try {
                content = region.getSource().getContent ();
                emitStart = emitEnd = region.getStart ();
                transformEnd = region.getEnd ();

                processElementsInRegion (region.getRootElement(),
                                         region.getStart(),
                                         region.getEnd());

                emitPendingRegion ();
            }
            finally {
                // restore the enclosing state even when processing fails,
                // so the tail is never left pointing at the failed region
                content = oldContent;
                emitStart = oldEmitStart;
                emitEnd = oldEmitEnd;
                transformEnd = oldTransformEnd;
            }
        }
        else
            next.write (region);
    }

    /**
     * Writes a page through the HTML transformer.
     * @param page Page to write
     * @exception IOException if an I/O error occurs
     */
    public synchronized void writePage (Page page) throws IOException {
        if (next == null) {
            write (page);
        }
        else
            next.writePage (page);
    }

    /**
     * Flushes transformer to its destination stream.
     * Empties any buffers in the transformer chain.
     * @exception IOException if an I/O error occurs
     */
    public synchronized void flush () throws IOException {
        if (next == null) {
            emitPendingRegion ();
            if (stream != null)
                stream.flush ();
        }
        else
            next.flush ();
    }

    /**
     * Close the transformer.  Flushes all buffered data
     * to disk by calling flush().  This call may be
     * time-consuming!  Don't use the transformer again after
     * closing it.  The stream and file are closed only if this
     * transformer opened them itself; they are closed even if the
     * flush fails.
     * @exception IOException if an I/O error occurs
     */
    public synchronized void close () throws IOException {
        try {
            flush ();
        }
        finally {
            if (next != null)
                next.close ();
            else if (openedStream) {
                // close both outputs even if closing the first one fails
                try {
                    if (stream != null)
                        stream.close ();
                }
                finally {
                    if (readwrite != null)
                        readwrite.close ();
                }
            }
        }
    }

    /**
     * Finalizes the transformer (calling close()).
     */
    protected void finalize() throws Throwable {
        try {
            close ();
        }
        finally {
            super.finalize ();
        }
    }

    /**
     * Get the file pointer of the chain's random-access file.
     * This call does not write out the pending region first.
     * @return current file pointer
     * @exception IOException if this transformer not opened for random access
     */
    public long getFilePointer () throws IOException {
        // the file lives on the tail (see setRandomAccessFile)
        RandomAccessFile raf = tail.readwrite;
        if (raf == null)
            throw new IOException ("HTMLTransformer not opened for random access");
        return raf.getFilePointer ();
    }

    /**
     * Seek to a file position in the chain's random-access file.
     * This call does not write out the pending region first.
     * @param pos file position to seek
     * @exception IOException if this transformer not opened for random access
     */
    public void seek (long pos) throws IOException {
        // the file lives on the tail (see setRandomAccessFile)
        RandomAccessFile raf = tail.readwrite;
        if (raf == null)
            throw new IOException ("HTMLTransformer not opened for random access");
        raf.seek (pos);
    }

    /**
     * Transform an element by passing it through the entire
     * filter chain.
     * @param elem Element to be transformed
     * @exception IOException if an I/O error occurs
     */
    protected void transformElement (Element elem) throws IOException {
        head.handleElement (elem);
    }

    /**
     * Transform the contents of an element.  Passes
     * the child elements through the filter chain
     * and emits the text between them.
     * @param elem Element whose contents should be transformed
     * @exception IOException if an I/O error occurs
     */
    protected void transformContents (Element elem) throws IOException {
        Tag startTag = elem.getStartTag ();
        Tag endTag = elem.getEndTag ();

        // contents run from the end of the start tag to the start of the
        // end tag, or to the end of the element when there is no end tag
        tail.processElementsInRegion (elem.getChild(),
                           startTag.getEnd(),
                           endTag != null ? endTag.getStart() : elem.getEnd());
    }

    /**
     * Handle the transformation of an HTML element.
     * Override this method to modify the HTML as it is
     * written.
     * @param elem Element to transform
     * @exception IOException if an I/O error occurs
     */
    protected void handleElement (Element elem) throws IOException {
        if (next == null) {
            Tag startTag = elem.getStartTag ();
            Tag endTag = elem.getEndTag ();

            emit (startTag);
            transformContents (elem);
            if (endTag != null)
                emit (endTag);
        }
        else
            next.handleElement (elem);
    }

    /**
     * Emit a region on the transformer chain's final output.
     * (The region isn't passed through the chain.)
     * @param r Region to emit
     * @exception IOException if an I/O error occurs
     */
    protected void emit (Region r) throws IOException {
        tail.emitInternal (r.getSource().getContent(), r.getStart(), r.getEnd ());
    }

    /**
     * Emit a string on the transformer chain's final output.
     * @param string String to emit
     * @exception IOException if an I/O error occurs
     */
    protected void emit (String string) throws IOException {
        tail.emitInternal (string, 0, string.length());
    }

    /**
     * Walks a list of sibling elements, emitting the text between them
     * and sending each element through the filter chain.  Must be called
     * on the tail.
     * @param elem first element to process (may be null)
     * @param start offset in the current content where the region starts
     * @param end offset in the current content where the region ends
     * @exception IOException if an I/O error occurs
     */
    private void processElementsInRegion (Element elem, int start, int end) throws IOException {
        if (this != tail)
            throw new RuntimeException ("processElementsInRegion not called on tail");

        int p = start;

        // elements of the page being written are clipped to the region
        // passed to write(Region)
        if (elem != null && elem.getSource().getContent() == content)
            end = Math.min (end, transformEnd);

        while (elem != null && elem.getStartTag().getEnd() <= end) {
            // text between the previous element and this one
            emitInternal (content, p, elem.getStart());
            transformElement (elem);
            p = elem.getEnd ();
            elem = elem.getNext ();
        }
        // text after the last processed element
        emitInternal (content, Math.min (p, end), end);
    }

    /**
     * Emits part of a string.  Text taken from the current content is
     * only recorded as (or merged into) the pending region; any other
     * string is written immediately after the pending region is written.
     * Must be called on the tail.
     * @param str string to take the text from
     * @param start offset of first character to emit
     * @param end offset just past the last character to emit
     * @exception IOException if an I/O error occurs
     */
    private void emitInternal (String str, int start, int end) throws IOException {
        if (this != tail)
            throw new RuntimeException ("emitInternal not called on tail");

        // identity comparison: is str the very content string being written?
        if (str == content) {
            start = Math.min (start, transformEnd);
            end = Math.min (end, transformEnd);

            if (start == emitEnd)
                emitEnd = end; // just extend the pending emit region
            else {
                emitPendingRegion ();
                emitStart = start;
                emitEnd = end;
            }
        }
        else {
            emitPendingRegion ();
            writeStream (str.substring (start, end));
        }
    }

    /**
     * Writes out the pending region of the current content, if it is
     * not empty, and marks it as written.  Must be called on the tail.
     * @exception IOException if an I/O error occurs
     */
    private void emitPendingRegion () throws IOException {
        if (this != tail)
            throw new RuntimeException ("emitPendingRegion not called on tail");

        if (emitStart != emitEnd) {
            writeStream (content.substring (emitStart, emitEnd));
            emitStart = emitEnd;
        }
    }

    /**
     * Writes a string to the stream if one is set, otherwise to the
     * random-access file.
     * @param s String to write
     * @exception IOException if an I/O error occurs or no output is set
     */
    private void writeStream (String s) throws IOException {
        if (stream != null) {
            //#ifdef JDK1.1
            stream.write (s);
            //#endif JDK1.1
            /*#ifdef JDK1.0
            stream.print (s);
            #endif JDK1.0*/
        }
        else if (readwrite != null)
            // NOTE: RandomAccessFile.writeBytes writes one byte per
            // character and discards the high eight bits of each char
            readwrite.writeBytes (s);
        else
            throw new IOException ("HTMLTransformer has no output stream or file");
    }

    /*
     * Testing
     *
    public static void main (String[] args) throws Exception {
        Link link = new Link (args[0]);
        Page page = new Page (link);

        OutputStream out = (args.length >= 2)
            ? (OutputStream)new java.io.FileOutputStream (args[1])
            : (OutputStream)System.out;
        HTMLTransformer unparser = new TestTransformer (out);

        int len = page.getLength();
        unparser.write (new Region (page, 0, 3*len/4));

        unparser.close ();
    }
     */
}

    /*
     * Testing
     *
class TestTransformer extends HTMLTransformer {
    public TestTransformer (OutputStream out) {
        super (out);
    }

    protected void handleElement (Element elem) throws IOException {
        System.out.println ("handling <" + elem.getTagName() + ">");
        super.handleElement (elem);
    }
}
    */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -