📄 page.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
     * For details see <code>writeObject()</code>.     * @param in The object stream to decode.     * @exception IOException If there is a deserialization problem with     * the stream.     * @exception ClassNotFoundException If the deserialized class can't be     * located with the current classpath and class loader.     */    private void readObject (ObjectInputStream in)        throws            IOException,            ClassNotFoundException    {        boolean fromurl;        int offset;        String href;        URL url;        Cursor cursor;        fromurl = in.readBoolean ();        if (fromurl)        {            offset = in.readInt ();            href = (String)in.readObject ();            in.defaultReadObject ();            // open the URL            if (null != getUrl ())            {                url = new URL (getUrl ());                try                {                    setConnection (url.openConnection ());                }                catch (ParserException pe)                {                    throw new IOException (pe.getMessage ());                }            }            cursor = new Cursor (this, 0);            for (int i = 0; i < offset; i++)                try                {                    getCharacter (cursor);                }                catch (ParserException pe)                {                    throw new IOException (pe.getMessage ());                }            setUrl (href);        }        else        {            href = (String)in.readObject ();            in.defaultReadObject ();            setUrl (href);        }    }    /**     * Reset the page by resetting the source of characters.     */    public void reset ()    {        getSource ().reset ();        mIndex = new PageIndex (this); // todo: is this really necessary?    }    /**     * Close the page by destroying the source of characters.     * @exception IOException If destroying the source encounters an error.     
*/
    public void close () throws IOException
    {
        Source source;

        source = getSource ();
        if (null != source)
            source.destroy ();
    }

    /**
     * Clean up this page, releasing resources.
     * Calls <code>close()</code> and then the superclass finalizer.
     * @exception Throwable if <code>close()</code> throws an
     * <code>IOException</code>, or the superclass finalizer fails.
     */
    protected void finalize ()
        throws
            Throwable
    {
        try
        {
            close ();
        }
        finally
        {
            // finalizers are not chained automatically
            super.finalize ();
        }
    }

    /**
     * Get the connection, if any.
     * @return The connection object for this page, or null if this page
     * is built from a stream or a string.
     */
    public URLConnection getConnection ()
    {
        return (mConnection);
    }

    /**
     * Set the URLConnection to be used by this page.
     * Starts reading from the given connection.
     * This also resets the current url.
     * @param connection The connection to use.
     * It will be connected by this method.
     * @exception ParserException If the <code>connect()</code> method fails,
     * or an I/O error occurs opening the input stream or the character set
     * designated in the HTTP header is unsupported.
*/    public void setConnection (URLConnection connection)        throws            ParserException    {        Stream stream;        String type;        String charset;        String contentEncoding;        mConnection = connection;        try        {            getConnection ().connect ();        }        catch (UnknownHostException uhe)        {            throw new ParserException ("Connect to "                + mConnection.getURL ().toExternalForm () + " failed.", uhe);        }        catch (IOException ioe)        {            throw new ParserException ("Exception connecting to "                + mConnection.getURL ().toExternalForm ()                + " (" + ioe.getMessage () + ").", ioe);        }        type = getContentType ();        charset = getCharset (type);        try        {            contentEncoding = connection.getContentEncoding();            if ((null != contentEncoding)                && (-1 != contentEncoding.indexOf ("gzip")))            {                stream = new Stream (new GZIPInputStream (                    getConnection ().getInputStream ()));            }            else if ((null != contentEncoding)                && (-1 != contentEncoding.indexOf ("deflate")))            {                stream = new Stream (new InflaterInputStream (                    getConnection ().getInputStream (), new Inflater (true)));            }            else            {                stream = new Stream (getConnection ().getInputStream ());            }            try            {                mSource = new InputStreamSource (stream, charset);            }            catch (UnsupportedEncodingException uee)            {//                StringBuffer msg;////                msg = new StringBuffer (1024);//                msg.append (getConnection ().getURL ().toExternalForm ());//                msg.append (" has an encoding (");//                msg.append (charset);//                msg.append (") which is not supported, using ");//         
       msg.append (DEFAULT_CHARSET);//                System.out.println (msg.toString ());                charset = DEFAULT_CHARSET;                mSource = new InputStreamSource (stream, charset);            }        }        catch (IOException ioe)        {            throw new ParserException ("Exception getting input stream from "                + mConnection.getURL ().toExternalForm ()                + " (" + ioe.getMessage () + ").", ioe);        }        mUrl = connection.getURL ().toExternalForm ();        mIndex = new PageIndex (this);    }    /**     * Get the URL for this page.     * This is only available if the page has a connection     * (<code>getConnection()</code> returns non-null), or the document base has     * been set via a call to <code>setUrl()</code>.     * @return The url for the connection, or <code>null</code> if there is     * no conenction or the document base has not been set.     */    public String getUrl ()    {        return (mUrl);    }    /**     * Set the URL for this page.     * This doesn't affect the contents of the page, just the interpretation     * of relative links from this point forward.     * @param url The new URL.     */    public void setUrl (String url)    {        mUrl = url;    }    /**     * Gets the baseUrl.     * @return The base URL for this page, or <code>null</code> if not set.     */    public String getBaseUrl ()    {        return (mBaseUrl);    }    /**     * Sets the baseUrl.     * @param url The base url for this page.     */    public void setBaseUrl (String url)    {        mBaseUrl = url;    }    /**     * Get the source this page is reading from.     * @return The current source.     */    public Source getSource ()    {        return (mSource);    }    /**     * Try and extract the content type from the HTTP header.     * @return The content type.     
*/    public String getContentType ()    {        URLConnection connection;        String content;        String ret;        ret = DEFAULT_CONTENT_TYPE;        connection = getConnection ();        if (null != connection)        {            // can't use connection#getContentType            // see Bug #1467712 Page#getCharset never works            content = connection.getHeaderField ("Content-Type");            if (null != content)                ret = content;        }        return (ret);    }    /**     * Read the character at the given cursor position.     * The cursor position can be only behind or equal to the     * current source position.     * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,     * and updates the end-of-line index accordingly.     * Advances the cursor position by one (or two in the \r\n case).     * @param cursor The position to read at.     * @return The character at that position, and modifies the cursor to     * prepare for the next read. If the source is exhausted a zero is returned.     
* @exception ParserException If an IOException on the underlying source     * occurs, or an attempt is made to read characters in the future (the     * cursor position is ahead of the underlying stream)     */    public char getCharacter (Cursor cursor)        throws            ParserException    {        int i;        int offset;        char ret;        i = cursor.getPosition ();        offset = mSource.offset ();        if (offset == i)            try            {                i = mSource.read ();                if (Source.EOF == i)                    ret = EOF;                else                {                    ret = (char)i;                    cursor.advance ();                }            }            catch (IOException ioe)            {                throw new ParserException (                    "problem reading a character at position "                    + cursor.getPosition (), ioe);            }        else if (offset > i)        {            // historic read            try            {                ret = mSource.getCharacter (i);            }            catch (IOException ioe)            {                throw new ParserException (                    "can't read a character at position "                    + i, ioe);            }            cursor.advance ();        }        else            // hmmm, we could skip ahead, but then what about the EOL index            throw new ParserException (                "attempt to read future characters from source "                + i + " > " + mSource.offset ());        // handle \r        if ('\r' == ret)        {   // switch to single character EOL            ret = '\n';            // check for a \n in the next position            if (mSource.offset () == cursor.getPosition ())                try                {                    i = mSource.read ();                    if (Source.EOF == i)                    {                        // do nothing                    }                    else if ('\n' 
== (char)i)                        cursor.advance ();                    else                        try                        {                            mSource.unread ();                        }                        catch (IOException ioe)                        {                            throw new ParserException (                                "can't unread a character at position "                                + cursor.getPosition (), ioe);                        }                }                catch (IOException ioe)                {                    throw new ParserException (                        "problem reading a character at position "                        + cursor.getPosition (), ioe);                }            else                try                {                    if ('\n' == mSource.getCharacter (cursor.getPosition ()))                        cursor.advance ();                }                catch (IOException ioe)                {                    throw new ParserException (                        "can't read a character at position "                        + cursor.getPosition (), ioe);                }        }        if ('\n' == ret)            // update the EOL index in any case            mIndex.add (cursor);        return (ret);    }    /**     * Return a character.     * Handles end of lines (EOL) specially, retreating the cursor twice for     * the '\r\n' case.     * The cursor position is moved back by one (or two in the \r\n case).     * @param cursor The position to 'unread' at.     * @exception ParserException If an IOException on the underlying source     * occurs.     
*/
    public void ungetCharacter (Cursor cursor)
        throws
            ParserException
    {
        cursor.retreat ();
        final int position = cursor.getPosition ();

        try
        {
            // step back a second time when the \n just passed over is the
            // tail of a \r\n pair
            if (('\n' == mSource.getCharacter (position))
                && (0 != position)
                && ('\r' == mSource.getCharacter (position - 1)))
            {
                cursor.retreat ();
            }
        }
        catch (IOException ioe)
        {
            throw new ParserException (
                "can't read a character at position "
                + cursor.getPosition (), ioe);
        }
    }

    /**
     * Get the current encoding being used.
     * @return The encoding used to convert characters.
     */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -