📄 charactertranslationtest.java

📁 html解析包可以很方便的解析html 纯java 实现
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
// HTMLParser Library $Name: v1_6_20051112 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v $// $Author: derrickoswald $// $Date: 2004/07/31 16:42:32 $// $Revision: 1.46 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.tests.utilTests;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.PrintStream;import java.io.PrintWriter;import java.lang.reflect.Field;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.Random;import org.htmlparser.Node;import org.htmlparser.Parser;import org.htmlparser.Remark;import org.htmlparser.Tag;import org.htmlparser.Text;import org.htmlparser.tags.LinkTag;import org.htmlparser.tests.ParserTestCase;import org.htmlparser.util.CharacterReference;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.util.Translate;import org.htmlparser.util.sort.Sort;public class 
CharacterTranslationTest    extends        ParserTestCase{    static    {        System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest");    }    /**     * The list of references.     */    protected static CharacterReference[] mReferences;        public CharacterTranslationTest (String name)    {        super (name);    }    /**     * Class loader to access the compiled character references.     */    class SimpleClassLoader extends ClassLoader    {        /**         * The class path for this class loader.         */        String mRoot;        public SimpleClassLoader (String root)        {            if (!root.endsWith (File.separator))                root += File.separator;            mRoot = root;        }        public Class loadClass (String className)            throws                ClassNotFoundException        {            return (loadClass (className, true));        }                public synchronized Class loadClass (String className, boolean resolveIt)            throws                ClassNotFoundException        {            byte data[];            FileInputStream in;            Class ret;                        try            {                // try system class loader                ret = super.findSystemClass (className);            }            catch (ClassNotFoundException e)            {                try                {                    in = new FileInputStream (mRoot + className + ".class");                    data = new byte[in.available ()];                    in.read (data);                    in.close ();                    ret = defineClass (className, data, 0, data.length);                    if (null == ret)                        throw new ClassFormatError ();                    if (resolveIt)                        resolveClass (ret);                }                catch (IOException ioe)                {                    throw new ClassNotFoundException ();                
}            }                        return (ret);        }    }    /**     * Create a character reference translation class source file.     * Usage:     * <pre>     *     java -classpath .:lib/htmlparser.jar Generate > Translate.java     * </pre>     * Derived from HTMLStringFilter.java provided as an example with the     * htmlparser.jar file available at     * <a href="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>     * written by Somik Raha (     * <a href='mailto:somik@industriallogic.com?     * subject=htmlparser'>somik@industriallogic. com</a>     * <a href="http://industriallogic.com">http://industriallogic.com</a>).     * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>     */    public class Generate    {        /**         * The working parser.         */        protected Parser mParser;        protected String nl = System.getProperty ("line.separator", "\n");                /**         * Create a Generate object.         * Sets up the generation by creating a new <code>Parser</code> pointed         * at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>         * with the standard scanners registered.         */        public Generate ()            throws ParserException        {            mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html");        }        /**         * Translate character references.         * After generating the Translate class we could use it         * to do this job, but that would involve a bootstrap         * problem, so this method does the reference conversion         * for a very tiny subset (enough  to understand the w3.org         * page).         * @param string The raw string.         * @return The string with character references fixed.         
*/        public String translate (String string)        {            int index;            int amp;            StringBuffer ret;            ret = new StringBuffer (4096);            index = 0;            while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))            {                // include the part before the special character                ret.append (string.substring (index, amp));                if (string.startsWith ("&nbsp;", amp))                {                    ret.append (" ");                    index = amp + 6;                }                else if (string.startsWith ("&lt;", amp))                {                    ret.append ("<");                    index = amp + 4;                }                else if (string.startsWith ("&gt;", amp))                {                    ret.append (">");                    index = amp + 4;                }                else if (string.startsWith ("&amp;", amp))                {                    ret.append ("&");                    index = amp + 5;                }                else if (string.startsWith ("&quote;", amp))                {                    ret.append ("\"");                    index = amp + 7;                }                else if (string.startsWith ("&divide;", amp))                {                    //ret.append ('\u00F7');                    //index = amp + 8;                    ret.append ("&");                    index = amp + 1;                }                else if (string.startsWith ("&copy;", amp))                {                    //ret.append ('\u00A9');                    //index = amp + 6;                    ret.append ("&");                    index = amp + 1;                }                else                {                    System.out.println ("unknown special character starting with " + string.substring (amp, amp + 7));                    ret.append ("&");                    index = amp + 1;                }            
}            ret.append (string.substring (index));            return (ret.toString ());        }        public void gather (Node node, StringBuffer buffer)        {            NodeList children;            if (node instanceof Text)            {                // Node is a plain string                // Cast it to an HTMLText                Text stringNode = (Text)node;                // Retrieve the data from the object                buffer.append (stringNode.getText ());            }            else if (node instanceof LinkTag)            {                // Node is a link                // Cast it to an HTMLLinkTag                LinkTag linkNode = (LinkTag)node;                // Retrieve the data from the object and print it                buffer.append (linkNode.getLinkText ());            }            else if (node instanceof Tag)            {                String name = ((Tag)node).getTagName ();                if (name.equals ("BR") || name.equals ("P"))                    buffer.append (nl);                else                {                    children = ((Tag)node).getChildren ();                    if (null != children)                        for (int i = 0; i < children.size (); i++)                            gather (children.elementAt (i), buffer);                }            }            else if (node instanceof Remark)            {            }            else            {                System.out.println ();                System.out.println(node.toString());            }        }        /**         * Find the lowest index of whitespace (space or newline).         * @param string The string to look in.         * @param index Where to start looking.         * @return -1 if there is no whitespace, the minimum index otherwise.         
*/
        public int indexOfWhitespace (String string, int index)
        {
            // first blank and first line separator at or after index
            final int blank = string.indexOf (' ', index);
            final int newline = string.indexOf (nl, index);

            if (-1 == blank)
                return (newline);
            if (-1 == newline)
                return (blank);

            // both present: the earlier one wins
            return ((blank < newline) ? blank : newline);
        }

        /**
         * Collapse a raw comment onto a single line.
         * Comments in the sgml table look like:
         * <pre>
         * -- latin capital letter I with diaeresis,
         *             U+00CF ISOlat1
         * </pre>
         * A leading "-- " is dropped and every run of whitespace that
         * begins at a space or line separator is replaced by one space.
         * @param string The raw comment.
         * @return The single line comment.
         */
        public String pack (String string)
        {
            final StringBuffer packed = new StringBuffer (string.length ());
            final String text = string.startsWith ("-- ") ? string.substring (3) : string;
            final int length = text.length ();
            int cursor = 0;

            while (cursor < length)
            {
                int gap = indexOfWhitespace (text, cursor);
                if (-1 == gap)
                    break;

                // copy the word, then a single separating space
                packed.append (text.substring (cursor, gap));
                packed.append (" ");

                // step over the whole whitespace run
                while ((gap < length) && Character.isWhitespace (text.charAt (gap)))
                    gap++;
                cursor = gap;
            }

            // whatever follows the last whitespace run
            if (cursor < length)
                packed.append (text.substring (cursor));

            return (packed.toString ());
        }

        /**
         * Pretty up a comment string.
         * @param string The comment to operate on.
         * @return The beautiful comment string.
         */
        public String pretty (String string)
        {
            int index;
            int spaces;
            StringBuffer ret;
12 3 4 下一页
💿 文件大小 2128 K
👤 上传用户 hcwlxhyq
📂 所属分类其他
🏷️ 相关标签

#html #java
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -