📄 charactertranslationtest.java

📁 HTML 解析包，可以很方便地解析 HTML，纯 Java 实现
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
            ret = new StringBuffer (string.length ());
            // newline instead of doublespaces
            index = 0;
            while ((index < string.length ()) && (-1 != (spaces = string.indexOf ("  ", index))))
            {
                ret.append ("        // " + string.substring (index, spaces));
                if (!string.substring (index, spaces).endsWith (nl))
                    ret.append (nl);
                while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
                    spaces++;
                index = spaces;
            }
            if (index < string.length ())
                ret.append ("        // " + string.substring (index));
            return (ret.toString ());
        }

        /**
         * Pad a string on the left with the given character to the length specified.
         * A string that is already at least <code>length</code> characters long
         * is returned unchanged; a negative <code>length</code> is tolerated.
         * @param string The string to pad
         * @param character The character to pad with.
         * @param length The size to pad to.
         * @return The padded string.
         */
        public String pad (String string, char character, int length)
        {
            StringBuffer ret;

            // clamp the initial capacity: a negative value would throw
            // NegativeArraySizeException instead of simply not padding
            ret = new StringBuffer (Math.max (length, 0));
            ret.append (string);
            for (int missing = length - ret.length (); missing > 0; missing--)
                ret.insert (0, character);

            return (ret.toString ());
        }

        /**
         * Convert the textual representation of a numeric character reference
         * to the source text of a Java character literal.
         * A string of the form <code>"&amp;#nnn;"</code> (including the quotes)
         * becomes <code>'&#92;uXXXX'</code>, where XXXX is the value in hexadecimal
         * padded on the left with zeros to at least four digits.
         * A string not of that form is returned unchanged. If the text between
         * <code>&amp;#</code> and <code>;</code> is not a decimal integer the stack
         * trace is printed and that text is returned.
         * @param string The numeric character reference (in quotes).
         * @return The character literal for the numeric character reference.
         */
        public String unicode (String string)
        {
            int code;

            if (string.startsWith ("\"&#") && string.endsWith (";\""))
            {
                // strip the leading "&# and the trailing ;"
                string = string.substring (3, string.length () - 2);
                try
                {
                    code = Integer.parseInt (string);
                    string = "'\\u" + pad (Integer.toHexString (code), '0', 4) + "'";
                }
                catch (NumberFormatException nfe)
                {
                    // parseInt is the only call here that can fail
                    nfe.printStackTrace ();
                }
            }

            return (string);
        }

        /**
         * Parse the sgml declaration for character entity reference
         * name, equivalent numeric character reference and a comment.
         * Emit a java hash table 'put' with the name as the key, the
         * numeric character as the value and comment the insertion
         * with the comment.
         * A declaration starting with <code>&lt;!--</code> is emitted as a
         * prettified comment; anything that does not have the expected
         * <code>&lt;!ENTITY name CDATA value comment</code> shape is written
         * to the output as whatever text remains at the point parsing stopped.
         * @param string The contents of the sgml declaration,
         * including the three character terminator.
         * @param out The sink for output.
         */
        public void extract (String string, PrintWriter out)
        {
            int space;
            String token;
            String code;

            if (string.startsWith ("<!--"))
                out.println (pretty (string.substring (4, string.length () - 3).trim ()));
            else if (string.startsWith ("<!ENTITY"))
            {
                // drop the "<!ENTITY" prefix and the three character terminator
                string = string.substring (8, string.length () - 3).trim ();
                if (-1 != (space = string.indexOf (" ")))
                {
                    token = string.substring (0, space);
                    string = string.substring (space).trim ();
                    if (string.startsWith ("CDATA"))
                    {
                        string = string.substring (5).trim ();
                        if (-1 != (space = string.indexOf (" ")))
                        {
                            code = string.substring (0, space).trim ();
                            code = unicode (code);
                            string = string.substring (space).trim ();
                            out.println (
                                "        new CharacterReference (\"" + token + "\","
                                // no token is larger than 8 characters - yet
                                + pad (code, ' ', code.length () + 9 - token.length ()) + "),"
                                + " // "
                                + pack (string));
                        }
                        else
                            out.println (string);
                    }
                    else
                        out.println (string);
                }
                else
                    out.println (string);
            }
            else
                out.println (string);
        }

        /**
         * Extract special characters.
         * Scan the string looking for substrings of the form:
         * <pre>
         * &lt;!ENTITY nbsp   CDATA "&amp;#160;" -- no-break space = non-breaking space, U+00A0 ISOnum --&gt;
         * </pre>
         * and emit a java definition for each.
         * Every span from a <code>&lt;</code> to the next <code>--&gt;</code>
         * is handed to {@link #extract}.
         * @param string The raw string from w3.org.
         * @param out The sink for output.
         */
        public void sgml (String string, PrintWriter out)
        {
            int index;
            int begin;
            int end;

            index = 0;
            while (-1 != (begin = string.indexOf ("<", index)))
            {
                if (-1 != (end = string.indexOf ("-->", begin)))
                {
                    extract (string.substring (begin, end + 3), out);
                    index = end + 3;
                }
                else
                    // no terminator after this '<', step past it
                    index = begin + 1;
            }
        }

        /**
         * Pull out text elements from the HTML.
         * The gathered text is translated and then scanned by {@link #sgml}.
         * @param out The sink for output.
         * @exception ParserException If the parser fails while
         * enumerating the nodes.
         */
        public void parse (PrintWriter out)
            throws
                ParserException
        {
            Node node;
            StringBuffer buffer = new StringBuffer (4096);

            // Run through an enumeration of html elements, and pick up
            // only those that are plain string.
            for (NodeIterator e = mParser.elements (); e.hasMoreNodes ();)
            {
                node = e.nextNode ();
                gather (node, buffer);
            }

            String text = translate (buffer.toString ());
            sgml (text, out);
        }
    }

    /**
     * Get the list of character references.
     * On the first call a java source file declaring the table is generated
     * in the user's home directory, compiled, loaded and the resulting
     * array is sorted and cached; later calls return the cached array.
     * Any failure along the way is reported through <code>fail()</code>.
     * @return The sorted character references.
     */
    public CharacterReference[] getReferences ()
    {
        final String class_name = "CharacterEntityReferenceList";
        String paths;
        String path;
        String source;
        PrintWriter out;
        Generate generate;
        SimpleClassLoader loader;
        Class hello;
        Field field;
        CharacterReference[] ret;

        ret = mReferences;
        if (null == ret)
        {
            paths = System.getProperty ("java.class.path");
            path = System.getProperty ("user.home");
            if (!path.endsWith (File.separator))
                path += File.separator;
            source = path + class_name + ".java";
            try
            {
                // create it
                generate = new Generate ();
                out = new PrintWriter (new FileWriter (source));
                try
                {
                    out.println ("import org.htmlparser.util.CharacterReference;");
                    out.println ();
                    out.println ("/** Generated by " + this.getClass ().getName () + " **/");
                    out.println ("public class " + class_name);
                    out.println ("{");
                    out.println ("    /**");
                    out.println ("     * Table mapping character to entity reference.");
                    out.println ("     */");
                    out.println ("    public static final CharacterReference[] mCharacterReferences =");
                    out.println ("    {");
                    generate.parse (out);
                    out.println ("    };");
                    out.println ("}");
                }
                finally
                {
                    // release the file even when parsing fails part way through
                    out.close ();
                }
                // compile it
                if (0 == com.sun.tools.javac.Main.compile (new String[] {"-classpath", paths, source}))
                {
                    try
                    {
                        // load it
                        loader = new SimpleClassLoader (path);
                        hello = loader.loadClass (class_name);
                        try
                        {
                            // get the references
                            field = hello.getField ("mCharacterReferences");
                            ret = (CharacterReference[])field.get (null);
                            Sort.QuickSort (ret);
                        }
                        catch (IllegalAccessException iae)
                        {
                            fail ("references not accessible");
                        }
                        catch (NoSuchFieldException nsfe)
                        {
                            fail ("references not found");
                        }
                    }
                    catch (ClassNotFoundException cnfe)
                    {
                        fail ("couldn't load class");
                    }
                    finally
                    {
                        // the compiled class is no longer needed once loaded
                        File classfile;
                        classfile = new File (path + class_name + ".class");
                        classfile.delete ();
                    }
                }
                else
                    fail ("couldn't compile class");
                mReferences = ret;
            }
            catch (IOException ioe)
            {
                fail ("couldn't write class");
            }
            catch (ParserException pe)
            {
                fail ("couldn't parse w3.org entities list");
            }
        }

        return (ret);
    }

    /**
     * Decode a named entity reference at the start of the string.
     */
    public void testInitialCharacterEntityReference ()
    {
        assertEquals (
            "character entity reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&divide; is the division sign."));
    }

    /**
     * Decode a decimal numeric reference at the start of the string.
     */
    public void testInitialNumericCharacterReference1 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#247; is the division sign."));
    }

    /**
     * Decode a decimal numeric reference with a leading zero.
     */
    public void testInitialNumericCharacterReference2 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#0247; is the division sign."));
    }

    /**
     * Decode a hex reference: lowercase x, lowercase digits.
     */
    public void testInitialHexNumericCharacterReference1 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#xf7; is the division sign."));
    }

    /**
     * Decode a hex reference: lowercase x, uppercase digits.
     */
    public void testInitialHexNumericCharacterReference2 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#xF7; is the division sign."));
    }

    /**
     * Decode a hex reference: lowercase x, leading zero, lowercase digits.
     */
    public void testInitialHexNumericCharacterReference3 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#x0f7; is the division sign."));
    }

    /**
     * Decode a hex reference: lowercase x, leading zero, uppercase digits.
     */
    public void testInitialHexNumericCharacterReference4 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#x0F7; is the division sign."));
    }

    /**
     * Decode a hex reference: uppercase X, lowercase digits.
     */
    public void testInitialHexNumericCharacterReference5 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#Xf7; is the division sign."));
    }

    /**
     * Decode a hex reference: uppercase X, uppercase digits.
     */
    public void testInitialHexNumericCharacterReference6 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#XF7; is the division sign."));
    }

    /**
     * Decode a hex reference: uppercase X, leading zero, lowercase digits.
     */
    public void testInitialHexNumericCharacterReference7 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#X0f7; is the division sign."));
    }

    /**
     * Decode a hex reference: uppercase X, leading zero, uppercase digits.
     */
    public void testInitialHexNumericCharacterReference8 ()
    {
        assertEquals (
            "numeric character reference at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&#X0F7; is the division sign."));
    }

    /**
     * Decode a named entity reference that lacks the terminating semicolon.
     */
    public void testInitialCharacterEntityReferenceWithoutSemi ()
    {
        assertEquals (
            "character entity reference without a semicolon at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode ("&divide is the division sign."));
    }
💿 文件大小 2128 K
👤 上传用户 hcwlxhyq
📂 所属分类其他
🏷️ 相关标签

#html #java
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -