📄 translate.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $// $Author: derrickoswald $// $Date: 2006/06/04 19:17:21 $// $Revision: 1.47 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.util;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.PrintStream;import java.io.PrintWriter;import java.io.Reader;import java.io.UnsupportedEncodingException;import org.htmlparser.util.sort.Sort;/** * Extended character entity reference. * Handles kernels within other strings, just for lookup purposes. */class CharacterReferenceEx extends CharacterReference{    /**     * The starting point in the string.     */    protected int mStart;    /**     * The ending point in the string.     */    protected int mEnd;    /**     * Zero args constructor.     * This object is only ever used after setting the kernel, start and end.     */    public CharacterReferenceEx ()    {        super ("", 0);    }    /**     * Set the starting point of the kernel.     */    public void setStart (int start)    {        mStart = start;    }    /**     * Set the supposed ending point.     * This only specifies an upper bound on the kernel length.     */    public void setEnd (int end)    {        mEnd = end;    }    /**     * Get this CharacterReference's kernel.     * @return The kernel in the equivalent character entity reference.     */    public String getKernel ()    {        return (mKernel.substring (mStart, mEnd));    }    //    // Ordered interface    //    /**     * Compare one reference to another.     * @see org.htmlparser.util.sort.Ordered     */    public int compare (Object that)    {        CharacterReference r;        String kernel;        int length;        int ret;        ret = 0;        r = (CharacterReference)that;        kernel = r.getKernel ();        length = kernel.length ();        for (int i = mStart, j = 0; i < mEnd; i++, j++)        {            if (j >= length)            {                ret = 1;                break;            }            ret = mKernel.charAt (i) - kernel.charAt (j);            if (0 != ret)                break;        }        return (ret);    }}/** * Translate numeric character references and character entity references to unicode characters. * Based on tables found at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html"> * http://www.w3.org/TR/REC-html40/sgml/entities.html</a> * <p>Typical usage: * <pre> *      String s = Translate.decode (getTextFromHtmlPage ()); * </pre> * or * <pre> *      String s = "&lt;HTML&gt;" + Translate.encode (getArbitraryText ()) + "&lt;/HTML&gt;"; * </pre> */public class Translate{    /**     * If this member is set <code>true</code>, decoding of streams is     * done line by line in order to reduce the maximum memory required.     */    static public boolean DECODE_LINE_BY_LINE = false;    /**     * If this member is set <code>true</code>, encoding of numeric character     * references uses hexadecimal digits, i.e. &amp;#x25CB;, instead of decimal     * digits.     */    static public boolean ENCODE_HEXADECIMAL = false;    /**     * Table mapping entity reference kernel to character.     * This is sorted by kernel when the class is loaded.     */    protected static final CharacterReference[] mCharacterReferences =    {        // Portions © International Organization for Standardization 1986        // Permission to copy in any form is granted for use with        // conforming SGML systems and applications as defined in        // ISO 8879, provided this notice is included in all copies.        // Character entity set. Typical invocation:        // <!ENTITY % HTMLlat1 PUBLIC        // "-//W3C//ENTITIES Latin 1//EN//HTML">        // %HTMLlat1;        new CharacterReference ("nbsp",     '\u00a0'), // no-break space = non-breaking space, U+00A0 ISOnum        new CharacterReference ("iexcl",    '\u00a1'), // inverted exclamation mark, U+00A1 ISOnum        new CharacterReference ("cent",     '\u00a2'), // cent sign, U+00A2 ISOnum        new CharacterReference ("pound",    '\u00a3'), // pound sign, U+00A3 ISOnum        new CharacterReference ("curren",   '\u00a4'), // currency sign, U+00A4 ISOnum        new CharacterReference ("yen",      '\u00a5'), // yen sign = yuan sign, U+00A5 ISOnum        new CharacterReference ("brvbar",   '\u00a6'), // broken bar = broken vertical bar, U+00A6 ISOnum        new CharacterReference ("sect",     '\u00a7'), // section sign, U+00A7 ISOnum        new CharacterReference ("uml",      '\u00a8'), // diaeresis = spacing diaeresis, U+00A8 ISOdia        new CharacterReference ("copy",     '\u00a9'), // copyright sign, U+00A9 ISOnum        new CharacterReference ("ordf",     '\u00aa'), // feminine ordinal indicator, U+00AA ISOnum        new CharacterReference ("laquo",    '\u00ab'), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum        new CharacterReference ("not",      '\u00ac'), // not sign, U+00AC ISOnum        new CharacterReference ("shy",      '\u00ad'), // soft hyphen = discretionary hyphen, U+00AD ISOnum        new CharacterReference ("reg",      '\u00ae'), // registered sign = registered trade mark sign, U+00AE ISOnum        new CharacterReference ("macr",     '\u00af'), // macron = spacing macron = overline = APL overbar, U+00AF ISOdia        new CharacterReference ("deg",      '\u00b0'), // degree sign, U+00B0 ISOnum        new CharacterReference ("plusmn",   '\u00b1'), // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum        new CharacterReference ("sup2",     '\u00b2'), // superscript two = superscript digit two = squared, U+00B2 ISOnum        new CharacterReference ("sup3",     '\u00b3'), // superscript three = superscript digit three = cubed, U+00B3 ISOnum        new CharacterReference ("acute",    '\u00b4'), // acute accent = spacing acute, U+00B4 ISOdia        new CharacterReference ("micro",    '\u00b5'), // micro sign, U+00B5 ISOnum        new CharacterReference ("para",     '\u00b6'), // pilcrow sign = paragraph sign, U+00B6 ISOnum        new CharacterReference ("middot",   '\u00b7'), // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum        new CharacterReference ("cedil",    '\u00b8'), // cedilla = spacing cedilla, U+00B8 ISOdia        new CharacterReference ("sup1",     '\u00b9'), // superscript one = superscript digit one, U+00B9 ISOnum        new CharacterReference ("ordm",     '\u00ba'), // masculine ordinal indicator, U+00BA ISOnum        new CharacterReference ("raquo",    '\u00bb'), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum        new CharacterReference ("frac14",   '\u00bc'), // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum        new CharacterReference ("frac12",   '\u00bd'), // vulgar fraction one half = fraction one half, U+00BD ISOnum        new CharacterReference ("frac34",   '\u00be'), // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum        new CharacterReference ("iquest",   '\u00bf'), // inverted question mark = turned question mark, U+00BF ISOnum        new CharacterReference ("Agrave",   '\u00c0'), // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1        new CharacterReference ("Aacute",   '\u00c1'), // latin capital letter A with acute, U+00C1 ISOlat1        new CharacterReference ("Acirc",    '\u00c2'), // latin capital letter A with circumflex, U+00C2 ISOlat1        new CharacterReference ("Atilde",   '\u00c3'), // latin capital letter A with tilde, U+00C3 ISOlat1        new CharacterReference ("Auml",     '\u00c4'), // latin capital letter A with diaeresis, U+00C4 ISOlat1        new CharacterReference ("Aring",    '\u00c5'), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1        new CharacterReference ("AElig",    '\u00c6'), // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1        new CharacterReference ("Ccedil",   '\u00c7'), // latin capital letter C with cedilla, U+00C7 ISOlat1        new CharacterReference ("Egrave",   '\u00c8'), // latin capital letter E with grave, U+00C8 ISOlat1        new CharacterReference ("Eacute",   '\u00c9'), // latin capital letter E with acute, U+00C9 ISOlat1        new CharacterReference ("Ecirc",    '\u00ca'), // latin capital letter E with circumflex, U+00CA ISOlat1        new CharacterReference ("Euml",     '\u00cb'), // latin capital letter E with diaeresis, U+00CB ISOlat1        new CharacterReference ("Igrave",   '\u00cc'), // latin capital letter I with grave, U+00CC ISOlat1        new CharacterReference ("Iacute",   '\u00cd'), // latin capital letter I with acute, U+00CD ISOlat1        new CharacterReference ("Icirc",    '\u00ce'), // latin capital letter I with circumflex, U+00CE ISOlat1        new CharacterReference ("Iuml",     '\u00cf'), // latin capital letter I with diaeresis, U+00CF ISOlat1        new CharacterReference ("ETH",      '\u00d0'), // latin capital letter ETH, U+00D0 ISOlat1        new CharacterReference ("Ntilde",   '\u00d1'), // latin capital letter N with tilde, U+00D1 ISOlat1        new CharacterReference ("Ograve",   '\u00d2'), // latin capital letter O with grave, U+00D2 ISOlat1        new CharacterReference ("Oacute",   '\u00d3'), // latin capital letter O with acute, U+00D3 ISOlat1        new CharacterReference ("Ocirc",    '\u00d4'), // latin capital letter O with circumflex, U+00D4 ISOlat1        new CharacterReference ("Otilde",   '\u00d5'), // latin capital letter O with tilde, U+00D5 ISOlat1        new CharacterReference ("Ouml",     '\u00d6'), // latin capital letter O with diaeresis, U+00D6 ISOlat1        new CharacterReference ("times",    '\u00d7'), // multiplication sign, U+00D7 ISOnum        new CharacterReference ("Oslash",   '\u00d8'), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1        new CharacterReference ("Ugrave",   '\u00d9'), // latin capital letter U with grave, U+00D9 ISOlat1        new CharacterReference ("Uacute",   '\u00da'), // latin capital letter U with acute, U+00DA ISOlat1        new CharacterReference ("Ucirc",    '\u00db'), // latin capital letter U with circumflex, U+00DB ISOlat1        new CharacterReference ("Uuml",     '\u00dc'), // latin capital letter U with diaeresis, U+00DC ISOlat1        new CharacterReference ("Yacute",   '\u00dd'), // latin capital letter Y with acute, U+00DD ISOlat1        new CharacterReference ("THORN",    '\u00de'), // latin capital letter THORN, U+00DE ISOlat1        new CharacterReference ("szlig",    '\u00df'), // latin small letter sharp s = ess-zed, U+00DF ISOlat1        new CharacterReference ("agrave",   '\u00e0'), // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1        new CharacterReference ("aacute",   '\u00e1'), // latin small letter a with acute, U+00E1 ISOlat1        new CharacterReference ("acirc",    '\u00e2'), // latin small letter a with circumflex, U+00E2 ISOlat1        new CharacterReference ("atilde",   '\u00e3'), // latin small letter a with tilde, U+00E3 ISOlat1        new CharacterReference ("auml",     '\u00e4'), // latin small letter a with diaeresis, U+00E4 ISOlat1        new CharacterReference ("aring",    '\u00e5'), // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1        new CharacterReference ("aelig",    '\u00e6'), // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1        new CharacterReference ("ccedil",   '\u00e7'), // latin small letter c with cedilla, U+00E7 ISOlat1        new CharacterReference ("egrave",   '\u00e8'), // latin small letter e with grave, U+00E8 ISOlat1        new CharacterReference ("eacute",   '\u00e9'), // latin small letter e with acute, U+00E9 ISOlat1        new CharacterReference ("ecirc",    '\u00ea'), // latin small letter e with circumflex, U+00EA ISOlat1        new CharacterReference ("euml",     '\u00eb'), // latin small letter e with diaeresis, U+00EB ISOlat1        new CharacterReference ("igrave",   '\u00ec'), // latin small letter i with grave, U+00EC ISOlat1        new CharacterReference ("iacute",   '\u00ed'), // latin small letter i with acute, U+00ED ISOlat1        new CharacterReference ("icirc",    '\u00ee'), // latin small letter i with circumflex, U+00EE ISOlat1        new CharacterReference ("iuml",     '\u00ef'), // latin small letter i with diaeresis, U+00EF ISOlat1        new CharacterReference ("eth",      '\u00f0'), // latin small letter eth, U+00F0 ISOlat1        new CharacterReference ("ntilde",   '\u00f1'), // latin small letter n with tilde, U+00F1 ISOlat1        new CharacterReference ("ograve",   '\u00f2'), // latin small letter o with grave, U+00F2 ISOlat1        new CharacterReference ("oacute",   '\u00f3'), // latin small letter o with acute, U+00F3 ISOlat1        new CharacterReference ("ocirc",    '\u00f4'), // latin small letter o with circumflex, U+00F4 ISOlat1        new CharacterReference ("otilde",   '\u00f5'), // latin small letter o with tilde, U+00F5 ISOlat1        new CharacterReference ("ouml",     '\u00f6'), // latin small letter o with diaeresis, U+00F6 ISOlat1        new CharacterReference ("divide",   '\u00f7'), // division sign, U+00F7 ISOnum        new CharacterReference ("oslash",   '\u00f8'), // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1        new CharacterReference ("ugrave",   '\u00f9'), // latin small letter u with grave, U+00F9 ISOlat1        new CharacterReference ("uacute",   '\u00fa'), // latin small letter u with acute, U+00FA ISOlat1        new CharacterReference ("ucirc",    '\u00fb'), // latin small letter u with circumflex, U+00FB ISOlat1        new CharacterReference ("uuml",     '\u00fc'), // latin small letter u with diaeresis, U+00FC ISOlat1        new CharacterReference ("yacute",   '\u00fd'), // latin small letter y with acute, U+00FD ISOlat1        new CharacterReference ("thorn",    '\u00fe'), // latin small letter thorn, U+00FE ISOlat1        new CharacterReference ("yuml",     '\u00ff'), // latin small letter y with diaeresis, U+00FF ISOlat1        // Mathematical, Greek and Symbolic characters for HTML        // Character entity set. Typical invocation:        // <!ENTITY % HTMLsymbol PUBLIC        // "-//W3C//ENTITIES Symbols//EN//HTML">        // %HTMLsymbol;        // Portions © International Organization for Standardization 1986:        // Permission to copy in any form is granted for use with        // conforming SGML systems and applications as defined in        // ISO 8879, provided this notice is included in all copies.        // Relevant ISO entity set is given unless names are newly introduced.        // New names (i.e., not in ISO 8879 list) do not clash with any        // existing ISO 8879 entity names. ISO 10646 character numbers        // are given for each character, in hex. CDATA values are decimal        // conversions of the ISO 10646 values and refer to the document        // character set. Names are ISO 10646 names.        // Latin Extended-B        new CharacterReference ("fnof",     '\u0192'), // latin small f with hook = function = florin, U+0192 ISOtech        // Greek        new CharacterReference ("Alpha",    '\u0391'), // greek capital letter alpha, U+0391        new CharacterReference ("Beta",     '\u0392'), // greek capital letter beta, U+0392        new CharacterReference ("Gamma",    '\u0393'), // greek capital letter gamma, U+0393 ISOgrk3        new CharacterReference ("Delta",    '\u0394'), // greek capital letter delta, U+0394 ISOgrk3        new CharacterReference ("Epsilon",  '\u0395'), // greek capital letter epsilon, U+0395        new CharacterReference ("Zeta",     '\u0396'), // greek capital letter zeta, U+0396
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -