📄 translate.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
        new CharacterReference ("Eta",      '\u0397'), // greek capital letter eta, U+0397        new CharacterReference ("Theta",    '\u0398'), // greek capital letter theta, U+0398 ISOgrk3        new CharacterReference ("Iota",     '\u0399'), // greek capital letter iota, U+0399        new CharacterReference ("Kappa",    '\u039a'), // greek capital letter kappa, U+039A        new CharacterReference ("Lambda",   '\u039b'), // greek capital letter lambda, U+039B ISOgrk3        new CharacterReference ("Mu",       '\u039c'), // greek capital letter mu, U+039C        new CharacterReference ("Nu",       '\u039d'), // greek capital letter nu, U+039D        new CharacterReference ("Xi",       '\u039e'), // greek capital letter xi, U+039E ISOgrk3        new CharacterReference ("Omicron",  '\u039f'), // greek capital letter omicron, U+039F        new CharacterReference ("Pi",       '\u03a0'), // greek capital letter pi, U+03A0 ISOgrk3        new CharacterReference ("Rho",      '\u03a1'), // greek capital letter rho, U+03A1        // there is no Sigmaf, and no U+03A2 character either        new CharacterReference ("Sigma",    '\u03a3'), // greek capital letter sigma, U+03A3 ISOgrk3        new CharacterReference ("Tau",      '\u03a4'), // greek capital letter tau, U+03A4        new CharacterReference ("Upsilon",  '\u03a5'), // greek capital letter upsilon, U+03A5 ISOgrk3        new CharacterReference ("Phi",      '\u03a6'), // greek capital letter phi, U+03A6 ISOgrk3        new CharacterReference ("Chi",      '\u03a7'), // greek capital letter chi, U+03A7        new CharacterReference ("Psi",      '\u03a8'), // greek capital letter psi, U+03A8 ISOgrk3        new CharacterReference ("Omega",    '\u03a9'), // greek capital letter omega, U+03A9 ISOgrk3        new CharacterReference ("alpha",    '\u03b1'), // greek small letter alpha, U+03B1 ISOgrk3        new CharacterReference ("beta",     '\u03b2'), // greek small letter beta, U+03B2 ISOgrk3        new CharacterReference ("gamma",    '\u03b3'), // greek small letter gamma, U+03B3 ISOgrk3        new CharacterReference ("delta",    '\u03b4'), // greek small letter delta, U+03B4 ISOgrk3        new CharacterReference ("epsilon",  '\u03b5'), // greek small letter epsilon, U+03B5 ISOgrk3        new CharacterReference ("zeta",     '\u03b6'), // greek small letter zeta, U+03B6 ISOgrk3        new CharacterReference ("eta",      '\u03b7'), // greek small letter eta, U+03B7 ISOgrk3        new CharacterReference ("theta",    '\u03b8'), // greek small letter theta, U+03B8 ISOgrk3        new CharacterReference ("iota",     '\u03b9'), // greek small letter iota, U+03B9 ISOgrk3        new CharacterReference ("kappa",    '\u03ba'), // greek small letter kappa, U+03BA ISOgrk3        new CharacterReference ("lambda",   '\u03bb'), // greek small letter lambda, U+03BB ISOgrk3        new CharacterReference ("mu",       '\u03bc'), // greek small letter mu, U+03BC ISOgrk3        new CharacterReference ("nu",       '\u03bd'), // greek small letter nu, U+03BD ISOgrk3        new CharacterReference ("xi",       '\u03be'), // greek small letter xi, U+03BE ISOgrk3        new CharacterReference ("omicron",  '\u03bf'), // greek small letter omicron, U+03BF NEW        new CharacterReference ("pi",       '\u03c0'), // greek small letter pi, U+03C0 ISOgrk3        new CharacterReference ("rho",      '\u03c1'), // greek small letter rho, U+03C1 ISOgrk3        new CharacterReference ("sigmaf",   '\u03c2'), // greek small letter final sigma, U+03C2 ISOgrk3        new CharacterReference ("sigma",    '\u03c3'), // greek small letter sigma, U+03C3 ISOgrk3        new CharacterReference ("tau",      '\u03c4'), // greek small letter tau, U+03C4 ISOgrk3        new CharacterReference ("upsilon",  '\u03c5'), // greek small letter upsilon, U+03C5 ISOgrk3        new CharacterReference ("phi",      '\u03c6'), // greek small letter phi, U+03C6 ISOgrk3        new CharacterReference ("chi",      '\u03c7'), // greek small letter chi, U+03C7 ISOgrk3        new CharacterReference ("psi",      '\u03c8'), // greek small letter psi, U+03C8 ISOgrk3        new CharacterReference ("omega",    '\u03c9'), // greek small letter omega, U+03C9 ISOgrk3        new CharacterReference ("thetasym", '\u03d1'), // greek small letter theta symbol, U+03D1 NEW        new CharacterReference ("upsih",    '\u03d2'), // greek upsilon with hook symbol, U+03D2 NEW        new CharacterReference ("piv",      '\u03d6'), // greek pi symbol, U+03D6 ISOgrk3        // General Punctuation        new CharacterReference ("bull",     '\u2022'), // bullet = black small circle, U+2022 ISOpub        // bullet is NOT the same as bullet operator, U+2219        new CharacterReference ("hellip",   '\u2026'), // horizontal ellipsis = three dot leader, U+2026 ISOpub        new CharacterReference ("prime",    '\u2032'), // prime = minutes = feet, U+2032 ISOtech        new CharacterReference ("Prime",    '\u2033'), // double prime = seconds = inches, U+2033 ISOtech        new CharacterReference ("oline",    '\u203e'), // overline = spacing overscore, U+203E NEW        new CharacterReference ("frasl",    '\u2044'), // fraction slash, U+2044 NEW        // Letterlike Symbols        new CharacterReference ("weierp",   '\u2118'), // script capital P = power set = Weierstrass p, U+2118 ISOamso        new CharacterReference ("image",    '\u2111'), // blackletter capital I = imaginary part, U+2111 ISOamso        new CharacterReference ("real",     '\u211c'), // blackletter capital R = real part symbol, U+211C ISOamso        new CharacterReference ("trade",    '\u2122'), // trade mark sign, U+2122 ISOnum        new CharacterReference ("alefsym",  '\u2135'), // alef symbol = first transfinite cardinal, U+2135 NEW        // alef symbol is NOT the same as hebrew letter alef,        // U+05D0 although the same glyph could be used to depict both characters        // Arrows        new CharacterReference ("larr",     '\u2190'), // leftwards arrow, U+2190 ISOnum        new CharacterReference ("uarr",     '\u2191'), // upwards arrow, U+2191 ISOnum        new CharacterReference ("rarr",     '\u2192'), // rightwards arrow, U+2192 ISOnum        new CharacterReference ("darr",     '\u2193'), // downwards arrow, U+2193 ISOnum        new CharacterReference ("harr",     '\u2194'), // left right arrow, U+2194 ISOamsa        new CharacterReference ("crarr",    '\u21b5'), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW        new CharacterReference ("lArr",     '\u21d0'), // leftwards double arrow, U+21D0 ISOtech        // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow        // but also does not have any other character for that function. So ? lArr can        // be used for 'is implied by' as ISOtech suggests        new CharacterReference ("uArr",     '\u21d1'), // upwards double arrow, U+21D1 ISOamsa        new CharacterReference ("rArr",     '\u21d2'), // rightwards double arrow, U+21D2 ISOtech        // ISO 10646 does not say this is the 'implies' character but does not have         // another character with this function so ?        // rArr can be used for 'implies' as ISOtech suggests        new CharacterReference ("dArr",     '\u21d3'), // downwards double arrow, U+21D3 ISOamsa        new CharacterReference ("hArr",     '\u21d4'), // left right double arrow, U+21D4 ISOamsa        // Mathematical Operators        new CharacterReference ("forall",   '\u2200'), // for all, U+2200 ISOtech        new CharacterReference ("part",     '\u2202'), // partial differential, U+2202 ISOtech        new CharacterReference ("exist",    '\u2203'), // there exists, U+2203 ISOtech        new CharacterReference ("empty",    '\u2205'), // empty set = null set = diameter, U+2205 ISOamso        new CharacterReference ("nabla",    '\u2207'), // nabla = backward difference, U+2207 ISOtech        new CharacterReference ("isin",     '\u2208'), // element of, U+2208 ISOtech        new CharacterReference ("notin",    '\u2209'), // not an element of, U+2209 ISOtech        new CharacterReference ("ni",       '\u220b'), // contains as member, U+220B ISOtech        // should there be a more memorable name than 'ni'?        new CharacterReference ("prod",     '\u220f'), // n-ary product = product sign, U+220F ISOamsb        // prod is NOT the same character as U+03A0 'greek capital letter pi' though        // the same glyph might be used for both        new CharacterReference ("sum",      '\u2211'), // n-ary sumation, U+2211 ISOamsb        // sum is NOT the same character as U+03A3 'greek capital letter sigma'        // though the same glyph might be used for both        new CharacterReference ("minus",    '\u2212'), // minus sign, U+2212 ISOtech        new CharacterReference ("lowast",   '\u2217'), // asterisk operator, U+2217 ISOtech        new CharacterReference ("radic",    '\u221a'), // square root = radical sign, U+221A ISOtech        new CharacterReference ("prop",     '\u221d'), // proportional to, U+221D ISOtech        new CharacterReference ("infin",    '\u221e'), // infinity, U+221E ISOtech        new CharacterReference ("ang",      '\u2220'), // angle, U+2220 ISOamso        new CharacterReference ("and",      '\u2227'), // logical and = wedge, U+2227 ISOtech        new CharacterReference ("or",       '\u2228'), // logical or = vee, U+2228 ISOtech        new CharacterReference ("cap",      '\u2229'), // intersection = cap, U+2229 ISOtech        new CharacterReference ("cup",      '\u222a'), // union = cup, U+222A ISOtech        new CharacterReference ("int",      '\u222b'), // integral, U+222B ISOtech        new CharacterReference ("there4",   '\u2234'), // therefore, U+2234 ISOtech        new CharacterReference ("sim",      '\u223c'), // tilde operator = varies with = similar to, U+223C ISOtech        // tilde operator is NOT the same character as the tilde, U+007E,        // although the same glyph might be used to represent both        new CharacterReference ("cong",     '\u2245'), // approximately equal to, U+2245 ISOtech        new CharacterReference ("asymp",    '\u2248'), // almost equal to = asymptotic to, U+2248 ISOamsr        new CharacterReference ("ne",       '\u2260'), // not equal to, U+2260 ISOtech        new CharacterReference ("equiv",    '\u2261'), // identical to, U+2261 ISOtech        new CharacterReference ("le",       '\u2264'), // less-than or equal to, U+2264 ISOtech        new CharacterReference ("ge",       '\u2265'), // greater-than or equal to, U+2265 ISOtech        new CharacterReference ("sub",      '\u2282'), // subset of, U+2282 ISOtech        new CharacterReference ("sup",      '\u2283'), // superset of, U+2283 ISOtech        // note that nsup, 'not a superset of, U+2283' is not covered by the Symbol         // font encoding and is not included. Should it be, for symmetry?        // It is in ISOamsn        new CharacterReference ("nsub",     '\u2284'), // not a subset of, U+2284 ISOamsn        new CharacterReference ("sube",     '\u2286'), // subset of or equal to, U+2286 ISOtech        new CharacterReference ("supe",     '\u2287'), // superset of or equal to, U+2287 ISOtech        new CharacterReference ("oplus",    '\u2295'), // circled plus = direct sum, U+2295 ISOamsb        new CharacterReference ("otimes",   '\u2297'), // circled times = vector product, U+2297 ISOamsb        new CharacterReference ("perp",     '\u22a5'), // up tack = orthogonal to = perpendicular, U+22A5 ISOtech        new CharacterReference ("sdot",     '\u22c5'), // dot operator, U+22C5 ISOamsb        // dot operator is NOT the same character as U+00B7 middle dot        // Miscellaneous Technical        new CharacterReference ("lceil",    '\u2308'), // left ceiling = apl upstile, U+2308 ISOamsc        new CharacterReference ("rceil",    '\u2309'), // right ceiling, U+2309 ISOamsc        new CharacterReference ("lfloor",   '\u230a'), // left floor = apl downstile, U+230A ISOamsc        new CharacterReference ("rfloor",   '\u230b'), // right floor, U+230B ISOamsc        new CharacterReference ("lang",     '\u2329'), // left-pointing angle bracket = bra, U+2329 ISOtech        // lang is NOT the same character as U+003C 'less than'         // or U+2039 'single left-pointing angle quotation mark'        new CharacterReference ("rang",     '\u232a'), // right-pointing angle bracket = ket, U+232A ISOtech        // rang is NOT the same character as U+003E 'greater than'         // or U+203A 'single right-pointing angle quotation mark'        // Geometric Shapes        new CharacterReference ("loz",      '\u25ca'), // lozenge, U+25CA ISOpub        // Miscellaneous Symbols        new CharacterReference ("spades",   '\u2660'), // black spade suit, U+2660 ISOpub        // black here seems to mean filled as opposed to hollow        new CharacterReference ("clubs",    '\u2663'), // black club suit = shamrock, U+2663 ISOpub        new CharacterReference ("hearts",   '\u2665'), // black heart suit = valentine, U+2665 ISOpub        new CharacterReference ("diams",    '\u2666'), // black diamond suit, U+2666 ISOpub        // Special characters for HTML        // Character entity set. Typical invocation:        // <!ENTITY % HTMLspecial PUBLIC        // "-//W3C//ENTITIES Special//EN//HTML">        // %HTMLspecial;        // Portions © International Organization for Standardization 1986:        // Permission to copy in any form is granted for use with        // conforming SGML systems and applications as defined in        // ISO 8879, provided this notice is included in all copies.        // Relevant ISO entity set is given unless names are newly introduced.        // New names (i.e., not in ISO 8879 list) do not clash with any        // existing ISO 8879 entity names. ISO 10646 character numbers        // are given for each character, in hex. CDATA values are decimal        // conversions of the ISO 10646 values and refer to the document        // character set. Names are ISO 10646 names.        // C0 Controls and Basic Latin        new CharacterReference ("quot",     '\u0022'), // quotation mark = APL quote, U+0022 ISOnum        new CharacterReference ("amp",      '\u0026'), // ampersand, U+0026 ISOnum        new CharacterReference ("lt",       '\u003c'), // less-than sign, U+003C ISOnum        new CharacterReference ("gt",       '\u003e'), // greater-than sign, U+003E ISOnum        // Latin Extended-A        new CharacterReference ("OElig",    '\u0152'), // latin capital ligature OE, U+0152 ISOlat2        new CharacterReference ("oelig",    '\u0153'), // latin small ligature oe, U+0153 ISOlat2        // ligature is a misnomer, this is a separate character in some languages        new CharacterReference ("Scaron",   '\u0160'), // latin capital letter S with caron, U+0160 ISOlat2        new CharacterReference ("scaron",   '\u0161'), // latin small letter s with caron, U+0161 ISOlat2        new CharacterReference ("Yuml",     '\u0178'), // latin capital letter Y with diaeresis, U+0178 ISOlat2        // Spacing Modifier Letters        new CharacterReference ("circ",     '\u02c6'), // modifier letter circumflex accent, U+02C6 ISOpub        new CharacterReference ("tilde",    '\u02dc'), // small tilde, U+02DC ISOdia        // General Punctuation        new CharacterReference ("ensp",     '\u2002'), // en space, U+2002 ISOpub        new CharacterReference ("emsp",     '\u2003'), // em space, U+2003 ISOpub        new CharacterReference ("thinsp",   '\u2009'), // thin space, U+2009 ISOpub        new CharacterReference ("zwnj",     '\u200c'), // zero width non-joiner, U+200C NEW RFC 2070        new CharacterReference ("zwj",      '\u200d'), // zero width joiner, U+200D NEW RFC 2070        new CharacterReference ("lrm",      '\u200e'), // left-to-right mark, U+200E NEW RFC 2070        new CharacterReference ("rlm",      '\u200f'), // right-to-left mark, U+200F NEW RFC 2070        new CharacterReference ("ndash",    '\u2013'), // en dash, U+2013 ISOpub        new CharacterReference ("mdash",    '\u2014'), // em dash, U+2014 ISOpub        new CharacterReference ("lsquo",    '\u2018'), // left single quotation mark, U+2018 ISOnum        new CharacterReference ("rsquo",    '\u2019'), // right single quotation mark, U+2019 ISOnum        new CharacterReference ("sbquo",    '\u201a'), // single low-9 quotation mark, U+201A NEW        new CharacterReference ("ldquo",    '\u201c'), // left double quotation mark, U+201C ISOnum        new CharacterReference ("rdquo",    '\u201d'), // right double quotation mark, U+201D ISOnum        new CharacterReference ("bdquo",    '\u201e'), // double low-9 quotation mark, U+201E NEW        new CharacterReference ("dagger",   '\u2020'), // dagger, U+2020 ISOpub        new CharacterReference ("Dagger",   '\u2021'), // double dagger, U+2021 ISOpub        new CharacterReference ("permil",   '\u2030'), // per mille sign, U+2030 ISOtech        new CharacterReference ("lsaquo",   '\u2039'), // single left-pointing angle quotation mark, U+2039 ISO proposed        // lsaquo is proposed but not yet ISO standardized        new CharacterReference ("rsaquo",   '\u203a'), // single right-pointing angle quotation mark, U+203A ISO proposed        // rsaquo is proposed but not yet ISO standardized        new CharacterReference ("euro",     '\u20ac'), // euro sign, U+20AC NEW    };    /**     * The dividing point between a simple table lookup and a binary search.     * Characters below the break point are stored in a sparse array allowing     * direct index lookup.     */    protected static final int BREAKPOINT = 0x100;    /**     * List of references sorted by character.     * The first part of this array, up to <code>BREAKPOINT</code> is stored     * in a direct translational table, indexing into the table with a character     * yields the reference. The second part is dense and sorted by character,     * suitable for binary lookup.     */    protected static final CharacterReference[] mCharacterList;    static    {        int index;        CharacterReference item;        int character;        // count below the break point        index = 0;        for (int i = 0; i < mCharacterReferences.length; i++)            if (mCharacterReferences[i].getCharacter () < BREAKPOINT)                index++;        // allocate enough for the linear table and remainder        mCharacterList = new CharacterReference[BREAKPOINT + mCharacterReferences.length - index];        index = BREAKPOINT;        for (int i = 0; i < mCharacterReferences.length; i++)        {            item = mCharacterReferences[i];            character = mCharacterReferences[i].getCharacter ();            if (character < BREAKPOINT)                mCharacterList[character] = item;            else            {                // use a linear search and insertion sort, done only once                int x = BREAKPOINT;                while (x < index)                    if (mCharacterList[x].getCharacter () > character)                        break;                    else                        x++;                int y = index - 1;                while (y >= x)                {                    mCharacterList[y + 1] = mCharacterList[y];                    y--;                }                mCharacterList[x] = item;                index++;            }        }        // reorder the original array into kernel order        Sort.QuickSort (mCharacterReferences);    }    /**     * Private constructor.     * This class is fully static and thread safe.     */    private Translate ()    {    }    /**     * Binary search for a reference.     * @param array The array of <code>CharacterReference</code> objects.     * @param ref The character to search for.     * @param lo The lower index within which to look.     * @param hi The upper index within which to look.     * @return The index at which reference was found or is to be inserted.     */    protected static int lookup (CharacterReference[] array, char ref, int lo, int hi)    {   int num;        int mid;        int half;        int result;        int ret;        ret = -1;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -