📄 sgml.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.util;import java.util.HashMap;import java.util.Map;/** * The <code>Sgml</code> class contains static methods for processing SGML * into unicode characters. There is a method {@link #entityToCharacter(String)} * which returns the unicode character corresponding to an SGML entity. There * is also a method {@link #replaceEntities(String,String)} which performs * a substitution for entities in an input string. * * <p>See the following document for a complete list of over 1000 * entities known by this class: * * <ul> * <li>John Cowan's <a href="http://unicode.org/Public/MAPPINGS/VENDORS/MISC/SGML.TXT">SGML to Unicode Mapping</a> * </li> * </ul> * * @author Bob Carpenter (from data provided by John Cowan) * @version 3.2.0 * @since LingPipe3.2 */public class Sgml { /** * Returns the character represented by the specified SGML entity, * or <code>null</code> if the entity is undefined. Note that the * SGML entity should be passed in without its preceding ampersand * or following semicolon. * * @param entity Name of SGML entity (without initial ampersand * and final semicolon). * @return The character for the entity, or <code>null</code> if * it is undefined. 
*/
    public static Character entityToCharacter(String entity) {
        // Plain map lookup: Map.get yields null for names that are not registered.
        return SGML_MAP.get(entity);
    }

    /**
     * Returns the result of replacing all the entities appearing
     * in the specified string with their corresponding unicode
     * characters, using the specified replacement string for
     * unknown entities.
     *
     * <p>An entity is the text between an ampersand and the next
     * semicolon.  If several ampersands precede that semicolon, the
     * entity starts at the last of them and the earlier ampersands are
     * copied through as literal text, so that a bare ampersand, as in
     * {@code "a & b &amp; c"}, does not swallow the entity that follows it.
     * If the input contains no ampersand that is followed later by a
     * semicolon, the input string itself is returned.
     *
     * @param in Input string.
     * @param unknownReplacement String with which to replace unknown
     * entities.
     * @return The input string with entities replaced with their
     * corresponding characters.
     */
    public static String replaceEntities(String in, String unknownReplacement) {
        int ampIndex = in.indexOf('&');
        if (ampIndex < 0) {
            return in; // no ampersand, hence no entity
        }
        int semicolonIndex = in.indexOf(';', ampIndex + 1);
        if (semicolonIndex < 0) {
            return in; // no terminating semicolon, hence no entity
        }

        StringBuilder sb = new StringBuilder(in.length());
        int start = 0; // index of the first character not yet copied to sb
        while (true) {
            // An entity name cannot itself contain '&', so the entity closed by
            // this semicolon begins at the LAST ampersand before it; anything
            // earlier (including stray ampersands) is ordinary text.
            ampIndex = in.lastIndexOf('&', semicolonIndex);

            sb.append(in, start, ampIndex);
            String entity = in.substring(ampIndex + 1, semicolonIndex);
            Character replacement = SGML_MAP.get(entity);
            if (replacement != null) {
                sb.append(replacement.charValue());
            } else {
                sb.append(unknownReplacement);
            }

            start = semicolonIndex + 1;
            ampIndex = in.indexOf('&', start);
            if (ampIndex < 0) {
                break;
            }
            semicolonIndex = in.indexOf(';', ampIndex + 1);
            if (semicolonIndex < 0) {
                break;
            }
        }
        // Copy whatever follows the last replaced entity unchanged.
        return sb.append(in, start, in.length()).toString();
    }

    /**
     * Convenience method to call {@link #replaceEntities(String,String)}
     * with the question mark used for unknown entities.
     *
     * @param in Input string.
     * @return The input string with entities replaced with their
     * corresponding characters.
*/ public static String replaceEntities(String in) { return replaceEntities(in,"?"); } // Author: John Cowan <cowan@ccil.org> // Date: 25 July 1997 // from: http://unicode.org/Public/MAPPINGS/VENDORS/MISC/SGML.TXT static final Map<String,Character> SGML_MAP = new HashMap<String,Character>(1500); static { SGML_MAP.put("Aacgr",'\u0386'); // GREEK CAPITAL LETTER ALPHA WITH TONOS SGML_MAP.put("aacgr",'\u03AC'); // GREEK SMALL LETTER ALPHA WITH TONOS SGML_MAP.put("Aacute",'\u00C1'); // LATIN CAPITAL LETTER A WITH ACUTE SGML_MAP.put("aacute",'\u00E1'); // LATIN SMALL LETTER A WITH ACUTE SGML_MAP.put("Abreve",'\u0102'); // LATIN CAPITAL LETTER A WITH BREVE SGML_MAP.put("abreve",'\u0103'); // LATIN SMALL LETTER A WITH BREVE SGML_MAP.put("Acirc",'\u00C2'); // LATIN CAPITAL LETTER A WITH CIRCUMFLEX SGML_MAP.put("acirc",'\u00E2'); // LATIN SMALL LETTER A WITH CIRCUMFLEX SGML_MAP.put("acute",'\u00B4'); // ACUTE ACCENT SGML_MAP.put("Acy",'\u0410'); // CYRILLIC CAPITAL LETTER A SGML_MAP.put("acy",'\u0430'); // CYRILLIC SMALL LETTER A SGML_MAP.put("AElig",'\u00C6'); // LATIN CAPITAL LETTER AE SGML_MAP.put("aelig",'\u00E6'); // LATIN SMALL LETTER AE SGML_MAP.put("Agr",'\u0391'); // GREEK CAPITAL LETTER ALPHA SGML_MAP.put("agr",'\u03B1'); // GREEK SMALL LETTER ALPHA SGML_MAP.put("Agrave",'\u00C0'); // LATIN CAPITAL LETTER A WITH GRAVE SGML_MAP.put("agrave",'\u00E0'); // LATIN SMALL LETTER A WITH GRAVE SGML_MAP.put("alefsym",'\u2135'); // ALEF SYMBOL SGML_MAP.put("aleph",'\u2135'); // ALEF SYMBOL SGML_MAP.put("Alpha",'\u0391'); // GREEK CAPITAL LETTER ALPHA SGML_MAP.put("alpha",'\u03B1'); // GREEK SMALL LETTER ALPHA SGML_MAP.put("Amacr",'\u0100'); // LATIN CAPITAL LETTER A WITH MACRON SGML_MAP.put("amacr",'\u0101'); // LATIN SMALL LETTER A WITH MACRON SGML_MAP.put("amalg",'\u2210'); // N-ARY COPRODUCT SGML_MAP.put("amp",'\u0026'); // AMPERSAND SGML_MAP.put("and",'\u2227'); // LOGICAL AND SGML_MAP.put("ang",'\u2220'); // ANGLE SGML_MAP.put("ang90",'\u221F'); // RIGHT ANGLE 
SGML_MAP.put("angmsd",'\u2221'); // MEASURED ANGLE SGML_MAP.put("angsph",'\u2222'); // SPHERICAL ANGLE SGML_MAP.put("angst",'\u212B'); // ANGSTROM SIGN SGML_MAP.put("Aogon",'\u0104'); // LATIN CAPITAL LETTER A WITH OGONEK SGML_MAP.put("aogon",'\u0105'); // LATIN SMALL LETTER A WITH OGONEK SGML_MAP.put("ap",'\u2248'); // ALMOST EQUAL TO SGML_MAP.put("ape",'\u224A'); // ALMOST EQUAL OR EQUAL TO SGML_MAP.put("apos",'\u02BC'); // MODIFIER LETTER APOSTROPHE SGML_MAP.put("Aring",'\u00C5'); // LATIN CAPITAL LETTER A WITH RING ABOVE SGML_MAP.put("aring",'\u00E5'); // LATIN SMALL LETTER A WITH RING ABOVE SGML_MAP.put("ast",'\u002A'); // ASTERISK SGML_MAP.put("asymp",'\u2248'); // ALMOST EQUAL TO SGML_MAP.put("Atilde",'\u00C3'); // LATIN CAPITAL LETTER A WITH TILDE SGML_MAP.put("atilde",'\u00E3'); // LATIN SMALL LETTER A WITH TILDE SGML_MAP.put("Auml",'\u00C4'); // LATIN CAPITAL LETTER A WITH DIAERESIS SGML_MAP.put("auml",'\u00E4'); // LATIN SMALL LETTER A WITH DIAERESIS SGML_MAP.put("b.alpha",'\u03B1'); // GREEK SMALL LETTER ALPHA SGML_MAP.put("barwed",'\u22BC'); // NAND SGML_MAP.put("Barwed",'\u2306'); // PERSPECTIVE SGML_MAP.put("b.beta",'\u03B2'); // GREEK SMALL LETTER BETA SGML_MAP.put("b.chi",'\u03C7'); // GREEK SMALL LETTER CHI SGML_MAP.put("bcong",'\u224C'); // ALL EQUAL TO SGML_MAP.put("Bcy",'\u0411'); // CYRILLIC CAPITAL LETTER BE SGML_MAP.put("bcy",'\u0431'); // CYRILLIC SMALL LETTER BE SGML_MAP.put("b.Delta",'\u0394'); // GREEK CAPITAL LETTER DELTA SGML_MAP.put("b.delta",'\u03B4'); // GREEK SMALL LETTER DELTA SGML_MAP.put("bdquo",'\u201E'); // DOUBLE LOW-9 QUOTATION MARK SGML_MAP.put("becaus",'\u2235'); // BECAUSE SGML_MAP.put("bepsi",'\u220D'); // SMALL CONTAINS AS MEMBER SGML_MAP.put("b.epsi",'\u03B5'); // GREEK SMALL LETTER EPSILON SGML_MAP.put("b.epsis",'\u03B5'); // GREEK SMALL LETTER EPSILON SGML_MAP.put("b.epsiv",'\u03B5'); // GREEK SMALL LETTER EPSILON SGML_MAP.put("bernou",'\u212C'); // SCRIPT CAPITAL B SGML_MAP.put("Beta",'\u0392'); // GREEK CAPITAL 
LETTER BETA SGML_MAP.put("beta",'\u03B2'); // GREEK SMALL LETTER BETA SGML_MAP.put("b.eta",'\u03B7'); // GREEK SMALL LETTER ETA SGML_MAP.put("beth",'\u2136'); // BET SYMBOL SGML_MAP.put("b.Gamma",'\u0393'); // GREEK CAPITAL LETTER GAMMA SGML_MAP.put("b.gamma",'\u03B3'); // GREEK SMALL LETTER GAMMA SGML_MAP.put("b.gammad",'\u03DC'); // GREEK LETTER DIGAMMA SGML_MAP.put("Bgr",'\u0392'); // GREEK CAPITAL LETTER BETA SGML_MAP.put("bgr",'\u03B2'); // GREEK SMALL LETTER BETA SGML_MAP.put("b.iota",'\u03B9'); // GREEK SMALL LETTER IOTA SGML_MAP.put("b.kappa",'\u03BA'); // GREEK SMALL LETTER KAPPA SGML_MAP.put("b.kappav",'\u03F0'); // GREEK KAPPA SYMBOL SGML_MAP.put("b.Lambda",'\u039B'); // GREEK CAPITAL LETTER LAMDA SGML_MAP.put("b.lambda",'\u03BB'); // GREEK SMALL LETTER LAMDA SGML_MAP.put("blank",'\u2423'); // OPEN BOX SGML_MAP.put("blk12",'\u2592'); // MEDIUM SHADE SGML_MAP.put("blk14",'\u2591'); // LIGHT SHADE SGML_MAP.put("blk34",'\u2593'); // DARK SHADE SGML_MAP.put("block",'\u2588'); // FULL BLOCK SGML_MAP.put("b.mu",'\u03BC'); // GREEK SMALL LETTER MU SGML_MAP.put("b.nu",'\u03BD'); // GREEK SMALL LETTER NU SGML_MAP.put("b.Omega",'\u03A9'); // GREEK CAPITAL LETTER OMEGA SGML_MAP.put("b.omega",'\u03CE'); // GREEK SMALL LETTER OMEGA WITH TONOS SGML_MAP.put("bottom",'\u22A5'); // UP TACK SGML_MAP.put("bowtie",'\u22C8'); // BOWTIE SGML_MAP.put("boxdl",'\u2510'); // BOX DRAWINGS LIGHT DOWN AND LEFT SGML_MAP.put("boxdL",'\u2555'); // BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE SGML_MAP.put("boxDl",'\u2556'); // BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE SGML_MAP.put("boxDL",'\u2557'); // BOX DRAWINGS DOUBLE DOWN AND LEFT SGML_MAP.put("boxdr",'\u250C'); // BOX DRAWINGS LIGHT DOWN AND RIGHT SGML_MAP.put("boxdR",'\u2552'); // BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE SGML_MAP.put("boxDr",'\u2553'); // BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE SGML_MAP.put("boxDR",'\u2554'); // BOX DRAWINGS DOUBLE DOWN AND RIGHT SGML_MAP.put("boxh",'\u2500'); // BOX DRAWINGS LIGHT HORIZONTAL 
SGML_MAP.put("boxH",'\u2550'); // BOX DRAWINGS DOUBLE HORIZONTAL SGML_MAP.put("boxhd",'\u252C'); // BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -