📄 charset.java
字号:
/* * @(#)Charset.java 1.40 05/01/15 * * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. */package java.nio.charset;import java.nio.ByteBuffer;import java.nio.CharBuffer;import java.nio.charset.spi.CharsetProvider;import java.security.AccessController;import java.security.AccessControlException;import java.security.PrivilegedAction;import java.util.Collections;import java.util.HashSet;import java.util.Iterator;import java.util.Locale;import java.util.Map;import java.util.NoSuchElementException;import java.util.Set;import java.util.SortedMap;import java.util.TreeMap;import sun.misc.ASCIICaseInsensitiveComparator;import sun.misc.Service;import sun.misc.ServiceConfigurationError;import sun.nio.cs.StandardCharsets;import sun.nio.cs.ThreadLocalCoders;/** * A named mapping between sequences of sixteen-bit Unicode characters and * sequences of bytes. This class defines methods for creating decoders and * encoders and for retrieving the various names associated with a charset. * Instances of this class are immutable. * * <p> This class also defines static methods for testing whether a particular * charset is supported, for locating charset instances by name, and for * constructing a map that contains every charset for which support is * available in the current Java virtual machine. Support for new charsets can * be added via the service-provider interface defined in the {@link * java.nio.charset.spi.CharsetProvider} class. * * <p> All of the methods defined in this class are safe for use by multiple * concurrent threads. 
* * * <a name="names"></a><a name="charenc"></a> * <h4>Charset names</h4> * * <p> Charsets are named by strings composed of the following characters: * * <ul> * * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>), * * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>), * * <li> The digits <tt>'0'</tt> through <tt>'9'</tt> * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>), * * <li> The dash character <tt>'-'</tt> * (<tt>'\u002d'</tt>, <small>HYPHEN-MINUS</small>), * * <li> The period character <tt>'.'</tt> * (<tt>'\u002e'</tt>, <small>FULL STOP</small>), * * <li> The colon character <tt>':'</tt> * (<tt>'\u003a'</tt>, <small>COLON</small>), and * * <li> The underscore character <tt>'_'</tt> * (<tt>'\u005f'</tt>, <small>LOW LINE</small>). * * </ul> * * A charset name must begin with either a letter or a digit. The empty string * is not a legal charset name. Charset names are not case-sensitive; that is, * case is always ignored when comparing charset names. Charset names * generally follow the conventions documented in <a * href="http://ietf.org/rfc/rfc2278.txt"><i>RFC 2278: IANA Charset * Registration Procedures</i></a>. * * <p> Every charset has a <i>canonical name</i> and may also have one or more * <i>aliases</i>. The canonical name is returned by the {@link #name name} method * of this class. Canonical names are, by convention, usually in upper case. * The aliases of a charset are returned by the {@link #aliases aliases} * method. * * <a name="hn"></a> * * <p> Some charsets have an <i>historical name</i> that is defined for * compatibility with previous versions of the Java platform. A charset's * historical name is either its canonical name or one of its aliases. 
The * historical name is returned by the <tt>getEncoding()</tt> methods of the * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes. * * <a name="iana"> * * <p> If a charset listed in the <a * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset * Registry</i></a> is supported by an implementation of the Java platform then * its canonical name must be the name listed in the registry. Many charsets * are given more than one name in the registry, in which case the registry * identifies one of the names as <i>MIME-preferred</i>. If a charset has more * than one registry name then its canonical name must be the MIME-preferred * name and the other names in the registry must be valid aliases. If a * supported charset is not listed in the IANA registry then its canonical name * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>. * * <p> The IANA charset registry does change over time, and so the canonical * name and the aliases of a particular charset may also change over time. To * ensure compatibility it is recommended that no alias ever be removed from a * charset, and that if the canonical name of a charset is changed then its * previous canonical name be made into an alias. * * * <h4>Standard charsets</h4> * * <p> Every implementation of the Java platform is required to support the * following standard charsets. Consult the release documentation for your * implementation to see if any other charsets are supported. * * <blockquote><table width="80%" summary="Description of standard charsets"> * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr> * <tr><td valign=top><tt>US-ASCII</tt></td> * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>, * a.k.a. the Basic Latin block of the Unicode character set</td></tr> * <tr><td valign=top><tt>ISO-8859-1 </tt></td> * <td>ISO Latin Alphabet No. 1, a.k.a. 
<tt>ISO-LATIN-1</tt></td></tr> * <tr><td valign=top><tt>UTF-8</tt></td> * <td>Eight-bit UCS Transformation Format</td></tr> * <tr><td valign=top><tt>UTF-16BE</tt></td> * <td>Sixteen-bit UCS Transformation Format, * big-endian byte order</td></tr> * <tr><td valign=top><tt>UTF-16LE</tt></td> * <td>Sixteen-bit UCS Transformation Format, * little-endian byte order</td></tr> * <tr><td valign=top><tt>UTF-16</tt></td> * <td>Sixteen-bit UCS Transformation Format, * byte order identified by an optional byte-order mark</td></tr> * </table></blockquote> * * <p> The <tt>UTF-8</tt> charset is specified by <a * href="http://ietf.org/rfc/rfc2279.txt"><i>RFC 2279</i></a>; the * transformation format upon which it is based is specified in * Amendment 2 of ISO 10646-1 and is also described in * § 3.8 of <a * href="http://www.unicode.org/unicode/standard/standard.html"><i>The Unicode * Standard, Version 3.0</i></a> (<a * href="http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html">amended</a>). * * <p> The <tt>UTF-16</tt> charsets are specified by <a * href="http://ietf.org/rfc/rfc2781.txt"><i>RFC 2781</i></a>; the * transformation formats upon which they are based are specified in * Amendment 1 of ISO 10646-1 and are also described in * § 3.8 of <a * href="http://www.unicode.org/unicode/standard/standard.html"><i>The Unicode * Standard, Version 3.0</i></a>. * * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are * therefore sensitive to byte order. In these encodings the byte order of a * stream may be indicated by an initial <i>byte-order mark</i> represented by * the Unicode character <tt>'\uFEFF'</tt>. Byte-order marks are handled * as follows: * * <ul> * * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt> * charsets ignore byte-order marks; when encoding, they do not write * byte-order marks. 
</p></li> * * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets a byte-order * mark to indicate the byte order of the stream but defaults to big-endian * if there is no byte-order mark; when encoding, it uses big-endian byte * order and writes a big-endian byte-order mark. </p></li> * * </ul> * * In any case, when a byte-order mark is read at the beginning of a decoding * operation it is omitted from the resulting sequence of characters. Byte * order marks occurring after the first element of an input sequence are not * omitted since the same code is used to represent <small>ZERO-WIDTH * NON-BREAKING SPACE</small>. * * <p> Every instance of the Java virtual machine has a default charset, which * may or may not be one of the standard charsets. The default charset is * determined during virtual-machine startup and typically depends upon the * locale and charset being used by the underlying operating system. </p> * * * <h4>Terminology</h4> * * <p> The name of this class is taken from the terms used in <a * href="http://ietf.org/rfc/rfc2278.txt"><i>RFC 2278</i></a>. In that * document a <i>charset</i> is defined as the combination of a coded character * set and a character-encoding scheme. * * <p> A <i>coded character set</i> is a mapping between a set of abstract * characters and a set of integers. US-ASCII, ISO 8859-1, * JIS X 0201, and full Unicode, which is the same as * ISO 10646-1, are examples of coded character sets. * * <p> A <i>character-encoding scheme</i> is a mapping between a coded * character set and a set of octet (eight-bit byte) sequences. UTF-8, UCS-2, * UTF-16, ISO 2022, and EUC are examples of character-encoding schemes. * Encoding schemes are often associated with a particular coded character set; * UTF-8, for example, is used only to encode Unicode. Some schemes, however, * are associated with multiple character sets; EUC, for example, can be used * to encode characters in a variety of Asian character sets. 
* * <p> When a coded character set is used exclusively with a single * character-encoding scheme then the corresponding charset is usually named * for the character set; otherwise a charset is usually named for the encoding * scheme and, possibly, the locale of the character sets that it supports. * Hence <tt>US-ASCII</tt> is the name of the charset for US-ASCII while * <tt>EUC-JP</tt> is the name of the charset that encodes the * JIS X 0201, JIS X 0208, and JIS X 0212 * character sets. * * <p> The native coded character set of the Java programming language is that * of the first seventeen planes of the Unicode version 3.0 character set; * that is, it consists in the <i>basic multilingual plane</i> (BMP) of Unicode * version 1 plus the next sixteen planes of Unicode version 3. This * is because the language's internal representation of characters uses the * UTF-16 encoding, which encodes the BMP directly and uses <i>surrogate * pairs</i>, a simple escape mechanism, to encode the other planes. Hence a * charset in the Java platform defines a mapping between sequences of * sixteen-bit values in UTF-16 and sequences of bytes. </p> * * * @author Mark Reinhold * @author JSR-51 Expert Group * @version 1.40, 05/01/15 * @since 1.4 * * @see CharsetDecoder * @see CharsetEncoder * @see java.nio.charset.spi.CharsetProvider */public abstract class Charset implements Comparable{ /* -- Static methods -- */ /** * Checks that the given string is a legal charset name. 
</p>
     *
     * <p> A legal name is non-empty, begins with a letter or a digit, and
     * otherwise consists only of letters, digits, and the characters
     * <tt>'-'</tt>, <tt>'.'</tt>, <tt>':'</tt>, and <tt>'_'</tt>. </p>
     *
     * @param  s
     *         A purported charset name
     *
     * @throws  IllegalCharsetNameException
     *          If the given name is empty, begins with a character other
     *          than a letter or a digit, or contains a character that is
     *          not legal in a charset name
     */
    private static void checkName(String s) {
        int n = s.length();

        // The empty string is not a legal charset name; without this check
        // the loop below would never run and "" would be accepted.
        if (n == 0) {
            throw new IllegalCharsetNameException(s);
        }

        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);

            // Letters and digits are legal in every position.
            if (c >= 'A' && c <= 'Z') continue;
            if (c >= 'a' && c <= 'z') continue;
            if (c >= '0' && c <= '9') continue;

            // Punctuation is legal anywhere except the first position,
            // because a name must begin with a letter or a digit.
            if (i != 0 && (c == '-' || c == ':' || c == '_' || c == '.')) {
                continue;
            }

            throw new IllegalCharsetNameException(s);
        }
    }

    /* The standard set of charsets */
    private static CharsetProvider standardProvider = new StandardCharsets();

    // Cache of the most-recently-returned charsets,
    // along with the names that were used to find them
    //
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -