📄 charset.java

📁 JAVA基本类源代码,大家可以学习学习!
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* * @(#)Charset.java	1.40 05/01/15 * * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. */package java.nio.charset;import java.nio.ByteBuffer;import java.nio.CharBuffer;import java.nio.charset.spi.CharsetProvider;import java.security.AccessController;import java.security.AccessControlException;import java.security.PrivilegedAction;import java.util.Collections;import java.util.HashSet;import java.util.Iterator;import java.util.Locale;import java.util.Map;import java.util.NoSuchElementException;import java.util.Set;import java.util.SortedMap;import java.util.TreeMap;import sun.misc.ASCIICaseInsensitiveComparator;import sun.misc.Service;import sun.misc.ServiceConfigurationError;import sun.nio.cs.StandardCharsets;import sun.nio.cs.ThreadLocalCoders;/** * A named mapping between sequences of sixteen-bit Unicode characters and * sequences of bytes.  This class defines methods for creating decoders and * encoders and for retrieving the various names associated with a charset. * Instances of this class are immutable. * * <p> This class also defines static methods for testing whether a particular * charset is supported, for locating charset instances by name, and for * constructing a map that contains every charset for which support is * available in the current Java virtual machine.  Support for new charsets can * be added via the service-provider interface defined in the {@link * java.nio.charset.spi.CharsetProvider} class. * * <p> All of the methods defined in this class are safe for use by multiple * concurrent threads. 
* * * <a name="names"><a name="charenc"> * <h4>Charset names</h4> * * <p> Charsets are named by strings composed of the following characters: * * <ul> * *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>), *  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>), *  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt> *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>), *  *   <li> The dash character <tt>'-'</tt> *        (<tt>'&#92;u002d'</tt>,&nbsp;<small>HYPHEN-MINUS</small>), * *   <li> The period character <tt>'.'</tt> *        (<tt>'&#92;u002e'</tt>,&nbsp;<small>FULL STOP</small>), *  *   <li> The colon character <tt>':'</tt> *        (<tt>'&#92;u003a'</tt>,&nbsp;<small>COLON</small>), and *  *   <li> The underscore character <tt>'_'</tt> *        (<tt>'&#92;u005f'</tt>,&nbsp;<small>LOW&nbsp;LINE</small>). *  * </ul> * * A charset name must begin with either a letter or a digit.  The empty string * is not a legal charset name.  Charset names are not case-sensitive; that is, * case is always ignored when comparing charset names.  Charset names * generally follow the conventions documented in <a * href="http://ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278:&nbsp;IANA Charset * Registration Procedures</i></a>. * * <p> Every charset has a <i>canonical name</i> and may also have one or more * <i>aliases</i>.  The canonical name is returned by {@link #name name} method * of this class.  Canonical names are, by convention, usually in upper case. * The aliases of a charset are returned by the {@link #aliases aliases} * method. * * <a name="hn"> * * <p> Some charsets have an <i>historical name</i> that is defined for * compatibility with previous versions of the Java platform.  A charset's * historical name is either its canonical name or one of its aliases.  
The * historical name is returned by the <tt>getEncoding()</tt> methods of the * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes. * * <a name="iana"> * * <p> If a charset listed in the <a * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset * Registry</i></a> is supported by an implementation of the Java platform then * its canonical name must be the name listed in the registry.  Many charsets * are given more than one name in the registry, in which case the registry * identifies one of the names as <i>MIME-preferred</i>.  If a charset has more * than one registry name then its canonical name must be the MIME-preferred * name and the other names in the registry must be valid aliases.  If a * supported charset is not listed in the IANA registry then its canonical name * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>. * * <p> The IANA charset registry does change over time, and so the canonical * name and the aliases of a particular charset may also change over time.  To * ensure compatibility it is recommended that no alias ever be removed from a * charset, and that if the canonical name of a charset is changed then its * previous canonical name be made into an alias. * * * <h4>Standard charsets</h4> * * <p> Every implementation of the Java platform is required to support the * following standard charsets.  Consult the release documentation for your * implementation to see if any other charsets are supported. *  * <blockquote><table width="80%" summary="Description of standard charsets"> * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr> * <tr><td valign=top><tt>US-ASCII</tt></td> *     <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>, *         a.k.a. 
the Basic Latin block of the Unicode character set</td></tr> * <tr><td valign=top><tt>ISO-8859-1&nbsp;&nbsp;</tt></td> *     <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr> * <tr><td valign=top><tt>UTF-8</tt></td> *     <td>Eight-bit UCS Transformation Format</td></tr> * <tr><td valign=top><tt>UTF-16BE</tt></td> *     <td>Sixteen-bit UCS Transformation Format, *         big-endian byte&nbsp;order</td></tr> * <tr><td valign=top><tt>UTF-16LE</tt></td> *     <td>Sixteen-bit UCS Transformation Format, *         little-endian byte&nbsp;order</td></tr> * <tr><td valign=top><tt>UTF-16</tt></td> *     <td>Sixteen-bit UCS Transformation Format, *         byte&nbsp;order identified by an optional byte-order mark</td></tr> * </table></blockquote> *  * <p> The <tt>UTF-8</tt> charset is specified by <a * href="http://ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279</i></a>; the * transformation format upon which it is based is specified in * Amendment&nbsp;2 of ISO&nbsp;10646-1 and is also described in * &#167;&nbsp;3.8 of <a * href="http://www.unicode.org/unicode/standard/standard.html"><i>The Unicode * Standard, Version&nbsp;3.0</i></a> (<a * href="http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html">amended</a>). * * <p> The <tt>UTF-16</tt> charsets are specified by <a * href="http://ietf.org/rfc/rfc2781.txt"><i>RFC&nbsp;2781</i></a>; the * transformation formats upon which they are based are specified in * Amendment&nbsp;1 of ISO&nbsp;10646-1 and are also described in * &#167;&nbsp;3.8 of <a * href="http://www.unicode.org/unicode/standard/standard.html"><i>The Unicode * Standard, Version&nbsp;3.0</i></a>. * * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are * therefore sensitive to byte order.  In these encodings the byte order of a * stream may be indicated by an initial <i>byte-order mark</i> represented by * the Unicode character <tt>'&#92;uFEFF'</tt>.  
Byte-order marks are handled * as follows: * * <ul> * *   <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt> *   charsets ignore byte-order marks; when encoding, they do not write *   byte-order marks. </p></li> * *   <li><p> When decoding, the <tt>UTF-16</tt> charset interprets a byte-order *   mark to indicate the byte order of the stream but defaults to big-endian *   if there is no byte-order mark; when encoding, it uses big-endian byte *   order and writes a big-endian byte-order mark. </p></li> * * </ul> * * In any case, when a byte-order mark is read at the beginning of a decoding * operation it is omitted from the resulting sequence of characters.  Byte * order marks occurring after the first element of an input sequence are not * omitted since the same code is used to represent <small>ZERO-WIDTH * NON-BREAKING SPACE</small>. * * <p> Every instance of the Java virtual machine has a default charset, which * may or may not be one of the standard charsets.  The default charset is * determined during virtual-machine startup and typically depends upon the * locale and charset being used by the underlying operating system. </p> * * * <h4>Terminology</h4> * * <p> The name of this class is taken from the terms used in <a * href="http://ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278</i></a>.  In that * document a <i>charset</i> is defined as the combination of a coded character * set and a character-encoding scheme. * * <p> A <i>coded character set</i> is a mapping between a set of abstract * characters and a set of integers.  US-ASCII, ISO&nbsp;8859-1, * JIS&nbsp;X&nbsp;0201, and full Unicode, which is the same as * ISO&nbsp;10646-1, are examples of coded character sets. * * <p> A <i>character-encoding scheme</i> is a mapping between a coded * character set and a set of octet (eight-bit byte) sequences.  UTF-8, UCS-2, * UTF-16, ISO&nbsp;2022, and EUC are examples of character-encoding schemes. 
* Encoding schemes are often associated with a particular coded character set; * UTF-8, for example, is used only to encode Unicode.  Some schemes, however, * are associated with multiple character sets; EUC, for example, can be used * to encode characters in a variety of Asian character sets. * * <p> When a coded character set is used exclusively with a single * character-encoding scheme then the corresponding charset is usually named * for the character set; otherwise a charset is usually named for the encoding * scheme and, possibly, the locale of the character sets that it supports. * Hence <tt>US-ASCII</tt> is the name of the charset for US-ASCII while * <tt>EUC-JP</tt> is the name of the charset that encodes the * JIS&nbsp;X&nbsp;0201, JIS&nbsp;X&nbsp;0208, and JIS&nbsp;X&nbsp;0212 * character sets. * * <p> The native coded character set of the Java programming language is that * of the first seventeen planes of the Unicode version&nbsp;3.0 character set; * that is, it consists in the <i>basic multilingual plane</i> (BMP) of Unicode * version&nbsp;1 plus the next sixteen planes of Unicode version&nbsp;3.  This * is because the language's internal representation of characters uses the * UTF-16 encoding, which encodes the BMP directly and uses <i>surrogate * pairs</i>, a simple escape mechanism, to encode the other planes.  Hence a * charset in the Java platform defines a mapping between sequences of * sixteen-bit values in UTF-16 and sequences of bytes. </p> * * * @author Mark Reinhold * @author JSR-51 Expert Group * @version 1.40, 05/01/15 * @since 1.4 * * @see CharsetDecoder * @see CharsetEncoder * @see java.nio.charset.spi.CharsetProvider */public abstract class Charset    implements Comparable{    /* -- Static methods -- */    /**     * Checks that the given string is a legal charset name. 
</p>
     *
     * <p> A legal name is non-empty, begins with an ASCII letter or digit, and
     * otherwise consists only of ASCII letters, digits, and the characters
     * <tt>'-'</tt>, <tt>'.'</tt>, <tt>':'</tt>, and <tt>'_'</tt>. </p>
     *
     * @param  s
     *         A purported charset name
     *
     * @throws  IllegalCharsetNameException
     *          If the given name is empty, does not begin with a letter or
     *          a digit, or contains a character that is not legal in a
     *          charset name
     */
    private static void checkName(String s) {
        int n = s.length();

        // The empty string is never a legal charset name; without this check
        // the loop below would simply not run and the name would be accepted.
        if (n == 0) {
            throw new IllegalCharsetNameException(s);
        }

        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);
            if (c >= 'A' && c <= 'Z') continue;
            if (c >= 'a' && c <= 'z') continue;
            if (c >= '0' && c <= '9') continue;

            // Punctuation is legal only after the first character, because a
            // charset name must begin with a letter or a digit.
            if (i != 0 && (c == '-' || c == ':' || c == '_' || c == '.')) continue;

            throw new IllegalCharsetNameException(s);
        }
    }

    /* The standard set of charsets */
    private static CharsetProvider standardProvider = new StandardCharsets();

    // Cache of the most-recently-returned charsets,
    // along with the names that were used to find them
    //
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -