📄 rfc2640.txt
字号:
// to load UTF8 values
while (ucs4_buf != ucs4_endbuf)
{
if ( *ucs4_buf <= 0x7F) // ASCII chars no conversion needed
{
*t_utf8_buf++ = (unsigned char) *ucs4_buf;
utf8_len++;
ucs4_buf++;
}
else
if ( *ucs4_buf <= 0x07FF ) // In the 2 byte utf-8 range
{
*t_utf8_buf++= (unsigned char) (0xC0 + (*ucs4_buf/0x40));
*t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
utf8_len+=2;
ucs4_buf++;
}
else
if ( *ucs4_buf <= 0xFFFF ) /* In the 3 byte utf-8 range. The
values 0x0000FFFE, 0x0000FFFF
and 0x0000D800 - 0x0000DFFF do
not occur in UCS-4 */
{
*t_utf8_buf++= (unsigned char) (0xE0 +
(*ucs4_buf/0x1000));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x40)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
utf8_len+=3;
ucs4_buf++;
}
else
if ( *ucs4_buf <= 0x1FFFFF ) //In the 4 byte utf-8 range
{
*t_utf8_buf++= (unsigned char) (0xF0 +
(*ucs4_buf/0x040000));
Curtin Proposed Standard [Page 21]
RFC 2640 FTP Internalization July 1999
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x10000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x40)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
utf8_len+=4;
ucs4_buf++;
}
else
if ( *ucs4_buf <= 0x03FFFFFF )//In the 5 byte utf-8 range
{
*t_utf8_buf++= (unsigned char) (0xF8 +
(*ucs4_buf/0x01000000));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x040000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x1000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x40)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
(*ucs4_buf%0x40));
utf8_len+=5;
ucs4_buf++;
}
else
if ( *ucs4_buf <= 0x7FFFFFFF )//In the 6 byte utf-8 range
{
*t_utf8_buf++= (unsigned char)
(0xF8 +(*ucs4_buf/0x40000000));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x01000000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x040000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x1000)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
((*ucs4_buf/0x40)%0x40));
*t_utf8_buf++= (unsigned char) (0x80 +
(*ucs4_buf%0x40));
utf8_len+=6;
ucs4_buf++;
}
}
return (utf8_len);
}
Curtin Proposed Standard [Page 22]
RFC 2640 FTP Internalization July 1999
B.2.2 Conversion from UTF-8 to Local Character Set
When moving from UTF-8 encoding to the local character set the
reverse procedure is used. First the UTF-8 encoding is transformed
into the UCS-4 character set. The UCS-4 is then converted to the
local character set from a mapping table (i.e. the opposite of the
table used to form the UCS-4 character code).
To convert from UTF-8 to UCS-4 the free bits (those that do not
define UTF-8 sequence size or signify continuation bytes) in a UTF-8
sequence are concatenated as a bit string. The bits are then
distributed into a four-byte sequence starting from the least
significant bits. Those bits not assigned a bit in the four-byte
sequence are padded with ZERO bits. The following routine converts
the UTF-8 encoding to UCS-4 character codes:
/*
 * utf8_to_ucs4 - decode a buffer of UTF-8 bytes into UCS-4 character codes.
 *
 * ucs4_buf  destination array; one element is written per decoded
 *           character.  The caller must supply room for up to utf8_len
 *           elements (the worst case is input made only of ASCII bytes).
 * utf8_len  number of bytes available at utf8_buf.
 * utf8_buf  UTF-8 input; sequences of 1 to 6 bytes are accepted.
 *
 * Returns the number of UCS-4 characters written to ucs4_buf, or -1 when
 * the input is malformed:
 *   - a byte that cannot start a sequence (0x80-0xBF, 0xFE, 0xFF),
 *   - a sequence cut short by the end of the buffer,
 *   - a trail byte that is not of the form 10xxxxxx.
 * On a -1 return the characters decoded before the bad sequence have
 * already been stored in ucs4_buf.
 *
 * Overlong encodings and surrogate code points are not rejected here.
 */
int utf8_to_ucs4 (unsigned long *ucs4_buf, unsigned int utf8_len,
                  unsigned char *utf8_buf)
{
    const unsigned char *utf8_endbuf = utf8_buf + utf8_len;
    unsigned int ucs_len = 0;

    while (utf8_buf != utf8_endbuf)
    {
        const unsigned char lead = *utf8_buf;
        unsigned int seq_len;   /* total bytes in this sequence */
        unsigned long ch;       /* free bits collected so far */
        unsigned int i;

        /* The lead byte fixes the sequence length and carries the most
           significant free bits of the character. */
        if ((lead & 0x80) == 0x00)        /* ASCII, no conversion needed */
        {
            seq_len = 1;
            ch = lead;
        }
        else if ((lead & 0xE0) == 0xC0)   /* 2 byte utf-8 range */
        {
            seq_len = 2;
            ch = lead & 0x1F;
        }
        else if ((lead & 0xF0) == 0xE0)   /* 3 byte utf-8 range */
        {
            seq_len = 3;
            ch = lead & 0x0F;
        }
        else if ((lead & 0xF8) == 0xF0)   /* 4 byte utf-8 range */
        {
            seq_len = 4;
            ch = lead & 0x07;
        }
        else if ((lead & 0xFC) == 0xF8)   /* 5 byte utf-8 range */
        {
            seq_len = 5;
            ch = lead & 0x03;
        }
        else if ((lead & 0xFE) == 0xFC)   /* 6 byte utf-8 range */
        {
            seq_len = 6;
            ch = lead & 0x01;
        }
        else
        {
            /* Stray continuation byte, 0xFE or 0xFF: without this exit
               the pointer would never advance and the loop would spin. */
            return -1;
        }

        /* Refuse a sequence that runs past the end of the input instead
           of reading beyond the buffer. */
        if ((unsigned int) (utf8_endbuf - utf8_buf) < seq_len)
        {
            return -1;
        }

        /* Each trail byte contributes six more bits.  Shifting by six per
           byte gives the weights 2^6, 2^12, 2^18, 2^24 and 2^30, so no
           per-branch multiplier table is needed. */
        for (i = 1; i < seq_len; i++)
        {
            if ((utf8_buf[i] & 0xC0) != 0x80)
            {
                return -1;
            }
            ch = (ch << 6) | (unsigned long) (utf8_buf[i] & 0x3F);
        }

        *ucs4_buf++ = ch;
        utf8_buf += seq_len;
        ucs_len++;
    }

    return (int) ucs_len;
}
Curtin Proposed Standard [Page 24]
RFC 2640 FTP Internalization July 1999
B.2.3 ISO/IEC 8859-8 Example
This example demonstrates mapping ISO/IEC 8859-8 character set to
UTF-8 and back to ISO/IEC 8859-8. As noted earlier, the Hebrew letter
"VAV" is converted from the ISO/IEC 8859-8 character code 0xE4 to the
corresponding 4 byte ISO/IEC 10646 code of 0x000005D5 by a simple
lookup of a conversion/mapping file.
The UCS-4 character code is transformed into UTF-8 using the
ucs4_to_utf8 routine described earlier by:
1. Because the UCS-4 character is between 0x80 and 0x07FF it will map
to a 2 byte UTF-8 sequence.
2. The first byte is defined by (0xC0 + (0x000005D5 / 0x40)) = 0xD7.
3. The second byte is defined by (0x80 + (0x000005D5 % 0x40)) = 0x95.
The UTF-8 encoding is transferred back to UCS-4 by using the
utf8_to_ucs4 routine described earlier by:
1. Because the first byte of the sequence, when the '&' operator with
a value of 0xE0 is applied, will produce 0xC0 (0xD7 & 0xE0 = 0xC0)
the UTF-8 is a 2 byte sequence.
2. The four byte UCS-4 character code is produced by (((0xD7 - 0xC0)
* 0x40) + (0x95 -0x80)) = 0x000005D5.
Finally, the UCS-4 character code is converted to ISO/IEC 8859-8
character code (using the mapping table which matches ISO/IEC 8859-8
to UCS-4 ) to produce the original 0xE4 code for the Hebrew letter
"VAV".
B.2.4 Vendor Codepage Example
This example demonstrates the mapping of a codepage to UTF-8 and back
to a vendor codepage. Mapping between vendor codepages can be done in
a very similar manner as described above. For instance both the PC
and Mac codepages reflect the character set from the Thai standard
TIS 620-2533. The character code on both platforms for the Thai
letter "SO SO" is 0xAB. This character can then be mapped into the
UCS-4 by way of a conversion/mapping file to produce the UCS-4 code
of 0x0E0B.
The UCS-4 character code is transformed into UTF-8 using the
ucs4_to_utf8 routine described earlier by:
1. Because the UCS-4 character is between 0x0800 and 0xFFFF it will
map to a 3 byte UTF-8 sequence.
2. The first byte is defined by (0xE0 + (0x00000E0B / 0x1000)) = 0xE0.
Curtin Proposed Standard [Page 25]
RFC 2640 FTP Internalization July 1999
3. The second byte is defined by (0x80 + ((0x00000E0B / 0x40) %
0x40)) = 0xB8.
4. The third byte is defined by (0x80 + (0x00000E0B % 0x40)) = 0x8B.
The UTF-8 encoding is transferred back to UCS-4 by using the
utf8_to_ucs4 routine described earlier by:
1. Because the first byte of the sequence, when the '&' operator with
a value of 0xF0 is applied, will produce 0xE0 (0xE0 & 0xF0 = 0xE0)
the UTF-8 is a 3 byte sequence.
2. The four byte UCS-4 character code is produced by (((0xE0 - 0xE0)
* 0x1000) + ((0xB8 - 0x80) * 0x40) + (0x8B - 0x80)) = 0x00000E0B.
Finally, the UCS-4 character code is converted to either the PC or
MAC codepage character code (using the mapping table which matches
codepage to UCS-4 ) to produce the original 0xAB code for the Thai
letter "SO SO".
B.3 Pseudo Code for a High-Quality Translating Server
if utf8_valid(fn)
{
attempt to convert fn to the local charset, producing localfn
if (conversion fails temporarily) return error
if (conversion succeeds)
{
attempt to open localfn
if (open fails temporarily) return error
if (open succeeds) return success
}
}
attempt to open fn
if (open fails temporarily) return error
if (open succeeds) return success
return permanent error
Curtin Proposed Standard [Page 26]
RFC 2640 FTP Internalization July 1999
Full Copyright Statement
Copyright (C) The Internet Society (1999). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Curtin Proposed Standard [Page 27]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -