📄 rfc2640.txt
字号:
// to load UTF8 values while (ucs4_buf != ucs4_endbuf) { if ( *ucs4_buf <= 0x7F) // ASCII chars no conversion needed { *t_utf8_buf++ = (unsigned char) *ucs4_buf; utf8_len++; ucs4_buf++; } else if ( *ucs4_buf <= 0x07FF ) // In the 2 byte utf-8 range { *t_utf8_buf++= (unsigned char) (0xC0 + (*ucs4_buf/0x40)); *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40)); utf8_len+=2; ucs4_buf++; } else if ( *ucs4_buf <= 0xFFFF ) /* In the 3 byte utf-8 range. The values 0x0000FFFE, 0x0000FFFF and 0x0000D800 - 0x0000DFFF do not occur in UCS-4 */ { *t_utf8_buf++= (unsigned char) (0xE0 + (*ucs4_buf/0x1000)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x40)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40)); utf8_len+=3; ucs4_buf++; } else if ( *ucs4_buf <= 0x1FFFFF ) //In the 4 byte utf-8 range { *t_utf8_buf++= (unsigned char) (0xF0 + (*ucs4_buf/0x040000));Curtin Proposed Standard [Page 21]RFC 2640 FTP Internalization July 1999 *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x10000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x40)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40)); utf8_len+=4; ucs4_buf++; } else if ( *ucs4_buf <= 0x03FFFFFF )//In the 5 byte utf-8 range { *t_utf8_buf++= (unsigned char) (0xF8 + (*ucs4_buf/0x01000000)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x040000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x1000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x40)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40)); utf8_len+=5; ucs4_buf++; } else if ( *ucs4_buf <= 0x7FFFFFFF )//In the 6 byte utf-8 range { *t_utf8_buf++= (unsigned char) (0xF8 +(*ucs4_buf/0x40000000)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x01000000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x040000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x1000)%0x40)); *t_utf8_buf++= (unsigned char) (0x80 + ((*ucs4_buf/0x40)%0x40)); 
*t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40)); utf8_len+=6; ucs4_buf++; } } return (utf8_len); }Curtin Proposed Standard [Page 22]RFC 2640 FTP Internalization July 1999B.2.2 Conversion from UTF-8 to Local Character Set When moving from UTF-8 encoding to the local character set the reverse procedure is used. First the UTF-8 encoding is transformed into the UCS-4 character set. The UCS-4 is then converted to the local character set from a mapping table (i.e. the opposite of the table used to form the UCS-4 character code). To convert from UTF-8 to UCS-4 the free bits (those that do not define UTF-8 sequence size or signify continuation bytes) in a UTF-8 sequence are concatenated as a bit string. The bits are then distributed into a four-byte sequence starting from the least significant bits. Those bits not assigned a bit in the four-byte sequence are padded with ZERO bits. The following routine converts the UTF-8 encoding to UCS-4 character codes: int utf8_to_ucs4 (unsigned long *ucs4_buf, unsigned int utf8_len, unsigned char *utf8_buf) { const unsigned char *utf8_endbuf = utf8_buf + utf8_len; unsigned int ucs_len=0; while (utf8_buf != utf8_endbuf) { if ((*utf8_buf & 0x80) == 0x00) /*ASCII chars no conversion needed */ { *ucs4_buf++ = (unsigned long) *utf8_buf; utf8_buf++; ucs_len++; } else if ((*utf8_buf & 0xE0)== 0xC0) //In the 2 byte utf-8 range { *ucs4_buf++ = (unsigned long) (((*utf8_buf - 0xC0) * 0x40) + ( *(utf8_buf+1) - 0x80)); utf8_buf += 2; ucs_len++; } else if ( (*utf8_buf & 0xF0) == 0xE0 ) /*In the 3 byte utf-8 range */ { *ucs4_buf++ = (unsigned long) (((*utf8_buf - 0xE0) * 0x1000) + (( *(utf8_buf+1) - 0x80) * 0x40) + ( *(utf8_buf+2) - 0x80));Curtin Proposed Standard [Page 23]RFC 2640 FTP Internalization July 1999 utf8_buf+=3; ucs_len++; } else if ((*utf8_buf & 0xF8) == 0xF0) /* In the 4 byte utf-8 range */ { *ucs4_buf++ = (unsigned long) (((*utf8_buf - 0xF0) * 0x040000) + (( *(utf8_buf+1) - 0x80) * 0x1000) + (( *(utf8_buf+2) - 0x80) * 0x40) + 
( *(utf8_buf+3) - 0x80)); utf8_buf+=4; ucs_len++; } else if ((*utf8_buf & 0xFC) == 0xF8) /* In the 5 byte utf-8 range */ { *ucs4_buf++ = (unsigned long) (((*utf8_buf - 0xF8) * 0x01000000) + ((*(utf8_buf+1) - 0x80) * 0x040000) + (( *(utf8_buf+2) - 0x80) * 0x1000) + (( *(utf8_buf+3) - 0x80) * 0x40) + ( *(utf8_buf+4) - 0x80)); utf8_buf+=5; ucs_len++; } else if ((*utf8_buf & 0xFE) == 0xFC) /* In the 6 byte utf-8 range */ { *ucs4_buf++ = (unsigned long) (((*utf8_buf - 0xFC) * 0x40000000) + ((*(utf8_buf+1) - 0x80) * 0x010000000) + ((*(utf8_buf+2) - 0x80) * 0x040000) + (( *(utf8_buf+3) - 0x80) * 0x1000) + (( *(utf8_buf+4) - 0x80) * 0x40) + ( *(utf8_buf+5) - 0x80)); utf8_buf+=6; ucs_len++; } } return (ucs_len); } Curtin Proposed Standard [Page 24] RFC 2640 FTP Internalization July 1999 B.2.3 ISO/IEC 8859-8 Example This example demonstrates mapping ISO/IEC 8859-8 character set to UTF-8 and back to ISO/IEC 8859-8. As noted earlier, the Hebrew letter "VAV" is converted from the ISO/IEC 8859-8 character code 0xE4 to the corresponding 4 byte ISO/IEC 10646 code of 0x000005D5 by a simple lookup of a conversion/mapping file. The UCS-4 character code is transformed into UTF-8 using the ucs4_to_utf8 routine described earlier by: 1. Because the UCS-4 character is between 0x80 and 0x07FF it will map to a 2 byte UTF-8 sequence. 2. The first byte is defined by (0xC0 + (0x000005D5 / 0x40)) = 0xD7. 3. The second byte is defined by (0x80 + (0x000005D5 % 0x40)) = 0x95. The UTF-8 encoding is transferred back to UCS-4 by using the utf8_to_ucs4 routine described earlier by: 1. Because the first byte of the sequence, when the '&' operator with a value of 0xE0 is applied, will produce 0xC0 (0xD7 & 0xE0 = 0xC0) the UTF-8 is a 2 byte sequence. 2. The four byte UCS-4 character code is produced by (((0xD7 - 0xC0) * 0x40) + (0x95 -0x80)) = 0x000005D5. 
Finally, the UCS-4 character code is converted to ISO/IEC 8859-8 character code (using the mapping table which matches ISO/IEC 8859-8 to UCS-4 ) to produce the original 0xE4 code for the Hebrew letter "VAV". B.2.4 Vendor Codepage Example This example demonstrates the mapping of a codepage to UTF-8 and back to a vendor codepage. Mapping between vendor codepages can be done in a very similar manner as described above. For instance both the PC and Mac codepages reflect the character set from the Thai standard TIS 620-2533. The character code on both platforms for the Thai letter "SO SO" is 0xAB. This character can then be mapped into the UCS-4 by way of a conversion/mapping file to produce the UCS-4 code of 0x0E0B. The UCS-4 character code is transformed into UTF-8 using the ucs4_to_utf8 routine described earlier by: 1. Because the UCS-4 character is between 0x0800 and 0xFFFF it will map to a 3 byte UTF-8 sequence. 2. The first byte is defined by (0xE0 + (0x00000E0B / 0x1000)) = 0xE0. Curtin Proposed Standard [Page 25] RFC 2640 FTP Internalization July 1999 3. The second byte is defined by (0x80 + ((0x00000E0B / 0x40) % 0x40)) = 0xB8. 4. The third byte is defined by (0x80 + (0x00000E0B % 0x40)) = 0x8B. The UTF-8 encoding is transferred back to UCS-4 by using the utf8_to_ucs4 routine described earlier by: 1. Because the first byte of the sequence, when the '&' operator with a value of 0xF0 is applied, will produce 0xE0 (0xE0 & 0xF0 = 0xE0) the UTF-8 is a 3 byte sequence. 2. The four byte UCS-4 character code is produced by (((0xE0 - 0xE0) * 0x1000) + ((0xB8 - 0x80) * 0x40) + (0x8B - 0x80)) = 0x00000E0B. 
Finally, the UCS-4 character code is converted to either the PC or MAC codepage character code (using the mapping table which matches codepage to UCS-4 ) to produce the original 0xAB code for the Thai letter "SO SO".B.3 Pseudo Code for a High-Quality Translating Server if utf8_valid(fn) { attempt to convert fn to the local charset, producing localfn if (conversion fails temporarily) return error if (conversion succeeds) { attempt to open localfn if (open fails temporarily) return error if (open succeeds) return success } } attempt to open fn if (open fails temporarily) return error if (open succeeds) return success return permanent errorCurtin Proposed Standard [Page 26]RFC 2640 FTP Internalization July 1999Full Copyright Statement Copyright (C) The Internet Society (1999). All Rights Reserved. This document and translations of it may be copied and furnished to others, and derivative works that comment on or otherwise explain it or assist in its implementation may be prepared, copied, published and distributed, in whole or in part, without restriction of any kind, provided that the above copyright notice and this paragraph are included on all such copies and derivative works. However, this document itself may not be modified in any way, such as by removing the copyright notice or references to the Internet Society or other Internet organizations, except as needed for the purpose of developing Internet standards in which case the procedures for copyrights defined in the Internet Standards process must be followed, or as required to translate it into languages other than English. The limited permissions granted above are perpetual and will not be revoked by the Internet Society or its successors or assigns. 
This document and the information contained herein is provided on an "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Acknowledgement Funding for the RFC Editor function is currently provided by the Internet Society. Curtin Proposed Standard [Page 27]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -