📄 rfc2640.txt

📁 著名的RFC文档,其中有一些文档是已经翻译成中文的.
💻 TXT
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                                          // to load UTF8 values    while (ucs4_buf != ucs4_endbuf)    {     if ( *ucs4_buf <= 0x7F)    // ASCII chars no conversion needed     {      *t_utf8_buf++ = (unsigned char) *ucs4_buf;      utf8_len++;      ucs4_buf++;     }     else      if ( *ucs4_buf <= 0x07FF ) // In the 2 byte utf-8 range      {        *t_utf8_buf++= (unsigned char) (0xC0 + (*ucs4_buf/0x40));        *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));        utf8_len+=2;        ucs4_buf++;      }      else        if ( *ucs4_buf <= 0xFFFF ) /* In the 3 byte utf-8 range. The                                    values 0x0000FFFE, 0x0000FFFF                                    and 0x0000D800 - 0x0000DFFF do                                    not occur in UCS-4 */        {         *t_utf8_buf++= (unsigned char) (0xE0 +                        (*ucs4_buf/0x1000));         *t_utf8_buf++= (unsigned char) (0x80 +                        ((*ucs4_buf/0x40)%0x40));         *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));         utf8_len+=3;         ucs4_buf++;         }        else         if ( *ucs4_buf <= 0x1FFFFF ) //In the 4 byte utf-8 range         {          *t_utf8_buf++= (unsigned char) (0xF0 +                         (*ucs4_buf/0x040000));Curtin                     Proposed Standard                   [Page 21]RFC 2640                  FTP Internalization                  July 1999          *t_utf8_buf++= (unsigned char) (0x80 +                         ((*ucs4_buf/0x10000)%0x40));          *t_utf8_buf++= (unsigned char) (0x80 +                         ((*ucs4_buf/0x40)%0x40));          *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));          utf8_len+=4;          ucs4_buf++;         }         else          if ( *ucs4_buf <= 0x03FFFFFF )//In the 5 byte utf-8 range          {           *t_utf8_buf++= (unsigned char) (0xF8 +                          (*ucs4_buf/0x01000000));           *t_utf8_buf++= (unsigned char) (0x80 +                   
       ((*ucs4_buf/0x040000)%0x40));           *t_utf8_buf++= (unsigned char) (0x80 +                          ((*ucs4_buf/0x1000)%0x40));           *t_utf8_buf++= (unsigned char) (0x80 +                          ((*ucs4_buf/0x40)%0x40));           *t_utf8_buf++= (unsigned char) (0x80 +                          (*ucs4_buf%0x40));           utf8_len+=5;           ucs4_buf++;          }          else          if ( *ucs4_buf <= 0x7FFFFFFF )//In the 6 byte utf-8 range           {             *t_utf8_buf++= (unsigned char)                            (0xF8 +(*ucs4_buf/0x40000000));             *t_utf8_buf++= (unsigned char) (0x80 +                            ((*ucs4_buf/0x01000000)%0x40));             *t_utf8_buf++= (unsigned char) (0x80 +                            ((*ucs4_buf/0x040000)%0x40));             *t_utf8_buf++= (unsigned char) (0x80 +                            ((*ucs4_buf/0x1000)%0x40));             *t_utf8_buf++= (unsigned char) (0x80 +                            ((*ucs4_buf/0x40)%0x40));             *t_utf8_buf++= (unsigned char) (0x80 +                            (*ucs4_buf%0x40));             utf8_len+=6;             ucs4_buf++;           }    }    return (utf8_len);   }Curtin                     Proposed Standard                   [Page 22]RFC 2640                  FTP Internalization                  July 1999B.2.2 Conversion from UTF-8 to Local Character Set   When moving from UTF-8 encoding to the local character set the   reverse procedure is used. First the UTF-8 encoding is transformed   into the UCS-4 character set. The UCS-4 is then converted to the   local character set from a mapping table (i.e. the opposite of the   table used to form the UCS-4 character code).   To convert from UTF-8 to UCS-4 the free bits (those that do not   define UTF-8 sequence size or signify continuation bytes) in a UTF-8   sequence are concatenated as a bit string. The bits are then   distributed into a four-byte sequence starting from the least   significant bits. 
/*
   Those bits not assigned a bit in the four-byte
   sequence are padded with ZERO bits. The following routine converts
   the UTF-8 encoding to UCS-4 character codes:
*/

/*
 * Return the total length in bytes (1..6) of the UTF-8 sequence that the
 * lead byte `lead` introduces, or 0 when `lead` cannot start a sequence
 * (a continuation byte 0x80..0xBF, or 0xFE / 0xFF).
 */
static unsigned int utf8_sequence_length (unsigned char lead)
{
  if ((lead & 0x80) == 0x00) { return 1; }   /* 0xxxxxxx: ASCII      */
  if ((lead & 0xE0) == 0xC0) { return 2; }   /* 110xxxxx             */
  if ((lead & 0xF0) == 0xE0) { return 3; }   /* 1110xxxx             */
  if ((lead & 0xF8) == 0xF0) { return 4; }   /* 11110xxx             */
  if ((lead & 0xFC) == 0xF8) { return 5; }   /* 111110xx             */
  if ((lead & 0xFE) == 0xFC) { return 6; }   /* 1111110x             */
  return 0;
}

/*
 * Convert a UTF-8 byte string into UCS-4 character codes.
 *
 * ucs4_buf  Output array. The routine cannot check its capacity; the
 *           caller must provide room for at least utf8_len elements
 *           (the worst case, where every input byte is ASCII).
 * utf8_len  Number of bytes available in utf8_buf.
 * utf8_buf  Input bytes; not modified.
 *
 * Returns the number of UCS-4 codes stored in ucs4_buf, or -1 when the
 * input is malformed: a byte that cannot start a sequence, a sequence cut
 * short by the end of the buffer, or a continuation byte that is not of
 * the form 10xxxxxx. On -1 the codes decoded before the error have already
 * been stored.
 *
 * NOTE(review): over-long (non-shortest) encodings and the surrogate range
 * 0xD800-0xDFFF are decoded, not rejected. Callers that compare decoded
 * path names for security purposes should validate them separately.
 */
int utf8_to_ucs4 (unsigned long *ucs4_buf, unsigned int utf8_len,
                  unsigned char *utf8_buf)
{
  /* Payload bits carried by the lead byte, indexed by sequence length. */
  static const unsigned char lead_payload_mask[7] =
    { 0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
  const unsigned char *in;
  const unsigned char *end;
  unsigned int ucs_len = 0;

  if (utf8_len == 0)
  {
    return 0;
  }

  in = utf8_buf;
  end = utf8_buf + utf8_len;

  /* '<' rather than '!=': the cursor can never step over the end, but the
     ordered comparison keeps the loop safe by construction. */
  while (in < end)
  {
    unsigned int seq_len = utf8_sequence_length (*in);
    unsigned long code;
    unsigned int i;

    /* Reject bytes that cannot start a sequence and sequences that would
       run past the end of the input. */
    if (seq_len == 0 || (unsigned long) (end - in) < seq_len)
    {
      return -1;
    }

    code = (unsigned long) (*in & lead_payload_mask[seq_len]);

    /* Each continuation byte contributes its low six bits. Accumulating
       in unsigned long avoids the int overflow of multiplying by large
       powers of two, and gives the second byte of a 6 byte sequence its
       correct weight of 0x01000000. */
    for (i = 1; i < seq_len; i++)
    {
      if ((in[i] & 0xC0) != 0x80)
      {
        return -1;
      }
      code = (code << 6) | (unsigned long) (in[i] & 0x3F);
    }

    *ucs4_buf++ = code;
    in += seq_len;
    ucs_len++;
  }

  return (int) ucs_len;
}

/*
Curtin                     Proposed Standard                   [Page 24]
RFC 2640                  FTP Internalization                  July 1999

B.2.3 ISO/IEC 8859-8 Example

   This example demonstrates mapping ISO/IEC 8859-8 character set to
   UTF-8 and back to ISO/IEC 8859-8. As noted earlier, the Hebrew letter
   "VAV" is converted from the ISO/IEC 8859-8 character code 0xE4 to the
   corresponding 4 byte ISO/IEC 10646 code of 0x000005D5 by a simple
   lookup of a conversion/mapping file.

   The UCS-4 character code is transformed into UTF-8 using the
   ucs4_to_utf8 routine described earlier by:

   1. Because the UCS-4 character is between 0x80 and 0x07FF it will map
      to a 2 byte UTF-8 sequence.

   2. The first byte is defined by (0xC0 + (0x000005D5 / 0x40)) = 0xD7.

   3. The second byte is defined by (0x80 + (0x000005D5 % 0x40)) = 0x95.

   The UTF-8 encoding is transferred back to UCS-4 by using the
   utf8_to_ucs4 routine described earlier by:

   1.
*/
Because the first byte of the sequence, when the '&' operator with      a value of 0xE0 is applied, will produce 0xC0 (0xD7 & 0xE0 = 0xC0)      the UTF-8 is a 2 byte sequence.   2. The four byte UCS-4 character code is produced by (((0xD7 - 0xC0)      * 0x40) + (0x95 - 0x80)) = 0x000005D5.   Finally, the UCS-4 character code is converted to ISO/IEC 8859-8   character code (using the mapping table which matches ISO/IEC 8859-8   to UCS-4 ) to produce the original 0xE4 code for the Hebrew letter   "VAV".   B.2.4 Vendor Codepage Example   This example demonstrates the mapping of a codepage to UTF-8 and back   to a vendor codepage. Mapping between vendor codepages can be done in   a very similar manner as described above. For instance both the PC   and Mac codepages reflect the character set from the Thai standard   TIS 620-2533. The character code on both platforms for the Thai   letter "SO SO" is 0xAB. This character can then be mapped into the   UCS-4 by way of a conversion/mapping file to produce the UCS-4 code   of 0x0E0B.   The UCS-4 character code is transformed into UTF-8 using the   ucs4_to_utf8 routine described earlier by:   1. Because the UCS-4 character is between 0x0800 and 0xFFFF it will      map to a 3 byte UTF-8 sequence.   2. The first byte is defined by (0xE0 + (0x00000E0B / 0x1000)) = 0xE0.   Curtin                     Proposed Standard                   [Page 25]   RFC 2640                  FTP Internalization                  July 1999   3. The second byte is defined by (0x80 + ((0x00000E0B / 0x40) %      0x40)) = 0xB8.   4. The third byte is defined by (0x80 + (0x00000E0B % 0x40)) = 0x8B.   The UTF-8 encoding is transferred back to UCS-4 by using the   utf8_to_ucs4 routine described earlier by:   1. Because the first byte of the sequence, when the '&' operator with      a value of 0xF0 is applied, will produce 0xE0 (0xE0 & 0xF0 = 0xE0)      the UTF-8 is a 3 byte sequence.   2. 
The four byte UCS-4 character code is produced by (((0xE0 - 0xE0)      * 0x1000) + ((0xB8 - 0x80) * 0x40) + (0x8B - 0x80)) = 0x00000E0B.   Finally, the UCS-4 character code is converted to either the PC or   MAC codepage character code (using the mapping table which matches   codepage to UCS-4 ) to produce the original 0xAB code for the Thai   letter "SO SO".   B.3 Pseudo Code for a High-Quality Translating Server   if utf8_valid(fn)     {     attempt to convert fn to the local charset, producing localfn     if (conversion fails temporarily) return error     if (conversion succeeds)     {       attempt to open localfn       if (open fails temporarily) return error       if (open succeeds) return success     }     }   attempt to open fn   if (open fails temporarily) return error   if (open succeeds) return success   return permanent error   Curtin                     Proposed Standard                   [Page 26]   RFC 2640                  FTP Internalization                  July 1999   Full Copyright Statement   Copyright (C) The Internet Society (1999).  All Rights Reserved.   This document and translations of it may be copied and furnished to   others, and derivative works that comment on or otherwise explain it   or assist in its implementation may be prepared, copied, published   and distributed, in whole or in part, without restriction of any   kind, provided that the above copyright notice and this paragraph are   included on all such copies and derivative works.  However, this   document itself may not be modified in any way, such as by removing   the copyright notice or references to the Internet Society or other   Internet organizations, except as needed for the purpose of   developing Internet standards in which case the procedures for   copyrights defined in the Internet Standards process must be   followed, or as required to translate it into languages other than   English.   
The limited permissions granted above are perpetual and will not be   revoked by the Internet Society or its successors or assigns.   This document and the information contained herein is provided on an   "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING   TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING   BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION   HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.Acknowledgement   Funding for the RFC Editor function is currently provided by the   Internet Society.Curtin                     Proposed Standard                   [Page 27]
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -