📄 rfc2640.txt

📁 RFC 的详细文档！
💻 TXT
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                                          // to load UTF8 values

    while (ucs4_buf != ucs4_endbuf)
    {
     if ( *ucs4_buf <= 0x7F)    // ASCII chars no conversion needed
     {
      *t_utf8_buf++ = (unsigned char) *ucs4_buf;
      utf8_len++;
      ucs4_buf++;
     }
     else
      if ( *ucs4_buf <= 0x07FF ) // In the 2 byte utf-8 range
      {
        *t_utf8_buf++= (unsigned char) (0xC0 + (*ucs4_buf/0x40));
        *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
        utf8_len+=2;
        ucs4_buf++;
      }
      else
        if ( *ucs4_buf <= 0xFFFF ) /* In the 3 byte utf-8 range. The
                                    values 0x0000FFFE, 0x0000FFFF
                                    and 0x0000D800 - 0x0000DFFF do
                                    not occur in UCS-4 */
        {
         *t_utf8_buf++= (unsigned char) (0xE0 +
                        (*ucs4_buf/0x1000));
         *t_utf8_buf++= (unsigned char) (0x80 +
                        ((*ucs4_buf/0x40)%0x40));
         *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
         utf8_len+=3;
         ucs4_buf++;
         }
        else
         if ( *ucs4_buf <= 0x1FFFFF ) //In the 4 byte utf-8 range
         {
          *t_utf8_buf++= (unsigned char) (0xF0 +
                         (*ucs4_buf/0x040000));



Curtin                     Proposed Standard                   [Page 21]

RFC 2640                  FTP Internalization                  July 1999


          *t_utf8_buf++= (unsigned char) (0x80 +
                         ((*ucs4_buf/0x10000)%0x40));
          *t_utf8_buf++= (unsigned char) (0x80 +
                         ((*ucs4_buf/0x40)%0x40));
          *t_utf8_buf++= (unsigned char) (0x80 + (*ucs4_buf%0x40));
          utf8_len+=4;
          ucs4_buf++;

         }
         else
          if ( *ucs4_buf <= 0x03FFFFFF )//In the 5 byte utf-8 range
          {
           *t_utf8_buf++= (unsigned char) (0xF8 +
                          (*ucs4_buf/0x01000000));
           *t_utf8_buf++= (unsigned char) (0x80 +
                          ((*ucs4_buf/0x040000)%0x40));
           *t_utf8_buf++= (unsigned char) (0x80 +
                          ((*ucs4_buf/0x1000)%0x40));
           *t_utf8_buf++= (unsigned char) (0x80 +
                          ((*ucs4_buf/0x40)%0x40));
           *t_utf8_buf++= (unsigned char) (0x80 +
                          (*ucs4_buf%0x40));
           utf8_len+=5;
           ucs4_buf++;
          }
          else
          if ( *ucs4_buf <= 0x7FFFFFFF )//In the 6 byte utf-8 range
           {
             *t_utf8_buf++= (unsigned char)
                            (0xF8 +(*ucs4_buf/0x40000000));
             *t_utf8_buf++= (unsigned char) (0x80 +
                            ((*ucs4_buf/0x01000000)%0x40));
             *t_utf8_buf++= (unsigned char) (0x80 +
                            ((*ucs4_buf/0x040000)%0x40));
             *t_utf8_buf++= (unsigned char) (0x80 +
                            ((*ucs4_buf/0x1000)%0x40));
             *t_utf8_buf++= (unsigned char) (0x80 +
                            ((*ucs4_buf/0x40)%0x40));
             *t_utf8_buf++= (unsigned char) (0x80 +
                            (*ucs4_buf%0x40));
             utf8_len+=6;
             ucs4_buf++;

           }
    }
    return (utf8_len);
   }




Curtin                     Proposed Standard                   [Page 22]

RFC 2640                  FTP Internalization                  July 1999


B.2.2 Conversion from UTF-8 to Local Character Set

   When moving from UTF-8 encoding to the local character set the
   reverse procedure is used. First the UTF-8 encoding is transformed
   into the UCS-4 character set. The UCS-4 is then converted to the
   local character set from a mapping table (i.e. the opposite of the
   table used to form the UCS-4 character code).

   To convert from UTF-8 to UCS-4 the free bits (those that do not
   define UTF-8 sequence size or signify continuation bytes) in a UTF-8
   sequence are concatenated as a bit string. The bits are then
   distributed into a four-byte sequence starting from the least
   significant bits. Those bits not assigned a bit in the four-byte
   sequence are padded with ZERO bits. The following routine converts
   the UTF-8 encoding to UCS-4 character codes:

   /*
    * Return the total length in bytes (1 to 6) of the UTF-8 sequence
    * introduced by the lead byte `lead`, or 0 when `lead` cannot start a
    * sequence (a continuation byte 10xxxxxx, or 0xFE / 0xFF).
    */
   static unsigned int utf8_seq_len (unsigned char lead)
   {
    if ((lead & 0x80) == 0x00) return 1;   /* 0xxxxxxx  ASCII          */
    if ((lead & 0xE0) == 0xC0) return 2;   /* 110xxxxx                 */
    if ((lead & 0xF0) == 0xE0) return 3;   /* 1110xxxx                 */
    if ((lead & 0xF8) == 0xF0) return 4;   /* 11110xxx                 */
    if ((lead & 0xFC) == 0xF8) return 5;   /* 111110xx                 */
    if ((lead & 0xFE) == 0xFC) return 6;   /* 1111110x                 */
    return 0;
   }

   /*
    * Convert a buffer of UTF-8 encoded text to UCS-4 character codes.
    *
    *   ucs4_buf  - output; receives one unsigned long per decoded
    *               character. The caller must provide room for up to
    *               utf8_len entries (the worst case is all-ASCII input).
    *   utf8_len  - number of bytes available in utf8_buf.
    *   utf8_buf  - input; the UTF-8 encoded bytes (not modified).
    *
    * Returns the number of UCS-4 characters written to ucs4_buf, or -1
    * if the input is malformed: a byte that cannot start a sequence, a
    * sequence cut short by the end of the buffer, or a continuation
    * byte that is not of the form 10xxxxxx. Characters decoded before
    * the malformed sequence have already been stored in ucs4_buf.
    */
   int utf8_to_ucs4 (unsigned long *ucs4_buf, unsigned int utf8_len,
                     unsigned char *utf8_buf)
   {
    /* Mask selecting the free (payload) bits of the lead byte, indexed
       by the total sequence length. Index 0 is unused. */
    static const unsigned char lead_mask[7] =
        { 0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };

    const unsigned char *utf8_endbuf = utf8_buf + utf8_len;
    unsigned int ucs_len = 0;

    while (utf8_buf != utf8_endbuf)
    {
     unsigned int seq_len = utf8_seq_len(*utf8_buf);
     unsigned long ucs4;
     unsigned int i;

     /* Reject bytes that cannot start a sequence, and sequences that
        would run past the end of the input. Without this check the
        loop would either never advance or step over utf8_endbuf. */
     if (seq_len == 0 ||
         seq_len > (unsigned int) (utf8_endbuf - utf8_buf))
      return (-1);

     /* Concatenate the free bits: lead byte first, then six bits from
        each continuation byte. Multiplying the running value by 0x40
        gives every byte its correct weight (0x40, 0x1000, 0x040000,
        0x01000000, 0x40000000) for all sequence lengths, and the
        arithmetic is done in unsigned long so it cannot overflow int. */
     ucs4 = (unsigned long) (*utf8_buf & lead_mask[seq_len]);
     for (i = 1; i < seq_len; i++)
     {
      if ((utf8_buf[i] & 0xC0) != 0x80)   /* not a continuation byte */
       return (-1);
      ucs4 = (ucs4 * 0x40) + (unsigned long) (utf8_buf[i] & 0x3F);
     }

     *ucs4_buf++ = ucs4;
     utf8_buf += seq_len;
     ucs_len++;
    }
    return ((int) ucs_len);
   }





Curtin                     Proposed Standard                   [Page 24]

RFC 2640                  FTP Internalization                  July 1999


B.2.3 ISO/IEC 8859-8 Example

   This example demonstrates mapping ISO/IEC 8859-8 character set to
   UTF-8 and back to ISO/IEC 8859-8. As noted earlier, the Hebrew letter
   "VAV" is converted from the ISO/IEC 8859-8 character code 0xE4 to the
   corresponding 4 byte ISO/IEC 10646 code of 0x000005D5 by a simple
   lookup of a conversion/mapping file.

   The UCS-4 character code is transformed into UTF-8 using the
   ucs4_to_utf8 routine described earlier by:

   1. Because the UCS-4 character is between 0x80 and 0x07FF it will map
      to a 2 byte UTF-8 sequence.
   2. The first byte is defined by (0xC0 + (0x000005D5 / 0x40)) = 0xD7.

   3. The second byte is defined by (0x80 + (0x000005D5 % 0x40)) = 0x95.

   The UTF-8 encoding is transferred back to UCS-4 by using the
   utf8_to_ucs4 routine described earlier by:

   1. Because the first byte of the sequence, when the '&' operator with
      a value of 0xE0 is applied, will produce 0xC0 (0xD7 & 0xE0 = 0xC0)
      the UTF-8 is a 2 byte sequence.
   2. The four byte UCS-4 character code is produced by (((0xD7 - 0xC0)
      * 0x40) + (0x95 -0x80)) = 0x000005D5.

   Finally, the UCS-4 character code is converted to ISO/IEC 8859-8
   character code (using the mapping table which matches ISO/IEC 8859-8
   to UCS-4 ) to produce the original 0xE4 code for the Hebrew letter
   "VAV".

B.2.4 Vendor Codepage Example

   This example demonstrates the mapping of a codepage to UTF-8 and back
   to a vendor codepage. Mapping between vendor codepages can be done in
   a very similar manner as described above. For instance both the PC
   and Mac codepages reflect the character set from the Thai standard
   TIS 620-2533. The character code on both platforms for the Thai
   letter "SO SO" is 0xAB. This character can then be mapped into the
   UCS-4 by way of a conversion/mapping file to produce the UCS-4 code
   of 0x0E0B.

   The UCS-4 character code is transformed into UTF-8 using the
   ucs4_to_utf8 routine described earlier by:

   1. Because the UCS-4 character is between 0x0800 and 0xFFFF it will
      map to a 3 byte UTF-8 sequence.
   2. The first byte is defined by (0xE0 + (0x00000E0B / 0x1000)) = 0xE0.



Curtin                     Proposed Standard                   [Page 25]

RFC 2640                  FTP Internalization                  July 1999


   3. The second byte is defined by (0x80 + ((0x00000E0B / 0x40) %
      0x40)) = 0xB8.
   4. The third byte is defined by (0x80 + (0x00000E0B % 0x40)) = 0x8B.

   The UTF-8 encoding is transferred back to UCS-4 by using the
   utf8_to_ucs4 routine described earlier by:

   1. Because the first byte of the sequence, when the '&' operator with
      a value of 0xF0 is applied, will produce 0xE0 (0xE0 & 0xF0 = 0xE0)
      the UTF-8 is a 3 byte sequence.
   2. The four byte UCS-4 character code is produced by (((0xE0 - 0xE0)
      * 0x1000) + ((0xB8 - 0x80) * 0x40) + (0x8B - 0x80)) = 0x00000E0B.

   Finally, the UCS-4 character code is converted to either the PC or
   MAC codepage character code (using the mapping table which matches
   codepage to UCS-4 ) to produce the original 0xAB code for the Thai
   letter "SO SO".

B.3 Pseudo Code for a High-Quality Translating Server

   if utf8_valid(fn)
     {
     attempt to convert fn to the local charset, producing localfn
     if (conversion fails temporarily) return error
     if (conversion succeeds)
     {
       attempt to open localfn
       if (open fails temporarily) return error
       if (open succeeds) return success
     }
     }
   attempt to open fn
   if (open fails temporarily) return error
   if (open succeeds) return success
   return permanent error
















Curtin                     Proposed Standard                   [Page 26]

RFC 2640                  FTP Internalization                  July 1999


Full Copyright Statement

   Copyright (C) The Internet Society (1999).  All Rights Reserved.

   This document and translations of it may be copied and furnished to
   others, and derivative works that comment on or otherwise explain it
   or assist in its implementation may be prepared, copied, published
   and distributed, in whole or in part, without restriction of any
   kind, provided that the above copyright notice and this paragraph are
   included on all such copies and derivative works.  However, this
   document itself may not be modified in any way, such as by removing
   the copyright notice or references to the Internet Society or other
   Internet organizations, except as needed for the purpose of
   developing Internet standards in which case the procedures for
   copyrights defined in the Internet Standards process must be
   followed, or as required to translate it into languages other than
   English.

   The limited permissions granted above are perpetual and will not be
   revoked by the Internet Society or its successors or assigns.

   This document and the information contained herein is provided on an
   "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
   TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
   BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
   HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.

Acknowledgement

   Funding for the RFC Editor function is currently provided by the
   Internet Society.



















Curtin                     Proposed Standard                   [Page 27]
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -