unicode.c

来自「磁盘格式解读」· C语言 代码 · 共 138 行

C
138
字号

#include "usuals.h"
#include "ntfs.h"

/*
 * Transcode UCS-2 to UTF-8.
 *
 * src: pointer to 'len' UCS-2 code units (no terminator is required,
 *      exactly 'len' units are read).
 * len: number of code units to convert.
 *
 * Returns a null-terminated UTF-8 string. Since the resulting length
 * depends on the input, this function allocates its own memory (via
 * mallocf/reallocf) and hands the buffer to the caller.
 *
 * Each code unit is encoded on its own as 1, 2 or 3 bytes; surrogate
 * pairs are not combined into a single 4 byte sequence.
 *
 * NOTE(review): the results of mallocf/reallocf are used unchecked;
 * presumably those wrappers never return NULL -- confirm in usuals.h.
 */
char* unicode_transcode16to8(const ntfs_char* src, size_t len)
{
  char* ret = NULL;
  size_t alloc = 0;
  size_t pos = 0;
  const ntfs_char* c;
  const ntfs_char* e;

  /* Allocate 1.25 times the length initially (plus the terminator) */
  alloc = len + (len / 4) + 1;
  ret = (char*)mallocf(alloc * sizeof(char));

  c = src;
  e = c + len;

  for( ; c < e; c++)
  {
    /*
     * Make sure there is room for the widest encoding (3 bytes) and
     * the terminating null. One growth step of (len / 2) + 1 is not
     * always enough: for len == 1 the buffer would go from 2 to 3
     * bytes while a 3 byte character plus terminator needs 4. So
     * keep growing until the requirement holds, then reallocate once.
     */
    if(pos + 4 >= alloc)
    {
      do
      {
        alloc += (len / 2) + 1;
      }
      while(pos + 4 >= alloc);

      ret = (char*)reallocf(ret, alloc * sizeof(char));
    }

    /* Encode as one byte: 0xxxxxxx */
    if(*c <= 0x007F)
    {
      ret[pos++] = (char)*c;
    }

    /* Encode as two bytes: 110xxxxx 10xxxxxx */
    else if(*c <= 0x07FF)
    {
      ret[pos++] = (char)(192 | (*c >> 6));
      ret[pos++] = (char)(128 | (*c & 63));
    }

    /* Encode as three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
    else
    {
      ret[pos++] = (char)(224 | (*c >> 12));
      ret[pos++] = (char)(128 | ((*c >> 6) & 63));
      ret[pos++] = (char)(128 | (*c & 63));
    }
  }

  /* The growth check above always leaves room for this byte */
  ret[pos] = 0;
  return ret;
}

/*
 * Transcode UTF-8 to UCS-2.
 *
 * src: pointer to 'len' bytes of UTF-8 (no terminator is required,
 *      exactly 'len' bytes are read).
 * out: caller allocated output buffer. At most one element is written
 *      per input byte and a terminating 0 is always appended, so it
 *      must have room for len + 1 elements.
 * len: number of input bytes.
 *
 * Returns 'out'.
 *
 * Bytes that do not start a complete, well-formed sequence are each
 * replaced by '?'. A 4 byte sequence whose code point does not fit in
 * an ntfs_char is also replaced by a single '?' instead of being
 * silently truncated to an unrelated character. Overlong encodings
 * are not rejected.
 */
ntfs_char* unicode_transcode8to16(const char* src, ntfs_char* out, size_t len)
{
  size_t pos = 0;
  const char* c;
  const char* e;

  c = src;
  e = c + len;

  for( ; c < e; c++)
  {
    /* We never have to grow the output here. Every branch below
       consumes at least one input byte and produces exactly one
       output element. */

    /* Bytes remaining, including c[0]. Comparing counts rather than
       (c + n) < e avoids forming pointers past the end of the input. */
    size_t left = (size_t)(e - c);

    /* Four byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if(left > 3 &&
       (c[0] & 0xF8) == 0xF0 &&
       (c[1] & 0xC0) == 0x80 &&
       (c[2] & 0xC0) == 0x80 &&
       (c[3] & 0xC0) == 0x80)
    {
      unsigned long cp = ((unsigned long)(c[0] & 7) << 18) |
                         ((unsigned long)(c[1] & 63) << 12) |
                         ((unsigned long)(c[2] & 63) << 6) |
                         ((unsigned long)(c[3] & 63));
      ntfs_char narrowed = (ntfs_char)cp;

      /* Only keep the value if it survived the narrowing intact */
      if((unsigned long)narrowed == cp)
        out[pos++] = narrowed;
      else
        out[pos++] = L'?';

      c += 3;
    }

    /* Three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
    else if(left > 2 &&
            (c[0] & 0xF0) == 0xE0 &&
            (c[1] & 0xC0) == 0x80 &&
            (c[2] & 0xC0) == 0x80)
    {
      out[pos++] = (ntfs_char)(((ntfs_char)c[0] & 15) << 12 |
                             ((ntfs_char)c[1] & 63) << 6 |
                             ((ntfs_char)c[2] & 63));
      c += 2;
    }

    /* Two byte sequence: 110xxxxx 10xxxxxx */
    else if(left > 1 &&
            (c[0] & 0xE0) == 0xC0 &&
            (c[1] & 0xC0) == 0x80)
    {
      out[pos++] = (ntfs_char)(((ntfs_char)c[0] & 31) << 6 |
                             ((ntfs_char)c[1] & 63));
      c += 1;
    }

    /* Single byte (high bit clear): plain ASCII */
    else if(!(c[0] & 0x80))
    {
      out[pos++] = (ntfs_char)c[0];
    }

    /* Invalid or truncated encoding: replace this one byte */
    else
    {
      out[pos++] = L'?';
    }
  }

  out[pos] = 0;
  return out;
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?