convert_utf.c.svn-base

来自「SumatraPDF是一款小型开源的pdf阅读工具。虽然玲珑小巧(只有800多K」· SVN-BASE 代码 · 共 534 行 · 第 1/2 页

SVN-BASE
534
字号
	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);    }    target += bytesToWrite;  }*sourceStart = source;*targetStart = target;return result;}/* --------------------------------------------------------------------- *//* * Utility routine to tell whether a sequence of bytes is legal UTF-8. * This must be called with the length pre-determined by the first byte. * If not calling this from ConvertUTF8to*, then the length can be set by: *  length = trailingBytesForUTF8[*source]+1; * and the sequence is illegal right away if there aren't that many bytes * available. * If presented with a length > 4, this returns false.  The Unicode * definition of UTF-8 goes up to 4-byte sequences. */static Boolean isLegalUTF8(const UTF8 *source, int length) {  UTF8 a;  const UTF8 *srcptr = source+length;  switch (length) {    default: return false;      /* Everything else falls through when "true"... */    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;    case 2: if ((a = (*--srcptr)) > 0xBF) return false;      switch (*source) {        /* no fall-through in this inner switch */        case 0xE0: if (a < 0xA0) return false; break;        case 0xED: if (a > 0x9F) return false; break;        case 0xF0: if (a < 0x90) return false; break;        case 0xF4: if (a > 0x8F) return false; break;        default:   if (a < 0x80) return false;      }      case 1: if (*source >= 0x80 && *source < 0xC2) return false;  }  if (*source > 0xF4) return false;  return true;}/* --------------------------------------------------------------------- *//* * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {  int length = trailingBytesForUTF8[*source]+1;  if (source+length > sourceEnd) {    return false;  }  return isLegalUTF8(source, length);}/* --------------------------------------------------------------------- */ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,                                     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {  ConversionResult result = conversionOK;  const UTF8* source = *sourceStart;  UTF16* target = *targetStart;  while (source < sourceEnd) {    UTF32 ch = 0;    unsigned short extraBytesToRead = trailingBytesForUTF8[*source];    if (source + extraBytesToRead >= sourceEnd) {	    result = sourceExhausted; break;    }    /* Do this check whether lenient or strict */    if (! isLegalUTF8(source, extraBytesToRead+1)) {	    result = sourceIllegal;	    break;    }    /*     * The cases all fall through. See "Note A" below.     */    switch (extraBytesToRead) {	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */	    case 3: ch += *source++; ch <<= 6;	    case 2: ch += *source++; ch <<= 6;	    case 1: ch += *source++; ch <<= 6;	    case 0: ch += *source++;    }    ch -= offsetsFromUTF8[extraBytesToRead];    if (target >= targetEnd) {	    source -= (extraBytesToRead+1); /* Back up source pointer! */	    result = targetExhausted; break;    }    if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */	    /* UTF-16 surrogate values are illegal in UTF-32 */	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {        if (flags == strictConversion) {          source -= (extraBytesToRead+1); /* return to the illegal value itself */          result = sourceIllegal;          break;        } else {          *target++ = UNI_REPLACEMENT_CHAR;        }	    } else {        *target++ = (UTF16)ch; /* normal case */	    }    } else if (ch > UNI_MAX_UTF16) {	    if (flags == strictConversion) {        result = sourceIllegal;        source -= (extraBytesToRead+1); /* return to the start */        break; /* Bail out; shouldn't continue */	    } else {        *target++ = UNI_REPLACEMENT_CHAR;	    }    } else {	    /* target is a character in range 0xFFFF - 0x10FFFF. */	    if (target + 1 >= targetEnd) {        source -= (extraBytesToRead+1); /* Back up source pointer! */        result = targetExhausted; break;	    }	    ch -= halfBase;	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);    }  }*sourceStart = source;*targetStart = target;return result;}/* --------------------------------------------------------------------- */ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,                                     UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {  ConversionResult result = conversionOK;  const UTF32* source = *sourceStart;  UTF8* target = *targetStart;  while (source < sourceEnd) {    UTF32 ch;    unsigned short bytesToWrite = 0;    const UTF32 byteMask = 0xBF;    const UTF32 byteMark = 0x80;    ch = *source++;    if (flags == strictConversion ) {	    /* UTF-16 surrogate values are illegal in UTF-32 */	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {        --source; /* return to the illegal value itself */        result = sourceIllegal;        break;	    }    }    /*     * Figure out how many bytes the result will require. Turn any     * illegally large UTF32 things (> Plane 17) into replacement chars.     */    if (ch < (UTF32)0x80) {	     bytesToWrite = 1;    } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;    } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;    } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;    } else {			    bytesToWrite = 3;      ch = UNI_REPLACEMENT_CHAR;      result = sourceIllegal;    }    target += bytesToWrite;    if (target > targetEnd) {	    --source; /* Back up source pointer! */	    target -= bytesToWrite; result = targetExhausted; break;    }    switch (bytesToWrite) { /* note: everything falls through. */	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);    }    target += bytesToWrite;  }*sourceStart = source;*targetStart = target;return result;}/* --------------------------------------------------------------------- */ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,                                     UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {  ConversionResult result = conversionOK;  const UTF8* source = *sourceStart;  UTF32* target = *targetStart;  while (source < sourceEnd) {    UTF32 ch = 0;    unsigned short extraBytesToRead = trailingBytesForUTF8[*source];    if (source + extraBytesToRead >= sourceEnd) {	    result = sourceExhausted; break;    }    /* Do this check whether lenient or strict */    if (! isLegalUTF8(source, extraBytesToRead+1)) {	    result = sourceIllegal;	    break;    }    /*     * The cases all fall through. See "Note A" below.     */    switch (extraBytesToRead) {	    case 5: ch += *source++; ch <<= 6;	    case 4: ch += *source++; ch <<= 6;	    case 3: ch += *source++; ch <<= 6;	    case 2: ch += *source++; ch <<= 6;	    case 1: ch += *source++; ch <<= 6;	    case 0: ch += *source++;    }    ch -= offsetsFromUTF8[extraBytesToRead];    if (target >= targetEnd) {	    source -= (extraBytesToRead+1); /* Back up the source pointer! */	    result = targetExhausted; break;    }    if (ch <= UNI_MAX_LEGAL_UTF32) {	    /*	     * UTF-16 surrogate values are illegal in UTF-32, and anything	     * over Plane 17 (> 0x10FFFF) is illegal.	     */	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {        if (flags == strictConversion) {          source -= (extraBytesToRead+1); /* return to the illegal value itself */          result = sourceIllegal;          break;        } else {          *target++ = UNI_REPLACEMENT_CHAR;        }	    } else {        *target++ = ch;	    }    } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */	    result = sourceIllegal;	    *target++ = UNI_REPLACEMENT_CHAR;    }  }  *sourceStart = source;  *targetStart = target;  return result;}/* ---------------------------------------------------------------------Note A.The fall-through switches in UTF-8 reading code save atemp variable, some decrements & conditionals.  The switchesare equivalent to the following loop:{  int tmpBytesToRead = extraBytesToRead+1;  do {		ch += *source++;		--tmpBytesToRead;		if (tmpBytesToRead) ch <<= 6;  } while (tmpBytesToRead > 0);}In UTF-8 writing code, the switches on "bytesToWrite" aresimilarly unrolled loops.--------------------------------------------------------------------- */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?