📄 utf8.c
字号:
/*
* utf8.c - routines to handle UTF-8.
*/
#ifndef ENUM_CHARSETS
#include "charset.h"
#include "internal.h"
/*
 * Forward declarations of the UTF-8 reader and writer defined later in
 * this file. Both take one input value at a time and report their
 * results through the `emit' callback.
 */
void read_utf8(charset_spec const *charset, long int input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long int output), void *emitctx);
void write_utf8(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx);
/*
 * read_utf8: feed one input byte to the UTF-8 decoder.
 *
 *  - `charset' is ignored (UTF-8 has no associated data).
 *  - `input_chr' is the next input byte.
 *  - `state' carries the partially decoded character between calls.
 *  - `emit' is called with `emitctx' for every decoded character, and
 *    with ERROR for every malformed or forbidden sequence.
 *
 * Layout of state->s0 while a multibyte character is in progress:
 *
 *  - bits 29-31: total number of bytes the character should occupy
 *    (known as soon as the lead byte has been seen);
 *  - bits 26-28: number of bytes consumed so far;
 *  - bits 0-25:  the character value accumulated so far, shifted up
 *    by 6 bits for each continuation byte.
 *
 * state->s0 is zero whenever no multibyte character is in progress.
 *
 * Worked example, decoding E9 8D 8B from state 0:
 *
 *  - after E9 the state is 0x64000009;
 *  - after 8D the state is 0x6800024d;
 *  - after 8B the seen-count equals the expected-count, so U+934B is
 *    emitted and the state goes back to zero.
 *
 * The value field never needs more than 25 bits: the last 6 bits of
 * even the longest (31-bit) character arrive with the final
 * continuation byte, at which point the character is emitted rather
 * than stored. So the value can never overlap the two byte counts.
 */
void read_utf8(charset_spec const *charset, long int input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long int output), void *emitctx)
{
    /*
     * One row per class of lead byte, in ascending order: lead bytes
     * below `upper' (and not matched by an earlier row) start a
     * sequence whose initial state is `start' plus the payload bits
     * selected by `mask'. `start' already has the expected length in
     * bits 29-31 and a seen-count of one in bits 26-28.
     */
    static const struct {
        long int upper;
        unsigned long start;
        long int mask;
    } lead_classes[] = {
        { 0xE0, 0x44000000UL, 0x1F },   /* C0-DF: two bytes */
        { 0xF0, 0x64000000UL, 0x0F },   /* E0-EF: three bytes */
        { 0xF8, 0x84000000UL, 0x07 },   /* F0-F7: four bytes */
        { 0xFC, 0xa4000000UL, 0x03 },   /* F8-FB: five bytes */
        { 0xFE, 0xc4000000UL, 0x01 }    /* FC-FD: six bytes */
    };
    unsigned long value;    /* character value accumulated so far */
    unsigned long counts;   /* the two byte-count fields of the state */
    int length;             /* encoded length of a finished character */
    int invalid;
    unsigned k;

    UNUSEDARG(charset);

    /*
     * Bytes that can never be part of a multibyte sequence: plain
     * single-byte characters, and FE/FF which are illegal anywhere in
     * UTF-8. Either kind cuts short any sequence in progress, which
     * costs one ERROR; then the byte itself is reported.
     */
    if (input_chr < 0x80 || input_chr == 0xFE || input_chr == 0xFF) {
        if (state->s0 != 0) {
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
        if (input_chr < 0x80)
            emit(emitctx, input_chr);
        else
            emit(emitctx, ERROR);
        return;
    }

    /*
     * Lead byte. An unfinished sequence is reported as an error
     * first; then the state is primed from the table. Values above
     * 0xFF match no row, so for them the state is left as it was.
     */
    if (input_chr >= 0xC0) {
        if (state->s0 != 0)
            emit(emitctx, ERROR);
        for (k = 0; k < sizeof lead_classes / sizeof lead_classes[0]; k++) {
            if (input_chr < lead_classes[k].upper) {
                state->s0 = lead_classes[k].start |
                    (input_chr & lead_classes[k].mask);
                break;
            }
        }
        return;
    }

    /*
     * Only continuation bytes (80-BF) get this far. One that arrives
     * with no sequence in progress is an error.
     */
    if (state->s0 == 0) {
        emit(emitctx, ERROR);
        return;
    }

    /* Fold the six payload bits into the value gathered so far. */
    value = state->s0 & 0x03ffffffL;
    value = (value << 6) | (input_chr & 0x3F);

    /* Bump the seen-count, which lives in bits 26-28. */
    counts = state->s0 & 0xfc000000L;
    counts += 0x04000000L;

    /*
     * While the seen-count (bits 26-28) still differs from the
     * expected count (bits 29-31), more bytes are due: save and wait.
     */
    if (((counts >> 26) & 7) != ((counts >> 29) & 7)) {
        state->s0 = counts | value;
        return;
    }

    /*
     * The character is complete and `value' is its code point. The
     * state is reset before anything is emitted, whatever the verdict.
     */
    length = counts >> 29;
    state->s0 = 0;

    if (value >= 0xD800 && value < 0xE000) {
        /*
         * Surrogates may never appear in UTF-8; a surrogate pair
         * should have been encoded as one longer character.
         */
        invalid = 1;
    } else if (value == 0xFFFE || value == 0xFFFF) {
        /* U+FFFE and U+FFFF are not valid Unicode characters. */
        invalid = 1;
    } else {
        /*
         * Overlong forms: the value would have fitted in a shorter
         * encoding. Anything up to 0x7F is overlong in any multibyte
         * form, so that case needs no length test.
         */
        invalid = (value <= 0x7FL ||
                   (value <= 0x7FFL && length > 2) ||
                   (value <= 0xFFFFL && length > 3) ||
                   (value <= 0x1FFFFFL && length > 4) ||
                   (value <= 0x3FFFFFFL && length > 5));
    }

    if (invalid)
        emit(emitctx, ERROR);
    else
        emit(emitctx, value);
}
/*
 * write_utf8: encode one code point as UTF-8.
 *
 * UTF-8 is stateless in the sense that the encoder is in the same
 * condition after every completed character, so `state' is not used;
 * `charset' is not used either. Each output byte is passed to `emit'
 * along with `emitctx'. Surrogates (0xD800-0xDFFF), 0xFFFE and 0xFFFF
 * are refused: a single ERROR is emitted in their place.
 */
void write_utf8(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx)
{
    long int marker;    /* fixed high bits of the lead byte */
    long int mask;      /* payload bits available in the lead byte */
    int tail;           /* number of continuation bytes */
    int shift;

    UNUSEDARG(charset);
    UNUSEDARG(state);

    /* Illegal code points are never written out. */
    if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
        input_chr == 0xFFFE || input_chr == 0xFFFF) {
        emit(emitctx, ERROR);
        return;
    }

    /* Anything below 0x80 is passed through as a single byte. */
    if (input_chr < 0x80) {
        emit(emitctx, input_chr);
        return;
    }

    /* Choose the shortest multibyte form that can hold the value. */
    if (input_chr < 0x800) {
        tail = 1; marker = 0xC0; mask = 0x1F;
    } else if (input_chr < 0x10000) {
        tail = 2; marker = 0xE0; mask = 0x0F;
    } else if (input_chr < 0x200000) {
        tail = 3; marker = 0xF0; mask = 0x07;
    } else if (input_chr < 0x4000000) {
        tail = 4; marker = 0xF8; mask = 0x03;
    } else {
        tail = 5; marker = 0xFC; mask = 0x01;
    }

    /*
     * The lead byte carries the topmost payload bits; each
     * continuation byte then carries the next six bits down.
     */
    shift = 6 * tail;
    emit(emitctx, marker | (mask & (input_chr >> shift)));
    while (shift > 0) {
        shift -= 6;
        emit(emitctx, 0x80 | (0x3F & (input_chr >> shift)));
    }
}
#ifdef TESTMODE
#include <stdio.h>
#include <stdarg.h>
/* Running count of failed checks; utf8_read_test adds one per failure. */
int total_errs = 0;
/*
 * Emit callback used by the tests. `ctx' points at a wchar_t write
 * cursor; `output' is stored where the cursor points and the cursor
 * is then moved on by one element. No bounds checking is done here.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
/*
 * Test helper: decode `inlen' bytes of `input' with read_utf8 and
 * compare the emitted values against the expected ones.
 *
 *  - `line' identifies the test in diagnostics.
 *  - The variadic arguments are the expected output values followed by
 *    a terminating -1. They are fetched with va_arg(ap, long int), so
 *    every one of them, including the terminator, must be passed as a
 *    long int.
 *
 * Each discrepancy prints a message and increments total_errs.
 */
void utf8_read_test(int line, char *input, int inlen, ...)
{
    va_list ap;
    wchar_t *p, str[512];
    int i;
    charset_state state;
    long expected;

    state.s0 = 0;
    p = str;
    for (i = 0; i < inlen; i++) {
        /*
         * read_utf8 can emit up to two values for one input byte (an
         * ERROR for a cut-off sequence plus the byte's own result), so
         * insist on room for two before every call.
         */
        if ((size_t)(p - str) + 2 > sizeof(str) / sizeof(str[0])) {
            printf("%d: input too long for test output buffer\n", line);
            total_errs++;
            return;
        }
        /* Mask to 0..255 so a signed char cannot go in negative. */
        read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
    }

    va_start(ap, inlen);
    expected = 0;
    for (i = 0; i < p - str; i++) {
        expected = va_arg(ap, long int);
        if (expected == -1) {
            printf("%d: correct string shorter than output\n", line);
            total_errs++;
            break;
        }
        if (expected != (long)str[i]) {
            /*
             * Both values are cast to unsigned long to match %lx;
             * passing a wchar_t or a long to plain %x is undefined.
             */
            printf("%d: char %d came out as %08lx, should be %08lx\n",
                   line, i, (unsigned long)str[i], (unsigned long)expected);
            total_errs++;
        }
    }
    /*
     * If the terminator has not been consumed yet, the next expected
     * value must be it; otherwise the output ended too soon.
     */
    if (expected != -1) {
        expected = va_arg(ap, long int);
        if (expected != -1) {
            printf("%d: correct string longer than output\n", line);
            total_errs++;
        }
    }
    va_end(ap);
}
void utf8_write_test(int line, const long *input, int inlen, ...)
{
va_list ap;
wchar_t *p, str[512];
int i;
charset_state state;
unsigned long l;
state.s0 = 0;
p = str;
for (i = 0; i < inlen; i++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -