📄 utf8.c

📁 远程登陆工具软件源码用于远程登陆unix
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*
 * utf8.c - routines to handle UTF-8.
 */

#ifndef ENUM_CHARSETS

#include "charset.h"
#include "internal.h"

void read_utf8(charset_spec const *, long int, charset_state *,
	       void (*)(void *, long int), void *);
void write_utf8(charset_spec const *, long int,
		charset_state *, void (*)(void *, long int), void *);

/*
 * UTF-8 has no associated data, so `charset' may be ignored.
 */

void read_utf8(charset_spec const *charset, long int input_chr,
	       charset_state *state,
	       void (*emit)(void *ctx, long int output), void *emitctx)
{
    UNUSEDARG(charset);

    /*
     * For reading UTF-8, the `state' word contains:
     * 
     *  - in bits 29-31, the number of bytes expected to be in the
     *    current multibyte character (which we can tell instantly
     *    from the first byte, of course).
     * 
     *  - in bits 26-28, the number of bytes _seen so far_ in the
     *    current multibyte character.
     * 
     *  - in the remainder of the word, the current value of the
     *    character, which is shifted upwards by 6 bits to
     *    accommodate each new byte.
     * 
     * As required, the state is zero when we are not in the middle
     * of a multibyte character at all.
     * 
     * For example, when reading E9 8D 8B, starting at state=0:
     * 
     *  - after E9, the state is 0x64000009
     *  - after 8D, the state is 0x6800024d
     *  - after 8B, the state conceptually becomes 0x6c00934b, at
     *    which point we notice we've got as many characters as we
     *    were expecting, output U+934B, and reset the state to
     *    zero.
     *
     * Note that the maximum number of bits we might need to store
     * in the character value field is 25 (U+7FFFFFFF contains 31
     * bits, but we will never actually store its full value
     * because when we receive the last 6 bits in the final
     * continuation byte we will output it and revert the state to
     * zero). Hence the character value field never collides with
     * the byte counts.
     */

    if (input_chr < 0x80) {
	/*
	 * Single-byte character. If the state is nonzero before
	 * coming here, output an error for an incomplete sequence.
	 * Then output the character.
	 */
	if (state->s0 != 0) {
	    emit(emitctx, ERROR);
	    state->s0 = 0;
	}
	emit(emitctx, input_chr);
    } else if (input_chr == 0xFE || input_chr == 0xFF) {
	/*
	 * FE and FF bytes should _never_ occur in UTF-8. They are
	 * automatic errors; if the state was nonzero to start
	 * with, output a further error for an incomplete sequence.
	 */
	if (state->s0 != 0) {
	    emit(emitctx, ERROR);
	    state->s0 = 0;
	}
	emit(emitctx, ERROR);
    } else if (input_chr >= 0x80 && input_chr < 0xC0) {
	/*
	 * Continuation byte. Output an error for an unexpected
	 * continuation byte, if the state is zero.
	 */
	if (state->s0 == 0) {
	    emit(emitctx, ERROR);
	} else {
	    unsigned long charval;
	    unsigned long topstuff;
	    int bytes;

	    /*
	     * Otherwise, accumulate more of the character value.
	     */
	    charval = state->s0 & 0x03ffffffL;
	    charval = (charval << 6) | (input_chr & 0x3F);

	    /*
	     * Check the byte counts; if we have not reached the
	     * end of the character, update the state and return.
	     */
	    topstuff = state->s0 & 0xfc000000L;
	    topstuff += 0x04000000L;   /* add one to the byte count */
	    if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
		state->s0 = topstuff | charval;
		return;
	    }

	    /*
	     * Now we know we've reached the end of the character.
	     * `charval' is the Unicode value. We should check for
	     * various invalid things, and then either output
	     * charval or an error. In all cases we reset the state
	     * to zero.
	     */
	    bytes = topstuff >> 29;
	    state->s0 = 0;

	    if (charval >= 0xD800 && charval < 0xE000) {
		/*
		 * Surrogates (0xD800-0xDFFF) may never be encoded
		 * in UTF-8. A surrogate pair in Unicode should
		 * have been encoded as a single UTF-8 character
		 * occupying more than three bytes.
		 */
		emit(emitctx, ERROR);
	    } else if (charval == 0xFFFE || charval == 0xFFFF) {
		/*
		 * U+FFFE and U+FFFF are invalid Unicode characters
		 * and may never be encoded in UTF-8. (This is one
		 * reason why U+FFFF is our way of signalling an
		 * error to our `emit' function :-)
		 */
		emit(emitctx, ERROR);
	    } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
		       (charval <= 0x7FFL && bytes > 2) ||
		       (charval <= 0xFFFFL && bytes > 3) ||
		       (charval <= 0x1FFFFFL && bytes > 4) ||
		       (charval <= 0x3FFFFFFL && bytes > 5)) {
		/*
		 * Overlong sequences are not to be tolerated,
		 * under any circumstances.
		 */
		emit(emitctx, ERROR);
	    } else {
		/*
		 * Oh, all right. We'll let this one off.
		 */
		emit(emitctx, charval);
	    }
	}

    } else {
	/*
	 * Lead byte. First output an error for an incomplete
	 * sequence, if the state is nonzero.
	 */
	if (state->s0 != 0)
	    emit(emitctx, ERROR);

	/*
	 * Now deal with the lead byte: work out the number of
	 * bytes we expect to see in this character, and extract
	 * the initial bits of it too.
	 */
	if (input_chr >= 0xC0 && input_chr < 0xE0) {
	    state->s0 = 0x44000000L | (input_chr & 0x1F);
	} else if (input_chr >= 0xE0 && input_chr < 0xF0) {
	    state->s0 = 0x64000000L | (input_chr & 0x0F);
	} else if (input_chr >= 0xF0 && input_chr < 0xF8) {
	    state->s0 = 0x84000000L | (input_chr & 0x07);
	} else if (input_chr >= 0xF8 && input_chr < 0xFC) {
	    state->s0 = 0xa4000000L | (input_chr & 0x03);
	} else if (input_chr >= 0xFC && input_chr < 0xFE) {
	    state->s0 = 0xc4000000L | (input_chr & 0x01);
	}
    }
}

/*
 * UTF-8 is a stateless multi-byte encoding (in the sense that just
 * after any character has been completed, the state is always the
 * same); hence when writing it, there is no need to use the
 * charset_state.
 */

void write_utf8(charset_spec const *charset, long int input_chr,
		charset_state *state,
		void (*emit)(void *ctx, long int output), void *emitctx)
{
    UNUSEDARG(charset);
    UNUSEDARG(state);

    /*
     * Refuse to output any illegal code points.
     */
    if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
	(input_chr >= 0xD800 && input_chr < 0xE000)) {
	emit(emitctx, ERROR);
    } else if (input_chr < 0x80) {     /* one-byte character */
	emit(emitctx, input_chr);
    } else if (input_chr < 0x800) {    /* two-byte character */
	emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x10000) {  /* three-byte character */
	emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x200000) { /* four-byte character */
	emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x4000000) {/* five-byte character */
	emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else {			       /* six-byte character */
	emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    }
}

#ifdef TESTMODE

#include <stdio.h>
#include <stdarg.h>

int total_errs = 0;

void utf8_emit(void *ctx, long output)
{
    wchar_t **p = (wchar_t **)ctx;
    *(*p)++ = output;
}

void utf8_read_test(int line, char *input, int inlen, ...)
{
    va_list ap;
    wchar_t *p, str[512];
    int i;
    charset_state state;
    unsigned long l;

    state.s0 = 0;
    p = str;

    for (i = 0; i < inlen; i++)
	read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);

    va_start(ap, inlen);
    l = 0;
    for (i = 0; i < p - str; i++) {
	l = va_arg(ap, long int);
	if (l == -1) {
	    printf("%d: correct string shorter than output\n", line);
	    total_errs++;
	    break;
	}
	if (l != str[i]) {
	    printf("%d: char %d came out as %08x, should be %08x\n",
		    line, i, str[i], l);
	    total_errs++;
	}
    }
    if (l != -1) {
	l = va_arg(ap, long int);
	if (l != -1) {
	    printf("%d: correct string longer than output\n", line);
	    total_errs++;
	}
    }
    va_end(ap);
}

void utf8_write_test(int line, const long *input, int inlen, ...)
{
    va_list ap;
    wchar_t *p, str[512];
    int i;
    charset_state state;
    unsigned long l;

    state.s0 = 0;
    p = str;

    for (i = 0; i < inlen; i++)
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -